In [1]:
# Load initial libraries

import pandas as pd
import numpy as np
import csv

In [36]:
# Load data and reanme columns

train = pd.read_csv("/content/drive/MyDrive/train-v2.tsv", sep="\t", header=None, quoting = csv.QUOTE_NONE)
train.rename(columns={0:"label", 1:"sentence"}, inplace=True)

In [37]:
# Load test and reanme columns

test = pd.read_csv('/content/drive/MyDrive/test.tsv', sep="\t", header=None, quoting = csv.QUOTE_NONE)
test.rename(columns={0:"label", 1:"sentence"}, inplace=True)

In [None]:
# Splitting Xs and ys
sentences_test = test['sentence'].values
y_test = test['label'].values

sentences_train = train['sentence'].values
y_train = train['label'].values

In [38]:
# Vectorization

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=2, lowercase=False, ngram_range=(1,2))
vectorizer.fit(sentences_train)

CountVectorizer(lowercase=False, min_df=2, ngram_range=(1, 2))

In [41]:
# Text tokenize to get vocabulary

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)
from keras.preprocessing.sequence import pad_sequences
maxlen = 100
X_train_pad = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_train_pad

array([[   1,    1,    4, ...,    0,    0,    0],
       [ 659,  332,   28, ...,    0,    0,    0],
       [   1, 3651,   11, ...,    0,    0,    0],
       ...,
       [  41, 1763,   47, ...,    0,    0,    0],
       [   1, 2154,   35, ...,    0,    0,    0],
       [   1, 1395, 3648, ...,    0,    0,    0]], dtype=int32)

In [42]:
# Building deep neural network

from keras.models import Sequential
from keras.layers import Embedding, Dropout
from keras import layers
embedding_dim = 50 # embedding size
vocab_size = len(tokenizer.word_index) + 1 # addition value 0 for padding
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen, embeddings_initializer=None)) 
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 50)           3514750   
                                                                 
 global_max_pooling1d_4 (Glo  (None, 50)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_11 (Dense)            (None, 100)               5100      
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_12 (Dense)            (None, 10)                1010      
                                                                 
 dense_13 (Dense)            (None, 1)                 11        
                                                      

In [43]:
# Fit data on train

history = model.fit(X_train_pad, y_train, epochs=20, verbose=True, validation_split=0.1, batch_size=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [44]:
#Evaluate the model on the training and testing set

loss, accuracy = model.evaluate(X_train_pad, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))

Training Accuracy: 0.9516
Testing Accuracy: 0.8322


In [14]:
# Saving the model

from tensorflow.keras.models import load_model
model.save('SA_anything goes.h5')
model = load_model('/content/SA_anything goes.h5')