In [None]:
import random
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Flatten, MaxPooling1D, GRU, SpatialDropout1D, Bidirectional
from keras.optimizers import Adam

# Shared settings: max_len is the padded sequence length, max_features caps the
# tokenizer vocabulary / embedding input size, and seed drives every RNG below.
max_len = 125
max_features = 10000
seed = 0

# Seed NumPy, TensorFlow and Python's own generator so each run starts from the same RNG state.
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

In [None]:
# Load the tab-separated train and test files, then preview the first training rows.
train, test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("your-train-csv-path", "your-test-csv-path")
)
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [None]:
# Check the number of rows and columns in the training frame.
train.shape

(156060, 4)

In [None]:
# Check each column's dtype and non-null count, plus the frame's memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [None]:
# Show the full text of the first training phrase (row label 0).
train.loc[0, 'Phrase']

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [None]:
# Class balance: number of phrases per sentiment label.
# Only the last expression of a cell is displayed automatically, so the counts
# have to be printed explicitly or they are computed and thrown away.
print(train["Sentiment"].value_counts())

# Raw label column, shown as the cell's output.
train['Sentiment']

0         1
1         2
2         2
3         2
4         2
         ..
156055    2
156056    1
156057    3
156058    2
156059    2
Name: Sentiment, Length: 156060, dtype: int64

In [None]:
# Lower-case the phrases with the vectorized .str accessor rather than a
# per-row lambda (the lambdas also shadowed the `train` / `test` frames).
x = train['Phrase'].str.lower()
x_test = test['Phrase'].str.lower()

# Build the vocabulary from the training phrases only; num_words limits how many
# of the most frequent words are kept when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)

# Replace each training phrase by its list of integer word indices.
x = tokenizer.texts_to_sequences(x)

In [None]:
# Bring every index sequence to exactly max_len entries. The Keras defaults are
# spelled out: zeros are prepended to short sequences and long ones lose their
# leading tokens.
x = pad_sequences(sequences=x, maxlen=max_len, padding='pre', truncating='pre')
x

array([[   0,    0,    0, ...,    3,    2,   42],
       [   0,    0,    0, ...,   13,    1, 2976],
       [   0,    0,    0, ...,    0,    2,  323],
       ...,
       [   0,    0,    0, ...,    0, 9376, 9377],
       [   0,    0,    0, ...,    0,    0, 9376],
       [   0,    0,    0, ...,    0,    0, 9377]], dtype=int32)

In [None]:
# One-hot encode the integer sentiment labels; the result has one column per
# class (5 columns in the recorded output) and dtype float32.
y = to_categorical(train['Sentiment'])
y

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [None]:
# Rebuild the test matrix from the raw test phrases every time this cell runs.
# Previously the cell consumed and overwrote `x_test`, so a second execution fed
# an already-padded integer matrix back into the tokenizer and failed.
x_test = test['Phrase'].str.lower()
# Map words to the indices learned from the training vocabulary.
x_test = tokenizer.texts_to_sequences(x_test)
# Pad / truncate to the same fixed length as the training matrix.
x_test = pad_sequences(x_test, maxlen=max_len)
x_test

array([[   0,    0,    0, ...,  614, 1024,  392],
       [   0,    0,    0, ...,  614, 1024,  392],
       [   0,    0,    0, ...,    0,    0,   16],
       ...,
       [   0,    0,    0, ...,    2,  126, 5916],
       [   0,    0,    0, ...,    2,  126, 5916],
       [   0,    0,    0, ...,    0,  373, 2014]], dtype=int32)

In [None]:
# Hold out 25% of the padded training data for validation; the shuffle is
# seeded so the same rows land in each split on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=seed,
)

In [None]:
# Shape of the training split: (number of phrases, max_len).
x_train.shape

(117045, 125)

In [None]:
# Shape of the validation split: (number of phrases, max_len).
x_val.shape

(39015, 125)

In [None]:
# Stacked-LSTM classifier: a 100-dimensional embedding (index 0 is masked as
# padding) feeds three LSTM layers of shrinking width, and a 5-way softmax
# produces the class probabilities.
model = Sequential([
    Embedding(max_features, 100, mask_zero=True),
    # The first two LSTMs emit the full sequence so the next LSTM can consume it.
    LSTM(128, dropout=0.4, recurrent_dropout=0.4, return_sequences=True),
    LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True),
    # The last LSTM returns only its final state, which feeds the classifier head.
    LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=False),
    Dense(5, activation='softmax'),
])

# One-hot targets, hence categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 lstm_1 (LSTM)               (None, None, 64)          49408     
                                                                 
 lstm_2 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 5)                 165       
                                                                 
Total params: 1,179,237
Trainable params: 1,179,237
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Number of passes over the training split.
epochs = 5
# Samples per batch; passed to both model.fit and model.predict further down.
batch_size = 2048 

In [None]:
# The model is already compiled where it is built (Adam, learning_rate=0.001).
# Compiling again here with the string 'adam' silently replaced that configured
# optimizer, so this cell now only trains.
# The History object is kept so per-epoch loss / accuracy can be inspected later.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3d34be0100>

In [None]:
# Load the submission template, fill in the predicted labels and write the file.
sample_submission = pd.read_csv('your-submission.csv')

# predict() yields one row of 5 softmax probabilities per test phrase. A single
# 'Sentiment' column cannot hold that matrix; it needs the integer class label,
# i.e. the index of the largest probability in each row.
class_probs = model.predict(x_test, batch_size=batch_size, verbose=1)
sample_submission['Sentiment'] = np.argmax(class_probs, axis=1)

sample_submission.to_csv('mrsa_lstm.csv', index=False)