# Text Classification

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

## Read pre-processed data

In [48]:
import csv

# Load the three pre-processed splits from CSV files in the working directory.
data = pd.read_csv('train.csv')
df_eval = pd.read_csv('validation.csv')
df_test = pd.read_csv('test.csv')

# Each split contributes a list of texts ('cleaned_text') and a parallel list
# of labels ('label_id').
x_train = data['cleaned_text'].to_numpy().tolist()
y_train = data['label_id'].to_numpy().tolist()

x_valid = df_eval['cleaned_text'].to_numpy().tolist()
y_valid = df_eval['label_id'].to_numpy().tolist()

x_test = df_test['cleaned_text'].to_numpy().tolist()
y_test = df_test['label_id'].to_numpy().tolist()

# Report the number of examples per split: train, validation, test.
for split_texts in (x_train, x_valid, x_test):
    print(len(split_texts))

966
75
74


## Encode labels

In [31]:
# Learn the label -> integer mapping from the training labels only, then reuse
# that same fitted mapping for the validation and test labels.
#
# Refitting the encoder on each split (fit_transform everywhere) would let a
# split that is missing a class, or that contains a different set of labels,
# silently receive a different encoding than the one the model is trained on.
# With transform(), a label that was not seen in training raises an error
# instead of being quietly re-numbered.
le = LabelEncoder()

# Column vectors of shape (n_samples, 1), matching the model's single output.
Y = le.fit_transform(y_train).reshape(-1, 1)
Y_val = le.transform(y_valid).reshape(-1, 1)
Y_test = le.transform(y_test).reshape(-1, 1)

## Tokenize data
#### Assign an integer id to each word, fix every text to a length of 64 tokens, and pad with 0 when a text is shorter than 64 tokens

In [49]:
# Vocabulary cap for the tokenizer (also used as the embedding input size).
# NOTE(review): 14727 is presumably the vocabulary size of the training
# corpus -- confirm where this number comes from.
max_words = 14727
# Every text is represented by exactly this many token ids.
max_len = 64

# The word index is built from the training texts only; validation and test
# texts are encoded with that same index.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(x_train)


def _encode_texts(texts):
    """Return (token id sequences, matrix padded to ``max_len``) for ``texts``."""
    token_ids = tok.texts_to_sequences(texts)
    padded = sequence.pad_sequences(token_ids, maxlen=max_len)
    return token_ids, padded


sequences, sequences_matrix = _encode_texts(x_train)
valid_sequences, valid_sequences_matrix = _encode_texts(x_valid)
test_sequences, test_sequences_matrix = _encode_texts(x_test)

## MODEL
#### RNN model containing an embedding layer, an LSTM layer, a fully connected ReLU layer with dropout, and a single sigmoid output unit

In [51]:
def RNN():
    """Build the (uncompiled) LSTM text classifier.

    Layer stack, in order:
      * input of ``max_len`` token ids (layer name ``inputs``)
      * embedding of ``max_words`` ids into 64 dimensions
      * LSTM with 64 units
      * dense layer with 64 units (``FC1``) followed by ReLU
      * dropout with rate 0.5
      * dense layer with 1 unit (``out_layer``) followed by a sigmoid

    Reads the module-level ``max_len`` and ``max_words``.

    Returns:
        A Keras ``Model`` mapping the token-id input to the sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])

    embedded = Embedding(max_words, 64, input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)

    hidden = Dense(64, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    score = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)

    return Model(inputs=token_ids, outputs=probability)

In [53]:
# Instantiate the network and print its layer table.
model = RNN()
model.summary()

# Single sigmoid output -> binary cross-entropy; accuracy is tracked as the
# reported metric. RMSprop is used with its default settings.
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 64)]              0         
                                                                 
 embedding_5 (Embedding)     (None, 64, 64)            942528    
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 FC1 (Dense)                 (None, 64)                4160      
                                                                 
 activation_10 (Activation)  (None, 64)                0         
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 out_layer (Dense)           (None, 1)                 65  

## Fit model

In [54]:
BATCH_SIZE = 4

# Halt training when val_loss stops improving by at least min_delta.
# No patience is set, so the Keras default applies.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)

# Training and validation data are in-memory arrays, so Keras derives the
# number of batches itself. No hand-computed step counts are passed: the
# previous validation step count was derived from a hard-coded 74 (the test
# set size, not the validation set size) and could cut validation short of
# the full validation set.
model.fit(
    sequences_matrix,
    Y,
    batch_size=BATCH_SIZE,
    epochs=2,
    validation_data=(valid_sequences_matrix, Y_val),
    callbacks=[early_stopping],
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd27bab9350>

In [55]:
# Evaluate on the held-out test split; the result holds the loss first and
# the accuracy metric second (the order used when the model was compiled).
accr = model.evaluate(test_sequences_matrix, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 0.630
  Accuracy: 0.716


## Results and confusion matrix

In [56]:
from sklearn.metrics import confusion_matrix, classification_report

# The model emits one sigmoid score per example, so the predicted class is
# obtained by thresholding at 0.5 (argmax over a single column would not work).
predict_x = model.predict(test_sequences_matrix)
predictions = predict_x > 0.5

# Number of test examples predicted as the positive class.
print(sum(predictions))

# Per-class precision / recall / F1 on the test split.
print(classification_report(Y_test, predictions))

# Confusion matrix as a labelled table.
# NOTE(review): assumes encoded class 0 is 'negative' and class 1 is
# 'positive' -- confirm against the label encoder.
labels = ['negative', 'positive']
cm = confusion_matrix(Y_test, predictions)
pd.DataFrame(cm, index=labels, columns=labels)

[17]
              precision    recall  f1-score   support

           0       0.79      0.83      0.81        54
           1       0.47      0.40      0.43        20

    accuracy                           0.72        74
   macro avg       0.63      0.62      0.62        74
weighted avg       0.70      0.72      0.71        74



Unnamed: 0,negative,positive
negative,45,9
positive,12,8
