In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('p2_train.csv')
test = pd.read_csv('p2_test.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,7744,longlost pictur archiv celebr glamour seneg wr...,0
1,10536,advanc understand natur world respons current ...,1
2,7043,itali test western economi bear almost total s...,1
3,5930,south africa box granni juke jab way healthier...,1
4,11660,coronaviru pandem nation tragedi hundr thousan...,1


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Dropout

# tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_seq])
max_length = max([len(seq) for seq in X_test_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32)


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 5985, 32)          714880    
                                                                 
 bidirectional_10 (Bidirecti  (None, 5985, 128)        49664     
 onal)                                                           
                                                                 
 dropout_10 (Dropout)        (None, 5985, 128)         0         
                                                                 
 bidirectional_11 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 32)               

<keras.callbacks.History at 0x7f41fe6e92a0>

In [27]:
tokenizer.fit_on_texts(X_test)

In [28]:
loss, acc = model.evaluate(X_test_padded, y_test)



In [29]:
preds = model.predict(X_test_padded)
# preds = np.argmax(preds, axis=-1)
preds[0]



array([0.02182276, 0.97621447], dtype=float32)

In [30]:
preds_new = []
for i in preds:
  if(i[0]>i[1]):
    preds_new.append(0.0)
  else:
    preds_new.append(1.0)

In [31]:
y_test_array = np.array(y_test)
y_test_array[0]

1

In [32]:
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
f1 = f1_score(y_test, preds_new)
print("F1 score = ",f1)
precision = precision_score(y_test, preds_new)
recall = recall_score(y_test, preds_new)
print("Precision = ", precision)
print("Recall = ", recall)
print("Accuracy = ", accuracy_score(y_test, preds_new))

F1 score =  0.7828571428571428
Precision =  0.7784090909090909
Recall =  0.7873563218390804
Accuracy =  0.7771260997067448
