In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('p3_train.csv')
test = pd.read_csv('p3_test.csv')

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Dropout

# create a dataset with 1020 rows of T1 and T2
# T1 = ["The cat is sitting on the mat", "The dog is chasing the cat", "The bird is flying in the sky"] # list of T1 texts
# T2 = ["A feline is seated on the mat", "A canine is pursuing the feline", "A feathered creature is soaring in the heavens"] # list of T2 texts
# labels = np.array([1,1,0])

# split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)

# tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_seq])
max_length = max([len(seq) for seq in X_test_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(11, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5777, 32)          1406304   
                                                                 
 bidirectional_2 (Bidirectio  (None, 5777, 128)        49664     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 5777, 128)         0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 32)               

<keras.callbacks.History at 0x7fe1f01b3dc0>

In [6]:
tokenizer.fit_on_texts(X_test)
loss, acc = model.evaluate(X_test_padded, y_test)



In [7]:
preds = model.predict(X_test_padded)



In [8]:
preds_new = np.argmax(preds, axis = 1)

In [9]:
preds_new = np.array(preds_new)

In [10]:
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
f1 = f1_score(y_test, preds_new, average='macro')
print("F1 score = ",f1)
precision = precision_score(y_test, preds_new, average='macro')
recall = recall_score(y_test, preds_new, average='macro')
print("Precision = ", precision)
print("Recall = ", recall)
print("Accuracy = ", accuracy_score(y_test, preds_new))

F1 score =  0.5424972325968332
Precision =  0.5610062033439046
Recall =  0.5386181447666818
Accuracy =  0.541095890410959
