In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('p3_train.csv')
test = pd.read_csv('p3_test.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D

# tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_seq])
max_length = max([len(seq) for seq in X_test_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=max_length))
model.add(Conv1D(64, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Conv1D(32, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=3))
model.add(Dense(32, activation='tanh'))
model.add(GlobalMaxPooling1D())
model.add(Dense(11, activation='softmax'))  # 2 classes: generated by same method or not
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model.fit(X_train_padded, y_train, epochs=15, batch_size=32)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f6bf0167c10>

In [5]:
tokenizer.fit_on_texts(X_test)
loss, acc = model.evaluate(X_test_padded, y_test)



In [6]:
preds = model.predict(X_test_padded)



In [7]:
preds[0]

array([2.8556437e-04, 9.9842405e-01, 7.7309691e-05, 4.8316002e-04,
       9.0253081e-05, 1.5668813e-04, 3.3997372e-04, 1.5556836e-05,
       1.8358913e-05, 7.4950804e-06, 1.0159880e-04], dtype=float32)

In [8]:
preds_new = np.argmax(preds, axis = 1)

In [9]:
preds_new = np.array(preds_new)

In [10]:
preds_new[0]

1

In [11]:
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
f1 = f1_score(y_test, preds_new, average='macro')
print("F1 score = ",f1)
precision = precision_score(y_test, preds_new, average='macro')
recall = recall_score(y_test, preds_new, average='macro')
print("Precision = ", precision)
print("Recall = ", recall)
print("Accuracy = ", accuracy_score(y_test, preds_new))

F1 score =  0.7025498683136416
Precision =  0.720090965335583
Recall =  0.699286615345787
Accuracy =  0.6991570073761855


In [12]:
from sklearn.metrics import classification_report, roc_curve, auc, f1_score, accuracy_score, confusion_matrix
matrix = confusion_matrix(y_test, preds_new, labels = [0, 1, 2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, preds_new, labels = [0, 1, 2],digits=4))
print('confusion matrix: ', mat)

              precision    recall  f1-score   support

           0     0.9737    0.8916    0.9308       166
           1     0.9465    0.9888    0.9672       179
           2     0.4248    0.3757    0.3988       173

   micro avg     0.7927    0.7529    0.7723       518
   macro avg     0.7817    0.7520    0.7656       518
weighted avg     0.7810    0.7529    0.7657       518

confusion matrix:  [0.97368421 0.99438202 0.97014925]
