In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('p3_train.csv')
test = pd.read_csv('p3_test.csv')

In [3]:
!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!pip install fastText

In [4]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
# ft = fasttext.load_model('cc.en.300.bin')

In [5]:
from gensim.models.fasttext import load_facebook_model

# Load FastText embeddings
ft_model = load_facebook_model('cc.en.300.bin')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D, Attention
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_seq])
max_length = max([len(seq) for seq in X_test_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')
print(max_length)

In [None]:
embedding_dim = 300
# LSTM model with fasttext
model = Sequential()
model.add(Embedding(input_dim=len(ft_model.wv.key_to_index)+1,
                    output_dim=embedding_dim,
                    input_length=max_length,
                    weights=[np.vstack((np.zeros((1, embedding_dim)), ft_model.wv.vectors))],
                    trainable=False))
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=3))
model.add(Dense(32, activation='tanh'))
model.add(GlobalMaxPooling1D())
model.add(Dense(11, activation='softmax'))  # 2 classes: generated by same method or not
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model.fit(X_train_padded, y_train, epochs=15, batch_size=32)

In [45]:
# tokenizer.fit_on_texts(X_test)
loss, acc = model.evaluate(X_test_padded, y_test)



In [21]:
preds = model.predict(X_test_padded)



In [22]:
preds[0]

array([4.5005148e-05, 9.9849534e-01, 1.8817246e-04, 1.4493134e-04,
       5.7976287e-08, 3.8147229e-04, 3.6095087e-05, 3.7908198e-06,
       6.6085398e-04, 3.3609940e-05, 1.0793862e-05], dtype=float32)

In [23]:
preds_new = np.argmax(preds, axis = 1)

In [24]:
preds_new = np.array(preds_new)

In [25]:
preds_new[0]

1

In [26]:
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
f1 = f1_score(y_test, preds_new, average='macro')
print("F1 score = ",f1)
precision = precision_score(y_test, preds_new, average='macro')
recall = recall_score(y_test, preds_new, average='macro')
print("Precision = ", precision)
print("Recall = ", recall)
print("Accuracy = ", accuracy_score(y_test, preds_new))

F1 score =  0.6644910986126216
Precision =  0.6723619132103483
Recall =  0.664218184938255
Accuracy =  0.6659641728134879
