In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('p2_train.csv')
test = pd.read_csv('p2_test.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,7744,longlost pictur archiv celebr glamour seneg wr...,0
1,10536,advanc understand natur world respons current ...,1
2,7043,itali test western economi bear almost total s...,1
3,5930,south africa box granni juke jab way healthier...,1
4,11660,coronaviru pandem nation tragedi hundr thousan...,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D, Attention
from tensorflow.keras.callbacks import EarlyStopping

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [10]:
X_train_tagged_texts = []
for text in X_train:
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    X_train_tagged_texts.append(tags)
X_test_tagged_texts = []
for text in X_test:
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    X_test_tagged_texts.append(tags)
# Map POS tags to integers using NLTK's tagset
tagset = {"CC": 1, "CD": 2, "DT": 3, "EX": 4, "FW": 5, "IN": 6, "JJ": 7, "JJR": 8, "JJS": 9, "LS": 10, "MD": 11,
          "NN": 12, "NNS": 13, "NNP": 14, "NNPS": 15, "PDT": 16, "POS": 17, "PRP": 18, "PRP$": 19, "RB": 20,
          "RBR": 21, "RBS": 22, "RP": 23, "SYM": 24, "TO": 25, "UH": 26, "VB": 27, "VBD": 28, "VBG": 29,
          "VBN": 30, "VBP": 31, "VBZ": 32, "WDT": 33, "WP": 34, "WP$": 35, "WRB": 36}
X_train_tagged = []
for tagged_text in X_train_tagged_texts:
    tagged_text_int = []
    for word, tag in tagged_text:
        if tag in tagset:
            tagged_text_int.append(tagset[tag])
    X_train_tagged.append(tagged_text_int)

X_test_tagged = []
for tagged_text in X_test_tagged_texts:
    tagged_text_int = []
    for word, tag in tagged_text:
        if tag in tagset:
            tagged_text_int.append(tagset[tag])
    X_test_tagged.append(tagged_text_int)


max_length = max([len(seq) for seq in X_train_tagged])
max_length = max([len(seq) for seq in X_test_tagged])
# Pad sequences to a fixed length
# max_length = 12270
X_train_padded = pad_sequences(X_train_tagged, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_tagged, maxlen=max_length, padding='post')

In [45]:

# LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tagset)+1, output_dim=300, input_length=max_length))
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=5))
model.add(Dense(64, activation='tanh'))
model.add(GlobalMaxPooling1D())
model.add(Dense(2, activation='softmax'))  # 2 classes: generated by same method or not
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# early_stop = EarlyStopping(monitor='loss', patience=3, verbose=1)
# train the model
model.fit(X_train_padded, y_train, epochs=15, batch_size=32)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f2fdc7e4d60>

In [53]:
# tokenizer.fit_on_texts(X_test)
loss, acc = model.evaluate(X_test_padded, y_test)



In [54]:
preds = model.predict(X_test_padded)



In [55]:
preds[0]

array([0.0046443, 0.9953557], dtype=float32)

In [56]:
preds_new = []
for i in preds:
  if(i[0]>i[1]):
    preds_new.append(0.0)
  else:
    preds_new.append(1.0)

In [57]:
preds_new[1]

1.0

In [58]:
y_test_array = np.array(y_test)
y_test_array[1]

1

In [59]:
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score
f1 = f1_score(y_test, preds_new)
print("F1 score = ",f1)
precision = precision_score(y_test, preds_new)
recall = recall_score(y_test, preds_new)
print("Precision = ", precision)
print("Recall = ", recall)
print("Accuracy = ", accuracy_score(y_test, preds_new))

F1 score =  0.8187134502923976
Precision =  0.8333333333333334
Recall =  0.8045977011494253
Accuracy =  0.8181818181818182
