In [1]:
from platform import python_version

print(python_version())

3.7.6


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score

In [3]:
import gensim
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [4]:
import re

In [5]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [7]:
import random

In [8]:
stop = set(stopwords.words('english'))
random.sample(stopwords.words('english'), 10)

['after', 'himself', "you'll", 'of', 't', 'haven', 'won', 'the', 'to', 'is']

# Loading Data

In [9]:
data_dev = pd.read_csv('../data/CT-FAN/Task3_english_dev.csv')
data_train = pd.read_csv('../data/CT-FAN/Task3_english_training.csv')
data_test = pd.read_csv('../data/CT-FAN/English_data_test_release_with_rating.csv')
data_concat = pd.concat([data_train, data_dev])
data_concat.rename(columns={'our rating':'label'}, inplace=True)
data_test.rename(columns={'our rating':'label'}, inplace=True)
data_concat['label'] = data_concat['label'].apply(lambda x: x.lower())
data_test['label'] = data_test['label'].apply(lambda x: x.lower())

train_dataset = data_concat
test_dataset = data_test

In [10]:
train_dataset.head()

Unnamed: 0,public_id,text,title,label
0,5a228e0e,Distracted driving causes more deaths in Canad...,"You Can Be Fined $1,500 If Your Passenger Is U...",false
1,30c605a1,Missouri politicians have made statements afte...,Missouri lawmakers condemn Las Vegas shooting,partially false
2,c3dea290,Home Alone 2: Lost in New York is full of viol...,CBC Cuts Donald Trump's 'Home Alone 2' Cameo O...,partially false
3,f14e8eb6,But things took a turn for the worse when riot...,Obama’s Daughters Caught on Camera Burning US ...,false
4,faf024d6,It’s no secret that Epstein and Schiff share a...,Leaked Visitor Logs Reveal Schiff’s 78 Visits ...,false


# Data Preprocessing and Data Preparation

In [11]:
train_dataset = train_dataset.fillna("null data")
test_dataset = test_dataset.fillna("null data")

### Removing stopwords

In [12]:
stop_words = stopwords.words('english')

In [13]:
if_stopwords = True

In [14]:
if if_stopwords:
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    test_dataset["title"] = test_dataset["title"].str.lower().str.replace("’", "'")
    test_dataset["title"] = test_dataset["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    test_dataset["text"] = test_dataset["text"].str.lower().str.replace("’", "'")
    test_dataset["text"] = test_dataset["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [15]:
train_dataset.head()

Unnamed: 0,public_id,text,title,label
0,5a228e0e,distracted driving causes deaths canada impair...,"fined $1,500 passenger using mobile phone, sta...",false
1,30c605a1,missouri politicians made statements mass shoo...,missouri lawmakers condemn las vegas shooting,partially false
2,c3dea290,home alone 2: lost new york full violence that...,cbc cuts donald trump's 'home alone 2' cameo b...,partially false
3,f14e8eb6,things took turn worse riot police fired tear ...,obama's daughters caught camera burning us fla...,false
4,faf024d6,secret epstein schiff share long history perve...,leaked visitor logs reveal schiff's 78 visits ...,false


### Preprocessing

In [16]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}

In [17]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [18]:
STOP_WORDS = [preprocessing_text_fn["no_punctuation"](word) for word in stop_words]
random.sample(stop_words, 20)

['hers',
 'with',
 'then',
 'further',
 'or',
 'they',
 'those',
 'why',
 'if',
 'shan',
 'of',
 'yours',
 'her',
 'haven',
 's',
 "it's",
 "mustn't",
 'shouldn',
 'up',
 'on']

In [19]:
def tokenize_without_stopwords(text, stop_words=STOP_WORDS):
    word_tokens = word_tokenize(text)
    filtered_sequence = [word for word in word_tokens if not word.lower() in stop_words]
    return filtered_sequence

In [20]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)
train_dataset.head(10)

Unnamed: 0,public_id,text,title,label
0,5a228e0e,distracted driving causes deaths canada impair...,fined passenger using mobile phone starting ne...,false
1,30c605a1,missouri politicians made statements mass shoo...,missouri lawmakers condemn las vegas shooting,partially false
2,c3dea290,home alone lost new york full violence that op...,cbc cuts donald trumps home alone cameo broadcast,partially false
3,f14e8eb6,things took turn worse riot police fired tear ...,obamas daughters caught camera burning us flag...,false
4,faf024d6,secret epstein schiff share long history perve...,leaked visitor logs reveal schiffs visits epst...,false
5,c03ed5db,nation updated pmbgovernor secretary state geo...,km governor secretary state georgia took money...,other
6,61bd9a69,november us food drug administration fda publi...,fda shocking study cells used vaccines contami...,false
7,bb1999cc,trump confirms bombing accident immediately sc...,israel hits beirut nuclear missile trump leban...,false
8,c1dc1ac6,show antiamerican sentiment surprise precisely...,obamas daughters caught camera burning us flag...,false
9,f2182a54,fema camps portable human cages realall it scr...,fields human cages discovered caruthers califo...,false


In [21]:
test_dataset["title"] = test_dataset["title"].apply(preprocess_text)
test_dataset["text"] = test_dataset["text"].apply(preprocess_text)
test_dataset.head(10)

Unnamed: 0,ID,text,title,label
0,122653045997905671927713471889615536378,deputy secretary us treasury said way end plag...,us treasury deputy sec warns shortages likely ...,partially false
1,275389285957305997321446227088442471741,kabulapproximately twelve minutes us troops wi...,cnn praises taliban wearing masks attack,other
2,333248764296609831067233855420575814716,vast majority oblivious americans dismissed co...,tennessee legalized government covid kidnappin...,false
3,264019763253447756851916399533799891538,natural news theres secret layer information c...,medical shocker scientists sloan kettering dis...,false
4,158073737187690682830899773280916034317,recent study reported ncbi national institutes...,study results facemasks ineffective block tran...,false
5,104668593793347563618573389721561849271,ttav experiencing heavy censorship many social...,neutralize potential damage mrna vaccines,false
6,285745438442578180936789861954628585938,one people hesitant receive covid vaccine know...,cdc quietly admits death toll covid vaccines g...,false
7,142500044244587305036676237978793639344,us deaths related vaccines less months entire ...,exclusive per cdc nearly twice many vaccine re...,partially false
8,240988184127263059916419185466606679317,delta covid variant currently rampant united k...,fully vaccinated people higher chance death du...,false
9,212759982338406281914383909445151155412,natural news via words cdcs director dr rochel...,cdc confesses vaccines failing vaxxed superspr...,false


### Lemmatization and Stemming

In [22]:
if_lemmatize = True

In [23]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    test_dataset["title"] = test_dataset["title"].str.lower().str.replace("’", "'")
    test_dataset["title"] = test_dataset["title"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    test_dataset["text"] = test_dataset["text"].str.lower().str.replace("’", "'")
    test_dataset["text"] = test_dataset["text"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [24]:
train_text = (train_dataset['title'] + " " + train_dataset['text']).values
test_text = (test_dataset['title'] + " " + test_dataset['text']).values

In [25]:
train_labels = train_dataset['label'].values
test_labels = test_dataset['label'].values

In [26]:
label_encoder = preprocessing.LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
train_labels = to_categorical(train_labels, 4)
test_labels = label_encoder.transform(test_labels)
test_labels = to_categorical(test_labels, 4)

## Embedding with word2vec

In [27]:
# Limit length of each article
max_length = 3000
lengths = np.array([len(x) for x in train_text])
train_text = train_text[lengths < max_length]
train_labels = train_labels[lengths < max_length]

In [28]:
# Check actual max length of an article
article_length = max(np.array([len(x) for x in train_text]))
article_length

2997

In [29]:
# Word2vec model with vector size = 100
vec_size = 100

# workers - number of CPU threads
word_model = gensim.models.Word2Vec(train_text, vector_size = vec_size, window = 5, workers = 12)
word_model.train(train_text, epochs = 10, total_examples = len(train_text))
wv = word_model.wv

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
vocabulary_size = len(tokenizer.word_index) + 1
encoded_articles = tokenizer.texts_to_sequences(train_text)

In [31]:
padded_articles = pad_sequences(encoded_articles, maxlen = article_length, padding='post')

In [32]:
padded_articles.shape

(800, 2997)

In [33]:
emb_matrix = np.zeros(shape=(vocabulary_size, vec_size))
for w, i in tokenizer.word_index.items():
    ind = wv.has_index_for(w)
    if ind:
        emb_matrix[i] = wv.get_vector(w)

In [34]:
x_train, x_test, y_train, y_test = train_test_split(padded_articles, train_labels, test_size=0.25)

## LSTM model

In [35]:
from keras.initializers import Constant
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM
from keras.layers import ReLU
from keras.layers import Dropout
from tensorflow.keras.losses import CategoricalCrossentropy

model=Sequential()
model.add(Embedding(input_dim = vocabulary_size, 
                    output_dim = vec_size,
                    input_length = article_length,
                    embeddings_initializer = Constant(emb_matrix))
         )
model.add(LSTM(16, return_sequences=True))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(8,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(4, activation='softmax'))

In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2997, 100)         1713300   
                                                                 
 lstm (LSTM)                 (None, 2997, 16)          7488      
                                                                 
 flatten (Flatten)           (None, 47952)             0         
                                                                 
 dense (Dense)               (None, 32)                1534496   
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 8)                 264       
                                                                 
 dropout_1 (Dropout)         (None, 8)                 0

In [37]:
import keras.backend as K
def keras_f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [38]:
model.compile(optimizer=Adam(),loss=CategoricalCrossentropy(), metrics=['accuracy'])

## Training

In [39]:
epochs=10
batch_size=32

In [40]:
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ec9f32ec48>

In [41]:
train_pred = np.argmax(model.predict(x_train), axis=1)
train_truth = np.argmax(y_train, axis=1)



In [42]:
# Accuracy
accuracy_score(train_truth, train_pred)

0.8866666666666667

In [43]:
# Balanced accuracy
balanced_accuracy_score(train_pred, train_truth)

0.9166415923113651

## Applying on test dataset

In [44]:
# Limit length of each test article
lengths = np.array([len(x) for x in test_text])
test_text = test_text[lengths <= article_length]
test_labels = test_labels[lengths <= article_length]

In [45]:
encoded_test_articles = tokenizer.texts_to_sequences(test_text)

In [46]:
padded_test_articles = pad_sequences(encoded_test_articles, maxlen = article_length, padding='post')

In [47]:
test_pred = np.argmax(model.predict(padded_test_articles), axis=1)
test_truth = np.argmax(test_labels, axis=1)



In [48]:
# Accuracy
accuracy_score(test_pred, test_truth)

0.4581005586592179

In [49]:
# Balanced accuracy
balanced_accuracy_score(test_truth, test_pred)

0.31610367984775567