In [None]:
#  this code reference to https://www.kaggle.com/code/lunamcbride24/covid19-tweet-truth-analysis


import numpy as np  
import pandas as pd  

import nltk 
from nltk.corpus import stopwords  

import re 
import html  
import string 

import tensorflow as tf  
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences  
import tensorflow.keras.layers as L  
from tensorflow.keras.optimizers import Adam  

from tensorflow.keras.losses import SparseCategoricalCrossentropy  
from sklearn.model_selection import train_test_split  
 
import os
 

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Read Data & Data Clean

In [None]:
dataset_test = (pd.read_csv("test_dataset_pure.csv")[lambda x: (x['status'] == 'real') | (x['status'] == 'fake')])

In [None]:
dataset_train = (pd.read_csv("train_dataset_pure.csv", encoding= 'unicode_escape')[lambda x: (x['status'] == 'real') | (x['status'] == 'fake')])


In [None]:
dataset_org = (pd.read_csv("all_sentences.csv", encoding= 'unicode_escape')[lambda x: (x['status'] == 'real') | (x['status'] == 'fake')])

In [None]:
# twTrain = pd.read_csv("Constraint_Train/Constraint_Train.csv") #Load the tweet (tw) training set
# twValid = pd.read_csv("Constraint_Val.csv") #Load the tweet (tw) validation set
# twTest = pd.read_csv("Constraint_Test.csv") #Load the tweet (tw) testing set

In [None]:
# Remove stopword and punctionaltion
punctuations = string.punctuation #List of punctuations to remove

STOP = stopwords.words("english") #Get the NLTK stopwords


In [None]:
def cleanTweets(line):
  line = line.str.replace("<br />",'', case=False)
  line = line.str.lower()
  line = line.replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True).replace(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", regex=True)
  line = line.astype(str).apply(lambda x: ' '.join([word for word in x.split() if not word in STOP]))
  return line

In [None]:
dataset_train["cleanTweet"] = cleanTweets(dataset_train["contents"]) #Clean the dataset
dataset_train["encodedLabel"] = pd.get_dummies(dataset_train["status"])["real"] #Get the encoded labels from the "real" dummies

In [None]:
dataset_test['cleanTweet'] = cleanTweets(dataset_test["contents"])
dataset_test["encodedLabel"] = pd.get_dummies(dataset_test["status"])["real"]

In [None]:
dataset_org['cleanTweet'] = cleanTweets(dataset_org["contents"])
dataset_org["encodedLabel"] = pd.get_dummies(dataset_org["status"])["real"]

In [None]:
from nltk.stem import WordNetLemmatizer
  
tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemma(x):
    res = ''
    for w in tokenizer.tokenize(x):
        res = res + lemmatizer.lemmatize(w) + ' '
    
    res = res[:-1] 
    return res

In [None]:
dataset_train["cleanTweet"] = (dataset_train["cleanTweet"].apply(lambda x: lemma(x)))
dataset_test["cleanTweet"] = (dataset_test["cleanTweet"].apply(lambda x: lemma(x)))

In [None]:
dataset_org["cleanTweet"] = (dataset_org["cleanTweet"].apply(lambda x: lemma(x)))

In [None]:
combine_df = [dataset_train, dataset_test]
result = pd.concat(combine_df)

# TFIDF

In [None]:
# perform TFIDF feature extraction
# from sklearn.feature_extraction.text import TfidfVectorizer

# combined = dataset2["cleanTweet"].copy().append(dataset2["cleanTweet"].copy())
# tfIdfVectorizer=TfidfVectorizer(use_idf=True)
# dataset_tfidf = tfIdfVectorizer.fit_transform(combined)

In [None]:
# perform TFIDF feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorizer = TfidfVectorizer(ngram_range=(1,3), max_df=0.75, min_df=1)
# dataset_tfidf = tfIdfVectorizer.fit_transform(dataset_org.cleanTweet)
# dataset_tfidf = tfIdfVectorizer.fit_transform(dataset2.cleanTweet)

In [None]:
combine_tfidf = tfIdfVectorizer.fit_transform(result.cleanTweet)

In [None]:
combine_tfidf.shape

(8972, 234941)

# Simple Model

In [None]:
test_tfidf = combine_tfidf[len(dataset_train):]
train_tfidf = combine_tfidf[:len(dataset_train)]
print(train_tfidf.shape)

(7182, 234941)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset_tfidf,
                            dataset_train.encodedLabel,
                            test_size=0.2, 
                            random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train,
                            y_train,
                            test_size=0.2, 
                            random_state=42)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataset_tfidf,
                            dataset_org.encodedLabel,
                            test_size=0.2, 
                            random_state=42)

In [None]:
X_train = train_tfidf
y_train = dataset_train.encodedLabel

In [None]:
X_test = test_tfidf
y_test = dataset_test.encodedLabel

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn import metrics
from sklearn.metrics import classification_report
from itertools import islice
from sklearn.metrics import recall_score

In [None]:
p = Perceptron()
p.fit((X_train), y_train)

# Given the model, predict on test split
predicted = p.predict((X_test))

# Calculate matrix
accuracy = metrics.accuracy_score(y_test, predicted)

# Printing mertrix in each class    
print("Average Perceptron accuracy = " + str (accuracy))
print("Average Perceptron f1score = " + str (f1_score(y_test, predicted, average=None)))
print("Average Recall = " + str(recall_score(y_test, predicted, average=None)))

Average Perceptron accuracy = 0.9195530726256983
Average Perceptron f1score = [0.90082645 0.93233083]
Average Recall = [0.88738128 0.94207028]


In [None]:
# svm
from sklearn import svm

clf = svm.LinearSVC() 
clf.fit(X_train, y_train)

predicted = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted)

print("Average SVM accuracy:" + str(accuracy))
print("Average SVM f1score = " + str (f1_score(y_test, predicted, average=None)))
print("Average Recall = " + str(recall_score(y_test, predicted, average=None)))


Average SVM accuracy:0.923463687150838
Average SVM f1score = [0.90519031 0.93583138]
Average Recall = [0.88738128 0.94871795]


In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression( max_iter=100 )

clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, predicted)

print("Average LogisticRegression accuracy:" + str(accuracy))
print("Average LogisticRegression f1score = " + str (f1_score(y_test, predicted, average=None)))
print("Average Recall = " + str(recall_score(y_test, predicted, average=None)))

Average LogisticRegression accuracy:0.8810055865921788
Average LogisticRegression f1score = [0.84349743 0.90401082]
Average Recall = [0.77883311 0.95251662]


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, predicted)

print("Average RandomForestClassifier accuracy:" + str(accuracy))
print("Average RandomForestClassifier f1score = " + str (f1_score(y_test, predicted, average=None)))
print("Average Recall = " + str(recall_score(y_test, predicted, average=None)))

Average RandomForestClassifier accuracy:0.588268156424581
Average RandomForestClassifier f1score = [0.        0.7407668]
Average Recall = [0. 1.]


In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, predicted)

print("Average KNeighborsClassifier accuracy:" + str(accuracy))
print("Average KNeighborsClassifier f1score = " + str (f1_score(y_test, predicted, average=None)))
print("Average Recall = " + str(recall_score(y_test, predicted, average=None)))

Average KNeighborsClassifier accuracy:0.8368715083798882
Average KNeighborsClassifier f1score = [0.82535885 0.84696017]
Average Recall = [0.93622795 0.76733143]


In [None]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)
predicted = naive_bayes_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted)

print("Average NaiveBayes accuracy:" + str(accuracy))
print("Average NaiveBayes f1score = " + str (f1_score(y_test, predicted, average=None)))
print("Average Recall = " + str(recall_score(y_test, predicted, average=None)))

Average NaiveBayes accuracy:0.8167597765363128
Average NaiveBayes f1score = [0.71577123 0.86479802]
Average Recall = [0.56037992 0.99620133]
