In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import string

In [2]:
#Reading the 5 csv files, each one containts 160 reviews - 80 Real with label 0 and 80 false with label 1
fold1 = pd.read_csv("fold1.csv")
fold2 = pd.read_csv("fold2.csv")
fold3 = pd.read_csv("fold3.csv")
fold4 = pd.read_csv("fold4.csv")
fold5 = pd.read_csv("fold5.csv")

fold1.head(3)

#create a combined dataset for analysis
train_df = pd.concat([fold1,fold2,fold3,fold4], ignore_index=True)
whole_df = pd.concat([fold1,fold2,fold3,fold4,fold5], ignore_index=True)
whole_df.head(3)

Unnamed: 0,Review,Fake
0,We stayed at the Schicago Hilton for 4 days an...,1
1,My $200 Gucci sunglasses were stolen out of my...,0
2,Hotel is located 1/2 mile from the train stati...,1


In [3]:
#LOWERCASE
def custom_lowercase(text):
    """custom function to lowercase the words that only their first character is capital"""
    new_text = ""
    for word in str(text).split():
        if len(word)>1:
            if word[0].isupper() and word[1].islower():
                word = word[0].lower() + word[1:]
                new_text = new_text +" "+ word
            else:
                new_text = new_text +" "+ word
        elif word[0].isupper():
                word = word[0].lower() 
                new_text = new_text +" "+ word
        else:
            new_text = new_text +" "+ word
    return new_text

whole_df["Review"] = whole_df["Review"].apply(lambda text: custom_lowercase(text))

In [4]:
#CONTRACTIONS
import contractions
whole_df["Review"] = whole_df["Review"].apply(lambda x: contractions.fix(x))

In [5]:
#REMOVE PUNCTUATION
#the "0" index of the list is "!" so we skip it
punctuation_list = string.punctuation[1:]
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', punctuation_list))

whole_df["Review"] = whole_df["Review"].apply(lambda text: remove_punctuation(text))

In [6]:
#REMOVE STOPWORDS
# nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
whole_df["Review"] = whole_df["Review"].apply(lambda text: remove_stopwords(text))

In [7]:
#REMOVE FREQUENT WORDS
from collections import Counter

#seperate the dataframes
fake_df = whole_df.loc[whole_df['Fake'] == 1]
real_df = whole_df.loc[whole_df['Fake'] == 0]

def counts(df):
    cnt = Counter()
    for text in df["Review"].values:
        for word in text.split():
            cnt[word] += 1
    return cnt

fake_cnts = counts(fake_df)
real_cnts = counts(real_df)
        
def get_list(lis):
    words = []
    for pair in lis:
        words.append(pair[0])
    return words

fake_common = get_list(fake_cnts.most_common(20))
real_common = get_list(real_cnts.most_common(20))
freq_to_remove = list(set(fake_common).intersection(real_common))

def remove_freq(text, freq):
    """custom function to remove freq words"""
    return " ".join([word for word in str(text).split() if word not in freq])

whole_df["Review"] = whole_df["Review"].apply(lambda text: remove_freq(text,freq_to_remove))

In [8]:
#LEMMATIZATION

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

whole_df["Review"] = whole_df["Review"].apply(lambda text: lemmatize_words(text))

In [8]:
#STEMMING
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

whole_df["Review"] = whole_df["Review"].apply(lambda text: stem_words(text))

In [9]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X = whole_df['Review']
Y = whole_df['Fake']

uni_vector = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))
X = uni_vector.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle = False)

ran_tree = RandomForestClassifier()
params = {'max_depth': [50, 60, 70, 80, 90, 100],
          'min_samples_leaf': [3, 4, 5],
          'min_samples_split': [8, 10, 12],
          'n_estimators': [100, 200, 300, 1000, 2000]}

clf_GS = GridSearchCV(ran_tree, params, cv = 3, n_jobs = -1, verbose = 2)
clf_GS.fit(X_train, y_train)

print(clf_GS.best_estimator_)

y_pred = clf_GS.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

Fitting 3 folds for each of 270 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  7.9min finished


RandomForestClassifier(max_depth=70, min_samples_leaf=4, min_samples_split=10,
                       n_estimators=1000)
[[72  8]
 [18 62]]
              precision    recall  f1-score   support

           0       0.80      0.90      0.85        80
           1       0.89      0.78      0.83        80

    accuracy                           0.84       160
   macro avg       0.84      0.84      0.84       160
weighted avg       0.84      0.84      0.84       160

0.8375


In [10]:
X = whole_df['Review']
Y = whole_df['Fake']

bi_vector = TfidfVectorizer(lowercase=False, ngram_range=(1, 2))
X = bi_vector.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle = False)

ran_tree = RandomForestClassifier()
params = {'max_depth': [50, 60, 70, 80, 90, 100],
          'min_samples_leaf': [3, 4, 5],
          'min_samples_split': [8, 10, 12],
          'n_estimators': [100, 200, 300, 1000, 2000]}

clf_GS = GridSearchCV(ran_tree, params, cv = 3, n_jobs = -1, verbose = 2)
clf_GS.fit(X_train, y_train)

print(clf_GS.best_estimator_)

y_pred = clf_GS.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

Fitting 3 folds for each of 270 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 23.9min finished


RandomForestClassifier(max_depth=90, min_samples_leaf=3, min_samples_split=8,
                       n_estimators=2000)
[[71  9]
 [19 61]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.84        80
           1       0.87      0.76      0.81        80

    accuracy                           0.82       160
   macro avg       0.83      0.82      0.82       160
weighted avg       0.83      0.82      0.82       160

0.825
