In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import string

In [2]:
#Reading the 5 csv files, each one containts 160 reviews - 80 Real with label 0 and 80 false with label 1
fold1 = pd.read_csv("fold1.csv")
fold2 = pd.read_csv("fold2.csv")
fold3 = pd.read_csv("fold3.csv")
fold4 = pd.read_csv("fold4.csv")
fold5 = pd.read_csv("fold5.csv")

fold1.head(3)

#create a combined dataset for analysis
train_df = pd.concat([fold1,fold2,fold3,fold4], ignore_index=True)
whole_df = pd.concat([fold1,fold2,fold3,fold4,fold5], ignore_index=True)
whole_df.head(3)

Unnamed: 0,Review,Fake
0,We stayed at the Schicago Hilton for 4 days an...,1
1,My $200 Gucci sunglasses were stolen out of my...,0
2,Hotel is located 1/2 mile from the train stati...,1


In [3]:
#LOWERCASE
def custom_lowercase(text):
    """custom function to lowercase the words that only their first character is capital"""
    new_text = ""
    for word in str(text).split():
        if len(word)>1:
            if word[0].isupper() and word[1].islower():
                word = word[0].lower() + word[1:]
                new_text = new_text +" "+ word
            else:
                new_text = new_text +" "+ word
        elif word[0].isupper():
                word = word[0].lower() 
                new_text = new_text +" "+ word
        else:
            new_text = new_text +" "+ word
    return new_text

whole_df["Review"] = whole_df["Review"].apply(lambda text: custom_lowercase(text))

In [4]:
#CONTRACTIONS
import contractions
whole_df["Review"] = whole_df["Review"].apply(lambda x: contractions.fix(x))

In [5]:
#REMOVE PUNCTUATION
#the "0" index of the list is "!" so we skip it
punctuation_list = string.punctuation[1:]
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', punctuation_list))

whole_df["Review"] = whole_df["Review"].apply(lambda text: remove_punctuation(text))

In [6]:
#REMOVE STOPWORDS
# nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
whole_df["Review"] = whole_df["Review"].apply(lambda text: remove_stopwords(text))

In [7]:
#REMOVE FREQUENT WORDS
from collections import Counter

#seperate the dataframes
fake_df = whole_df.loc[whole_df['Fake'] == 1]
real_df = whole_df.loc[whole_df['Fake'] == 0]

def counts(df):
    cnt = Counter()
    for text in df["Review"].values:
        for word in text.split():
            cnt[word] += 1
    return cnt

fake_cnts = counts(fake_df)
real_cnts = counts(real_df)
        
def get_list(lis):
    words = []
    for pair in lis:
        words.append(pair[0])
    return words

fake_common = get_list(fake_cnts.most_common(20))
real_common = get_list(real_cnts.most_common(20))
freq_to_remove = list(set(fake_common).intersection(real_common))

def remove_freq(text, freq):
    """custom function to remove freq words"""
    return " ".join([word for word in str(text).split() if word not in freq])

whole_df["Review"] = whole_df["Review"].apply(lambda text: remove_freq(text,freq_to_remove))

In [8]:
#LEMMATIZATION

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

whole_df["Review"] = whole_df["Review"].apply(lambda text: lemmatize_words(text))

In [8]:
#STEMMING
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

whole_df["Review"] = whole_df["Review"].apply(lambda text: stem_words(text))

In [8]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

X = whole_df['Review']
Y = whole_df['Fake']

uni_vector = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))
X = uni_vector.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle = False)

dec_tree = tree.DecisionTreeClassifier()
params = {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
          'min_samples_split': [2,5,10,15,20],
          'min_samples_leaf': [1,2,4,8,10,20]}

clf_GS = GridSearchCV(dec_tree, params, verbose=1, cv=5)
clf_GS.fit(X_train, y_train)

print(clf_GS.best_estimator_)

y_pred = clf_GS.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 330 candidates, totalling 1650 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1650 out of 1650 | elapsed:  5.7min finished


DecisionTreeClassifier(max_depth=60, min_samples_leaf=4, min_samples_split=20)
[[52 28]
 [32 48]]
              precision    recall  f1-score   support

           0       0.62      0.65      0.63        80
           1       0.63      0.60      0.62        80

    accuracy                           0.62       160
   macro avg       0.63      0.62      0.62       160
weighted avg       0.63      0.62      0.62       160

0.625


In [9]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

X = whole_df['Review']
Y = whole_df['Fake']

bi_vector = TfidfVectorizer(lowercase=False, ngram_range=(1, 2))
X = bi_vector.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle = False)

dec_tree = tree.DecisionTreeClassifier()
params = {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
          'min_samples_split': [2,5,10,15,20],
          'min_samples_leaf': [1,2,4,8,10,20]}

clf_GS = GridSearchCV(dec_tree, params, verbose=1, cv=5)
clf_GS.fit(X_train, y_train)

print(clf_GS.best_estimator_)

y_pred = clf_GS.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 330 candidates, totalling 1650 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1650 out of 1650 | elapsed: 36.0min finished


DecisionTreeClassifier(max_depth=80, min_samples_leaf=10, min_samples_split=5)
[[45 35]
 [34 46]]
              precision    recall  f1-score   support

           0       0.57      0.56      0.57        80
           1       0.57      0.57      0.57        80

    accuracy                           0.57       160
   macro avg       0.57      0.57      0.57       160
weighted avg       0.57      0.57      0.57       160

0.56875
