# Yelp Fake Review Detection 03182023

## 1. Combine review content with labeling

### a) loading reviewContent

In [217]:
import pandas as pd
file_review="D:/Group Project/YelpZip/reviewContent.txt"
review=pd.read_csv(file_review,sep="\t",header=None)
review.columns=['user_id', 'prod_id', 'date', 'review']

In [218]:
print(review.head())
print("number of rows of reviewContent:",len(review))

   user_id  prod_id        date  \
0     5044        0  2014-11-16   
1     5045        0  2014-09-08   
2     5046        0  2013-10-06   
3     5047        0  2014-11-30   
4     5048        0  2014-08-28   

                                              review  
0  Drinks were bad, the hot chocolate was watered...  
1  This was the worst experience I've ever had a ...  
2  This is located on the site of the old Spruce ...  
3  I enjoyed coffee and breakfast twice at Toast ...  
4  I love Toast! The food choices are fantastic -...  
number of rows of reviewContent: 608458


#### For some reason, the Notepad++ reads 608598 rows from reviewContent, but the python only reads 608458 rows.

### b) loading metadata

In [219]:
file_meta="D:/Group Project/YelpZip/metadata.txt"
meta=pd.read_csv(file_meta,sep="\t",header=None)
meta.columns=["user_id","prod_id","rating","label","date"]

In [220]:
print(meta.head())
print("number of rows of meta:",len(meta))

   user_id  prod_id  rating  label        date
0     5044        0     1.0     -1  2014-11-16
1     5045        0     1.0     -1  2014-09-08
2     5046        0     3.0     -1  2013-10-06
3     5047        0     5.0     -1  2014-11-30
4     5048        0     5.0     -1  2014-08-28
number of rows of meta: 608598


### c) combine the data

In [221]:
join_data=review.merge(meta,on=["user_id","date","prod_id"],how="left")
print(join_data.head())
print("length of join_data:",len(join_data))

   user_id  prod_id        date  \
0     5044        0  2014-11-16   
1     5045        0  2014-09-08   
2     5046        0  2013-10-06   
3     5047        0  2014-11-30   
4     5048        0  2014-08-28   

                                              review  rating  label  
0  Drinks were bad, the hot chocolate was watered...     1.0     -1  
1  This was the worst experience I've ever had a ...     1.0     -1  
2  This is located on the site of the old Spruce ...     3.0     -1  
3  I enjoyed coffee and breakfast twice at Toast ...     5.0     -1  
4  I love Toast! The food choices are fantastic -...     5.0     -1  
length of join_data: 608458


### The length of combined data is 608458.

### d) export combined data to txt

In [222]:
from numpy import savetxt
savetxt("orig_review_with_labeling.txt",join_data,fmt="%s",delimiter="\t",encoding="utf-8")

#### Combining the "orig_review_with_labeling.txt" with metadata again can generate dataset with 608598 rows without missing samples.

In [229]:
file_review2="C:/Users/Lu/PycharmProjects/Group_Project/orig_review_with_labeling.txt"
review2=pd.read_csv(file_review2,sep="\t",header=None)
review2.columns=['user_id', 'prod_id', 'date', 'review', 'rating', 'label']
print(review2.head())
print("number of rows of reviewContent:",len(review2))
print("check the null data:",review2.isnull().values.any())

   user_id  prod_id        date  \
0     5044        0  2014-11-16   
1     5045        0  2014-09-08   
2     5046        0  2013-10-06   
3     5047        0  2014-11-30   
4     5048        0  2014-08-28   

                                              review  rating  label  
0  Drinks were bad, the hot chocolate was watered...     1.0   -1.0  
1  This was the worst experience I've ever had a ...     1.0   -1.0  
2  This is located on the site of the old Spruce ...     3.0   -1.0  
3  I enjoyed coffee and breakfast twice at Toast ...     5.0   -1.0  
4  I love Toast! The food choices are fantastic -...     5.0   -1.0  
number of rows of reviewContent: 608598
check the null data: True


#### Though 608598 rows are read by python from the combined data this time, there are missing values. So the two datasets are combined again.

In [230]:
file_meta2="D:/Group Project/YelpZip/metadata.txt"
meta2=pd.read_csv(file_meta2,sep="\t",header=None)
meta2.columns=["user_id","prod_id","rating","label","date"]
print(meta2.head())
print("number of rows of meta:",len(meta2))

   user_id  prod_id  rating  label        date
0     5044        0     1.0     -1  2014-11-16
1     5045        0     1.0     -1  2014-09-08
2     5046        0     3.0     -1  2013-10-06
3     5047        0     5.0     -1  2014-11-30
4     5048        0     5.0     -1  2014-08-28
number of rows of meta: 608598


In [231]:
review2=review2.drop(["label","rating"],axis=1)
join_data2=review2.merge(meta2,on=["user_id","date","prod_id"],how="left")
print(join_data2.head())
print("length of join_data:",len(join_data2))
print("check the null data:",join_data2.isnull().values.any())

   user_id  prod_id        date  \
0     5044        0  2014-11-16   
1     5045        0  2014-09-08   
2     5046        0  2013-10-06   
3     5047        0  2014-11-30   
4     5048        0  2014-08-28   

                                              review  rating  label  
0  Drinks were bad, the hot chocolate was watered...     1.0     -1  
1  This was the worst experience I've ever had a ...     1.0     -1  
2  This is located on the site of the old Spruce ...     3.0     -1  
3  I enjoyed coffee and breakfast twice at Toast ...     5.0     -1  
4  I love Toast! The food choices are fantastic -...     5.0     -1  
length of join_data: 608598
check the null data: False


In [232]:
savetxt("orig_review_with_labeling_608598rows.txt",join_data2,fmt="%s",delimiter="\t",encoding="utf-8")

## 2. Preprocessing of the review content

In [278]:
file_review3="C:/Users/Lu/PycharmProjects/Group_Project/orig_review_with_labeling_608598rows.txt"
review3=pd.read_csv(file_review3,sep="\t",header=None)
review3.columns=['user_id', 'prod_id', 'date', 'review', 'rating', 'label']
print(review3.head())
print("length of data:",len(review3))
print("check the null data:",review3.isnull().values.any())

   user_id  prod_id        date  \
0     5044        0  2014-11-16   
1     5045        0  2014-09-08   
2     5046        0  2013-10-06   
3     5047        0  2014-11-30   
4     5048        0  2014-08-28   

                                              review  rating  label  
0  Drinks were bad, the hot chocolate was watered...     1.0     -1  
1  This was the worst experience I've ever had a ...     1.0     -1  
2  This is located on the site of the old Spruce ...     3.0     -1  
3  I enjoyed coffee and breakfast twice at Toast ...     5.0     -1  
4  I love Toast! The food choices are fantastic -...     5.0     -1  
length of data: 608598
check the null data: False


In [279]:
data=review3.copy()
print(len(data.columns))

6


In [287]:
import string
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
length=len(data)
empty_col=np.empty([length,1])
data=np.append(data,empty_col,1)
print(data.shape)
for i in range(length):
    # a.lower the text
    text=data[i,3].lower()
    
    # b.remove white spaces if there's any        No difference after tokenization, so this step can be removed
    # text = text.strip()
    
    # b.contraction words
    text = text.replace("_", " ")
    text = re.sub(br'(\xc2)(.)', b'', text.encode('utf-8')).decode()
    text = text.replace("can't", "can not")
    text = text.replace("won't", "will not")
    text = text.replace("'ve"," have")
    text = text.replace("'d"," had")
    text = text.replace("'m", " am")
    text = text.replace("'ll", " will")
    text = text.replace("'s", " is")
    text = text.replace("n't", " not")
    text = text.replace("'re", " are")
    text = text.replace("st.", "street")
    text = text.replace("bldg.", "building") 
    
    # c.deal with punctuation such as ‘!”#$%&'()*+,-./:;?@[\]^_`{|}~’, and including "...","???"...
    text=re.sub(r"[^\w\s]", " ", text) 

#     # d.remove punctuation marks such as ‘!”#$%&'()*+,-./:;?@[\]^_`{|}~’
#     text = "".join([i for i in text if i not in string.punctuation])

    # d.remove numbers
    text = re.sub(r"\d+", "", text)

    # e.tokenization
    word_tokens = word_tokenize(text)  #this is a list

    # f.remove stopwords
    stop_words = stopwords.words("english")
    text=[w for w in word_tokens if w not in stop_words]

    # g.lemmatization
    text = [WordNetLemmatizer().lemmatize(word,"a") for word in text]
    text = [WordNetLemmatizer().lemmatize(word, "v") for word in text]
    text = [WordNetLemmatizer().lemmatize(word, "n") for word in text]
    text = [WordNetLemmatizer().lemmatize(word, "s") for word in text]
    text = [WordNetLemmatizer().lemmatize(word, "r") for word in text]

    data[i,6]=text
#print(data[0:5])

(608598, 7)


### It takes a while to run the preprocessing of every text content, so the output was saved to txt file for convenience.

In [289]:
savetxt("orig_review_with_labeling_608598rows_af_lemma.txt",data,fmt="%s",delimiter="\t",encoding="utf-8")

## 3. Creating features

In [290]:
file_review4="D:/Lu Yu/orig_review_with_labeling_608598rows_af_lemma.txt"
review4=pd.read_csv(file_review4,sep="\t",header=None)
review4.columns=['user_id', 'prod_id', 'date', 'review', 'rating', 'label','lemma']
print(review4.head())
print("length of data:",len(review4))
print("check the null data:",review4.isnull().values.any())

   user_id  prod_id        date  \
0     5044        0  2014-11-16   
1     5045        0  2014-09-08   
2     5046        0  2013-10-06   
3     5047        0  2014-11-30   
4     5048        0  2014-08-28   

                                              review  rating  label  \
0  Drinks were bad, the hot chocolate was watered...     1.0     -1   
1  This was the worst experience I've ever had a ...     1.0     -1   
2  This is located on the site of the old Spruce ...     3.0     -1   
3  I enjoyed coffee and breakfast twice at Toast ...     5.0     -1   
4  I love Toast! The food choices are fantastic -...     5.0     -1   

                                               lemma  
0  ['drink', 'bad', 'hot', 'chocolate', 'water', ...  
1  ['bad', 'experience', 'ever', 'casual', 'coffe...  
2  ['locate', 'site', 'old', 'spruce', 'street', ...  
3  ['enjoy', 'coffee', 'breakfast', 'twice', 'toa...  
4  ['love', 'toast', 'food', 'choice', 'fantastic...  
length of data: 608598
check the

### Checking if some words are still in the word list

In [301]:
lemma_test=review4[['lemma']]
count=0
for i in range(len(review4)):
    if 'drink' in lemma_test.iloc[i][0]:
        #print("i:",i)
        count+=1
print("count:",count)

count: 79244


In [299]:
lemma_test=review4[['lemma']]
count=0
for i in range(len(review4)):
    if 'xadinterest' in lemma_test.iloc[i][0]:
        print("i:",i)
        count+=1
print("count:",count)

count: 0


### The words start with x, such as xad has already been removed

In [294]:
vectorizer = TfidfVectorizer()
vectorizer.fit(review4.lemma)
sorted(vectorizer.get_feature_names_out()[-3500:-2000])

['woww',
 'wowwed',
 'wowwee',
 'wowwer',
 'wowwie',
 'wowwing',
 'wowwowow',
 'wowww',
 'wowwww',
 'wowwwww',
 'wowwwwwed',
 'wowwwwweee',
 'wowwwwww',
 'wowwwwwww',
 'wowwwwwwww',
 'wowwwwwwwww',
 'wowwwwwwwwwwww',
 'wowwy',
 'wowy',
 'wowza',
 'wowzaaa',
 'wowzah',
 'wowzas',
 'wowzer',
 'wowzers',
 'wowzerz',
 'wowzie',
 'wowzzz',
 'wp',
 'wpa',
 'wpb',
 'wpe',
 'wph',
 'wphilly',
 'wpicy',
 'wplenty',
 'wps',
 'wpstreetcom',
 'wpu',
 'wpunj',
 'wpxo',
 'wq',
 'wqas',
 'wqs',
 'wqwqdesert',
 'wr',
 'wra',
 'wrack',
 'wrackedly',
 'wram',
 'wrangle',
 'wrangler',
 'wrap',
 'wraparound',
 'wrapepd',
 'wrapido',
 'wrappd',
 'wrappe',
 'wrappede',
 'wrappen',
 'wrapper',
 'wrapping',
 'wrappper',
 'wrassle',
 'wrassler',
 'wrasslers',
 'wrath',
 'wray',
 'wrd',
 'wre',
 'wreak',
 'wreath',
 'wreathe',
 'wreck',
 'wreckage',
 'wreckbass',
 'wrecker',
 'wreckfish',
 'wreckless',
 'wreckroom',
 'wree',
 'wren',
 'wrench',
 'wrenchingly',
 'wrere',
 'wresle',
 'wrest',
 'wrestle',
 'wrestl

### Some strange words written by reviewers are still remained: 

In [297]:
sorted(vectorizer.get_feature_names_out()[0:20])

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaa',
 'aaaaaaaaaaaa',
 'aaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaasaaaaaaaaaammmmmmmmmmmmmaaaaaaaaaa',
 'aaaaaaaaaaaaaand',
 'aaaaaaaaaaaaah',
 'aaaaaaaaaaaallll',
 'aaaaaaaaaaaamazing',
 'aaaaaaaaaaamazing',
 'aaaaaaaaaarrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr',
 'aaaaaaaaamazing',
 'aaaaaaaaand',
 'aaaaaaaahh']

### Attention: some of the reviews are written in other languages, such as French, Chinese, Japanese...

In [298]:
sorted(vectorizer.get_feature_names_out()[-50:])

['腌笃鲜',
 '臊子面不到刀',
 '菜品也好',
 '葱油饼',
 '薄い',
 '虽然他们比较忙',
 '蝦米腸粉',
 '蝦餃',
 '蟹粉小笼包',
 '装修也好',
 '西安美食还有成都美食',
 '要是老板可以改名字就更好了',
 '许多清淡的菜味道一样的棒',
 '豆瓣魚',
 '超正',
 '超軽量といった魅了する',
 '软骨',
 '辛いので注意',
 '还好电话没变',
 '还问我们要不要水什么的',
 '这个吃货来讲根本是无法抵抗呀',
 '这么俗的店名差点就让我错过了这里的美食',
 '这些菜还没自己在家里做的好吃也是醉了',
 '这家餐厅就像一碗甜美的汤',
 '这家餐馆相当的棒',
 '进去可以直接说普通话了',
 '這家餐廳非常好',
 '都很好',
 '酸的很奇怪',
 '酸菜',
 '酸辣鸡杂',
 '醋溜土豆丝用白醋',
 '里面是芝士加牛肉的也特别好吃',
 '量は特盛クラス',
 '銀針粉',
 '门口还有好几个排队',
 '雲丹の塩昆布焼き',
 '非常的脆',
 '非常糟糕的服务',
 '面包硬的挑战你的口腔',
 '順番がきたらチキンオーバーライスと告げ',
 '顺带一提餐厅的装修我很喜欢',
 '食べ物はとても良いです',
 '食物多为冷藏加热',
 '首先菜不新鲜',
 '魚香茄子雞',
 '鱼片和手指头差不多厚',
 '麻婆豆腐也不錯',
 '黑木耳',
 '黒糖梅酒']

In [304]:
review4=review4[['user_id', 'prod_id', 'rating', 'label','lemma']]
print(review4.head())

   user_id  prod_id  rating  label  \
0     5044        0     1.0     -1   
1     5045        0     1.0     -1   
2     5046        0     3.0     -1   
3     5047        0     5.0     -1   
4     5048        0     5.0     -1   

                                               lemma  
0  ['drink', 'bad', 'hot', 'chocolate', 'water', ...  
1  ['bad', 'experience', 'ever', 'casual', 'coffe...  
2  ['locate', 'site', 'old', 'spruce', 'street', ...  
3  ['enjoy', 'coffee', 'breakfast', 'twice', 'toa...  
4  ['love', 'toast', 'food', 'choice', 'fantastic...  


### a) create number of words, number of verbs, average word length, and emotiveness ratio

In [305]:
%%time
import nltk
from nltk import pos_tag
length=len(review4)
empty_col=np.empty([length,4])
review4=np.append(review4,empty_col,1)
for i in range(length):
    text = review4[i, 4]
    text=text[1:-1]
    text=text.replace("'","")
    text=text.split(",")
    text=[w.strip() for w in text]

    num_of_words=len(text)
    word_tag = pos_tag(text)
    num_of_verb=0
    num_of_adj=0
    num_of_adv=0
    num_of_noun=0
    sum_len_word=0
    for word, tag in word_tag:
        len_word=len(word)
        sum_len_word += len_word

        tag = tag[0:2].lower()
        if tag=="vb":
            num_of_verb+=1
        elif tag=="jj":
            num_of_adj+=1
        elif tag=="rb":
            num_of_adv+=1
        elif tag=="nn":
            num_of_noun+=1

    avg_word_len=sum_len_word/len(text)
    review4[i,5]=num_of_words
    review4[i,6]=num_of_verb
    review4[i,7]=avg_word_len

    if num_of_noun+num_of_verb!=0:
        emotiveness_ratio=(num_of_adj+num_of_adv)/(num_of_noun+num_of_verb)
    else:
        emotiveness_ratio=0
    review4[i,8]=emotiveness_ratio
    #print(num_of_words,num_of_verb,avg_word_len,emotiveness_ratio)

Wall time: 52min 20s


In [306]:
savetxt("review_features_01.txt",review4,fmt="%s",delimiter="\t",encoding="utf-8")

### b) create number of positive words, number of negative words and sentiment

In [307]:
file_review5="D:/Lu Yu/review_features_01.txt"
review5=pd.read_csv(file_review5,sep="\t",header=None)
review5.columns=['user_id', 'prod_id', 'rating', 'label','list_words','num_of_words',
                 'num_of_verbs','avg_word_len','emotiveness']
print(review5.head())
print("length of data:",len(review5))
print("check the null data:",review5.isnull().values.any())

   user_id  prod_id  rating  label  \
0     5044        0     1.0     -1   
1     5045        0     1.0     -1   
2     5046        0     3.0     -1   
3     5047        0     5.0     -1   
4     5048        0     5.0     -1   

                                          list_words  num_of_words  \
0  ['drink', 'bad', 'hot', 'chocolate', 'water', ...            17   
1  ['bad', 'experience', 'ever', 'casual', 'coffe...           115   
2  ['locate', 'site', 'old', 'spruce', 'street', ...            24   
3  ['enjoy', 'coffee', 'breakfast', 'twice', 'toa...           128   
4  ['love', 'toast', 'food', 'choice', 'fantastic...            77   

   num_of_verbs  avg_word_len  emotiveness  
0             2      4.882353     0.416667  
1            17      5.452174     0.379747  
2             3      5.291667     0.600000  
3            15      5.664062     0.465116  
4            11      5.337662     0.520000  
length of data: 608598
check the null data: False


In [308]:
file_positive="C:/Users/Lu/Desktop/Opinion Lexicon/positive-words.txt"
file_negative="C:/Users/Lu/Desktop/Opinion Lexicon/negative-words.txt"
positive_list=open(file_positive,"r").read().split()
negative_list=open(file_negative,"r").read().split()

In [309]:
empty_col2=np.empty([length,3])
review5=np.append(review5,empty_col2,1)

In [310]:
%%time
for i in range(length):
    num_positive=0
    num_negative=0
    text = review5[i, 4]
    text=text[1:-1]
    text=text.replace("'","")
    text=text.split(",")
    text=[w.strip() for w in text]
    len_text=len(text)
    for m in text:
        for n in positive_list:
            if m==n:
                num_positive+=1
        for k in negative_list:
            if m==k:
                num_negative+=1
    sentiment=(num_positive-num_negative)/len_text
    review5[i,9]=num_positive
    review5[i,10]=num_negative
    review5[i,11]=sentiment
# print(review5[0:5])

Wall time: 4h 35min 9s


In [128]:
print(review5[0:2])

[[5044 0 1.0 -1
  "['drink', 'bad', 'hot', 'chocolate', 'water', 'latte', 'burn', 'taste', 'food', 'also', 'poor', 'quality', 'service', 'bad', 'part', 'cashier', 'rude']"
  17 2 4.882352941176471 0.4166666666666667 1 5 -0.23529411764705882]
 [5045 0 1.0 -1
  "['bad', 'experience', 'ever', 'casual', 'coffee', 'light', 'fare', 'place', 'server', 'disappear', 'minute', 'talk', 'friend', 'window', 'girlfriend', 'sit', 'dumbfound', 'dude', 'nerve', 'job', 'try', 'make', 'eye', 'contact', 'clearly', 'get', 'pay', 'talk', 'bud', 'important', 'girlfriend', 'go', 'counter', 'server', 'disappear', 'back', 'another', 'minute', 'guy', 'ask', 'order', 'food', 'something', 'girl', 'counter', 'give', 'weird', 'look', 'say', 'get', 'server', 'arrive', 'back', 'look', 'table', 'laugh', 'yeah', 'leave', 'u', 'hang', 'half', 'goddamn', 'hour', 'place', 'two', 'customer', 'funny', 'retrospect', 'collective', 'incompetence', 'false', 'sense', 'entitlement', 'certainly', 'food', 'okay', 'place', 'call', 't

In [311]:
savetxt("review_features_02.txt",review5,fmt="%s",delimiter="\t",encoding="utf-8")

### c) create lexical diversity

In [312]:
file_review6="D:/Lu Yu/orig_review_with_labeling_608598rows_af_lemma.txt"
review6=pd.read_csv(file_review6,sep="\t",header=None)
review6.columns=['user_id', 'prod_id', 'date', 'review', 'rating', 'label','lemma']
review6=review6[['user_id','lemma']]
print(review6.head())

   user_id                                              lemma
0     5044  ['drink', 'bad', 'hot', 'chocolate', 'water', ...
1     5045  ['bad', 'experience', 'ever', 'casual', 'coffe...
2     5046  ['locate', 'site', 'old', 'spruce', 'street', ...
3     5047  ['enjoy', 'coffee', 'breakfast', 'twice', 'toa...
4     5048  ['love', 'toast', 'food', 'choice', 'fantastic...


In [313]:
import lexical_diversity
from lexical_diversity import lex_div as ld
length=len(review6)
df=review6.copy()
empty_col=np.empty([length,1])
df=np.append(df,empty_col,1)
print(df.shape)

(608598, 3)


In [314]:
for i in range(length):
    text = df[i, 1]
    text=text[1:-1]
    text=text.replace("'","")
    text=text.split(",")
    text=[w.strip() for w in text]
    #TTR: the ratio of the total number of different words in a language sample to the total number of words in the sample
    TTR=ld.ttr(text)
    df[i,2]=TTR
df=np.delete(df,1,1)

In [315]:
print(df[0:2])

[[5044 0.9411764705882353]
 [5045 0.8695652173913043]]


In [316]:
savetxt("feature_lexical_diversity.txt",df,fmt="%s",delimiter="\t",encoding="utf-8")

In [317]:
file_lexical="C:/Users/Lu/feature_lexical_diversity.txt"
lex_diversity=pd.read_csv(file_lexical,sep="\t",header=None)
lex_diversity.columns=['user_id', 'lexical_diversity']
print(lex_diversity.head())

   user_id  lexical_diversity
0     5044           0.941176
1     5045           0.869565
2     5046           0.916667
3     5047           0.750000
4     5048           0.831169


### d) create typo ratio

In [322]:
file_review7="D:/Lu Yu/orig_review_with_labeling_608598rows_af_lemma.txt"
review7=pd.read_csv(file_review6,sep="\t",header=None)
review7.columns=['user_id', 'prod_id', 'date', 'review', 'rating', 'label','lemma']
df_typo=review7.copy()[['user_id','review']]
print(df_typo.head())

   user_id                                             review
0     5044  Drinks were bad, the hot chocolate was watered...
1     5045  This was the worst experience I've ever had a ...
2     5046  This is located on the site of the old Spruce ...
3     5047  I enjoyed coffee and breakfast twice at Toast ...
4     5048  I love Toast! The food choices are fantastic -...


In [323]:
%%time
from spellchecker import SpellChecker
import string
import nltk
import re
length=len(df_typo)
empty_col=np.empty([length,1])
df_typo=np.append(df_typo,empty_col,1)
spell = SpellChecker()

for i in range(len(df_typo)):
    # a.lower the text
    text=df_typo[i,1].lower()

    # b.contraction words
    text = text.replace("can't", "can not")
    text = text.replace("won't", "will not")
    text = text.replace("'ve"," have")
    text = text.replace("'d"," had")
    text = text.replace("'m", " am")
    text = text.replace("'ll", " will")
    text = text.replace("'s", " is")
    text = text.replace("n't", " not")
    text = text.replace("'re", " are")
    text = text.replace("st.", "street")
    text = text.replace("bldg.", "building") 
    
    # c.deal with punctuation such as ‘!”#$%&'()*+,-./:;?@[\]^_`{|}~’, and including "...","???"...
    text=re.sub(r"[^\w\s]", " ", text) 
    
    # d.remove numbers
    text = re.sub(r"\d+", "", text)
    
    text = text.split()

    len_word=len(text)
    if len_word==0:
        typo_ratio=0
    else:
        misspelled = list(spell.unknown(text))
        typo_ratio=len(misspelled)/len_word
    df_typo[i,2]=typo_ratio

df_typo=np.delete(df_typo,1,1)

Wall time: 2min 53s


In [324]:
print(df_typo[0:5])

[[5044 0.0]
 [5045 0.0]
 [5046 0.04]
 [5047 0.008547008547008548]
 [5048 0.0]]


In [325]:
savetxt("feature_typo_ratio.txt",df_typo,fmt="%s",delimiter="\t",encoding="utf-8")

In [326]:
file_typo="D:/Lu Yu/feature_typo_ratio.txt"
typo=pd.read_csv(file_typo,sep="\t",header=None)
typo.columns=['user_id', 'typo_ratio']
print(typo.head())

   user_id  typo_ratio
0     5044    0.000000
1     5045    0.000000
2     5046    0.040000
3     5047    0.008547
4     5048    0.000000


## e) TF-IDF

### a) baseline svm model without TF-IGF

In [327]:
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [329]:
file_review8="D:/Lu Yu/review_features_02.txt"
review8=pd.read_csv(file_review8,sep="\t",header=None)
review8.columns=['user_id', 'prod_id', 'rating', 'label','list_words','num_of_words',
                 'num_of_verbs','avg_word_len','emotiveness','num_positive','num_negative','sentiment']
print(review8.head())

   user_id  prod_id  rating  label  \
0     5044        0     1.0     -1   
1     5045        0     1.0     -1   
2     5046        0     3.0     -1   
3     5047        0     5.0     -1   
4     5048        0     5.0     -1   

                                          list_words  num_of_words  \
0  ['drink', 'bad', 'hot', 'chocolate', 'water', ...            17   
1  ['bad', 'experience', 'ever', 'casual', 'coffe...           115   
2  ['locate', 'site', 'old', 'spruce', 'street', ...            24   
3  ['enjoy', 'coffee', 'breakfast', 'twice', 'toa...           128   
4  ['love', 'toast', 'food', 'choice', 'fantastic...            77   

   num_of_verbs  avg_word_len  emotiveness  num_positive  num_negative  \
0             2      4.882353     0.416667             1             5   
1            17      5.452174     0.379747             4            12   
2             3      5.291667     0.600000             4             1   
3            15      5.664062     0.465116            

In [336]:
preprocessed_data=review8.copy()
preprocessed_data['lexical_diversity']=lex_diversity['lexical_diversity']
preprocessed_data['typo_ratio']=typo['typo_ratio']
print(preprocessed_data.head())
filtered_features=['rating','num_of_words', 'num_of_verbs','avg_word_len',
                   'emotiveness','num_positive','num_negative','sentiment',
                   'lexical_diversity','typo_ratio']

label_col='label'


   user_id  prod_id  rating  label  \
0     5044        0     1.0     -1   
1     5045        0     1.0     -1   
2     5046        0     3.0     -1   
3     5047        0     5.0     -1   
4     5048        0     5.0     -1   

                                          list_words  num_of_words  \
0  ['drink', 'bad', 'hot', 'chocolate', 'water', ...            17   
1  ['bad', 'experience', 'ever', 'casual', 'coffe...           115   
2  ['locate', 'site', 'old', 'spruce', 'street', ...            24   
3  ['enjoy', 'coffee', 'breakfast', 'twice', 'toa...           128   
4  ['love', 'toast', 'food', 'choice', 'fantastic...            77   

   num_of_verbs  avg_word_len  emotiveness  num_positive  num_negative  \
0             2      4.882353     0.416667             1             5   
1            17      5.452174     0.379747             4            12   
2             3      5.291667     0.600000             4             1   
3            15      5.664062     0.465116            

In [337]:
baseline_svm = LinearSVC()
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data[filtered_features],
                                                    preprocessed_data[label_col].values,
                                                    test_size=.3,
                                                    random_state=1)
resampler = SMOTE(random_state=24, k_neighbors=3)
print(len(X_train))
print(len(y_train))

426018
426018


In [341]:
%%time
svm_pipe = Pipeline([('scaler', scaler),
                     ('upsampler', resampler),
                     ('svc', baseline_svm)])
svm_pipe.fit(X_train, y_train)

Wall time: 4min




Pipeline(steps=[('scaler', StandardScaler()),
                ('upsampler', SMOTE(k_neighbors=3, random_state=24)),
                ('svc', LinearSVC())])

In [342]:
preds = svm_pipe.predict(X_test)

In [344]:
svm_pipe[2].coef_

array([[ 0.07793757,  0.29522536, -0.14246885, -0.01534178,  0.00939956,
         0.09379281, -0.08311056, -0.10143029, -0.00699425, -0.01839403]])

In [345]:
print(metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

          -1       0.18      0.66      0.28     24078
           1       0.91      0.53      0.67    158502

    accuracy                           0.55    182580
   macro avg       0.54      0.60      0.47    182580
weighted avg       0.81      0.55      0.62    182580



In [346]:
CM=confusion_matrix(y_test,preds)
print(CM)

[[15916  8162]
 [74439 84063]]
