In [315]:
import pandas as pd
import spacy
from tqdm import tqdm
import spacy
#! python -m pip install -U pycld2
import pycld2 as cld2

In [316]:
"""
Determines whether the sentence's language is English.
"""
def is_english(sentence):
    """Return True when cld2's top-ranked language for `sentence` is English.

    Args:
        sentence: Text to classify. Anything shorter than three characters
            is rejected without calling the detector.

    Returns:
        bool: True if the first field of the first entry in the details
        returned by ``cld2.detect`` equals ``"ENGLISH"``, otherwise False.
    """
    # Too little text to classify; treat it as non-English.
    if len(sentence) < 3:
        return False
    # Only the third element of the detection result (the details) is used.
    _, _, details = cld2.detect(sentence)
    return details[0][0] == "ENGLISH"

In [318]:
"""
Input: 
    Text file containing all the generated fake reviews from GPT2 fine tuned model
Output:
    Saves a 'fake_reviews.csv' in the current directory that labels every review as fake
    
Fake reviews are filtered to remove the trailing \n symbol. The GPT2 model also
emitted some lines containing only equal signs, which are removed as well.
"""
def create_df_from_txt(path, out_path="fake_reviews.csv"):
    """Build a labelled DataFrame of generated reviews from a text file.

    Each line of the file is treated as one review. Lines containing a run of
    equal signs ("======") and blank lines are skipped.

    Args:
        path: Path of the text file to read.
        out_path: Where the resulting CSV is written. Defaults to
            "fake_reviews.csv" in the current working directory.

    Returns:
        pandas.DataFrame: One row per kept line, with the text in the
        'Review' column and 'Real' set to 0 for every row.
    """
    separator = "======"
    review_list = []
    with open(path, 'r') as generated_reviews:
        for line in generated_reviews:
            # Skip the equal-sign separator lines.
            if separator in line:
                continue
            # Strip only the line terminator. Slicing with [:-1] would cut a
            # real character off a final line that has no trailing newline.
            review = line.rstrip("\n")
            # Blank lines carry no review text. dropna() would not remove them
            # because an empty string is not a missing value.
            if review:
                review_list.append(review)

    df = pd.DataFrame(review_list, columns=['Review'])
    df['Real'] = 0

    df.to_csv(out_path, index=False)
    print(f"Saved {out_path}")
    return df

In [319]:
# Location of the generated-review text file, relative to the working directory.
generate_reviews_path = "Data/gpt_2_gen_texts.txt"
# Parse the text file into a labelled DataFrame; the call also writes a CSV
# copy to disk and prints a confirmation.
d = create_df_from_txt(generate_reviews_path)

Saved fake_reviews.csv


In [320]:
# Load both corpora and discard rows with missing values.
# The fake reviews are read from "fake_reviews.csv" in the working directory,
# which is the file create_df_from_txt writes. Nothing in this notebook
# produces "Data/fake_reviews.csv", so reading from there would pick up a
# stale or missing file on a fresh run.
real_reviews = pd.read_csv("Data/real_reviews.csv").dropna()
fake_reviews = pd.read_csv("fake_reviews.csv").dropna()

In [321]:
# Peek at the first real review as a sanity check on the loaded data.
real_reviews["Review"].iloc[0]

'An appetizer, 2 HUGE dinners, and beer for 35 bucks.  Yes.  Lots to choose from on the menu.  Good service.  We will be back.'

In [305]:
# Load the spaCy "en_core_web_sm" pipeline once so that every call to
# get_lemmatized_words reuses it. The "tagger" and "parser" components are
# disabled.
# NOTE(review): depending on the spaCy version, lemmatization may rely on the
# tagger's POS annotations. Confirm the lemmas are as expected with "tagger"
# disabled.
nlp = spacy.load("en_core_web_sm",disable=["tagger", "parser"])
"""
Returns the lemma of every word in the given sentence; non-alphabetic tokens (e.g. punctuation) are removed.
"""
def get_lemmatized_words(text):
    """Return the lemmas of the alphabetic tokens in `text`.

    Args:
        text: String to run through the module-level `nlp` pipeline.

    Returns:
        list[str]: `token.lemma_` for each token, in order, keeping only
        lemmas for which `str.isalpha()` is true. Punctuation, numbers and
        other non-alphabetic tokens are therefore dropped.
    """
    return [token.lemma_ for token in nlp(text) if token.lemma_.isalpha()]

def preprocess(data):
    """Lemmatize every review in `data` and drop long non-English ones.

    Args:
        data: Object exposing a `Review` attribute with a `.values` sequence
            of review strings (the notebook passes pandas DataFrames).

    Returns:
        list[str]: The space-joined lemmas of each kept review. A review is
        kept when its lemmatized text is at most 40 characters long, or when
        it is longer and `is_english` accepts it.
    """
    cleaned = []
    for review in data.Review.values:
        lemmatized = " ".join(get_lemmatized_words(review))
        # Only texts longer than 40 characters are language-checked;
        # shorter ones are kept unconditionally.
        if len(lemmatized) <= 40 or is_english(lemmatized):
            cleaned.append(lemmatized)
    return cleaned

In [322]:
# Lemmatize and language-filter both corpora; each result is a list of
# cleaned review strings.
f = preprocess(fake_reviews)
r = preprocess(real_reviews)

In [329]:
# Wrap the preprocessed review lists in DataFrames and attach the class label:
# Real == 1 for genuine reviews, Real == 0 for generated ones.
real = pd.DataFrame(r, columns=['Review']).assign(Real=1)
fake = pd.DataFrame(f, columns=['Review']).assign(Real=0)

In [334]:
# Balance the classes 50/50 so the classifier is not biased toward either label.
# Both frames are cut to the size of the smaller one. Slicing only `real`
# would leave the set unbalanced, and fail the assertion, whenever there are
# fewer real reviews than fake ones.
n_per_class = min(len(real), len(fake))
real = real.iloc[:n_per_class]
fake = fake.iloc[:n_per_class]
assert real.shape == fake.shape, "real and fake review sets must be the same size"

In [335]:
# Stack the fake rows first, then the real rows, renumbering the index from 0.
merged_df = pd.concat([fake, real], ignore_index=True)
# Write the combined, labelled dataset to disk without the index column.
merged_df.to_csv("classifier_data.csv", index=False)