# Processing Reviews
This notebook processes the reviews stored in `all_reviews.p`: each review is split into sentences, each sentence is tokenized, and only sentences containing a word whose lemma appears in the flagged or replacement word lists are kept. The final output is written to `master_df.p`.

In [1]:
import pandas as pd
# Never truncate cell contents when displaying frames. `None` is the supported
# "no limit" value; the legacy -1 sentinel has been deprecated since pandas 1.0
# and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
import os
import pickle as pkl
from tqdm import tqdm_notebook as tqdm

import nltk
#nltk.download('wordnet')
import random
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk import tokenize

import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Points to data directory (relative path); every file this notebook reads or
# writes is located via os.path.join(path, <filename>).
path = "../data/"

## Read in Reviews

In [3]:
# NOTE(security): unpickling can execute arbitrary code, so only load
# all_reviews.p when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(os.path.join(path, 'all_reviews.p'), 'rb') as reviews_file:
    all_reviews = pkl.load(reviews_file)

# Drop the first element (presumably a header/placeholder entry -- TODO confirm).
all_reviews = all_reviews[1:]
print(len(all_reviews))

111092


## Read in Replacement Words 
(Reviews that contain replacement words are no longer used in the current model implementation, but this step is kept here for possible future use.)

In [5]:
# Load the replacement-word table (first CSV column is the index) and pull the
# `rep_minus_flag` column out as a plain Python list.
replacement_words_single_no_overlap = pd.read_csv(
    os.path.join(path, 'replacements_minus_flag.csv'),
    index_col=0,
)
rwsno = replacement_words_single_no_overlap['rep_minus_flag'].tolist()

## Read in Flagged Words

In [4]:
# NOTE(security): unpickling can execute arbitrary code, so only load
# flagged_words_single.p when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(os.path.join(path, 'flagged_words_single.p'), 'rb') as flagged_file:
    flagged_words_single = pkl.load(flagged_file)

## Lemmatize Flagged and Replacement Word Lists

In [11]:
# Lower-case each word *before* lemmatizing so the word lists are normalized in
# the same order as the review tokens (which are lower-cased first and then
# lemmatized when the dataframe is generated). Lemmatizing a capitalised entry
# first may leave its inflection unreduced, in which case it would never match
# a review lemma.
replacement_words_lemmatized = [wordnet_lemmatizer.lemmatize(w.lower()) for w in rwsno]
flagged_words_lemmatized = [wordnet_lemmatizer.lemmatize(w.lower()) for w in flagged_words_single]

In [12]:
# Collapse duplicate lemmas: several inflected forms can share one lemma.
flagged_words_set = {*flagged_words_lemmatized}
replacement_words_set = {*replacement_words_lemmatized}

## Generate Dataframe

In [13]:
# Lower-case the lemmas so they compare equal to the lower-cased review tokens.
# Set comprehensions keep these as real sets: lower-casing cannot reintroduce
# duplicates, and the per-token `in` checks during dataframe generation stay
# constant-time instead of scanning a list.
flagged_words_set = {word.lower() for word in flagged_words_set}
replacement_words_set = {word.lower() for word in replacement_words_set}

In [None]:
# Build one row per (sentence, matching token) pair:
#   review        -- the lower-cased token list of the sentence
#   flagged_word  -- the matching token as it appears in the sentence (lower-cased)
#   flagged_index -- position of that token within the sentence
#   problematic   -- 1 if the token's lemma is a flagged word, 0 if it is a
#                    replacement word (flagged takes precedence)
columns = ['review','flagged_word','flagged_index','problematic']

# Counters kept from the original cell; nothing here updates them.
flagged_count = 0
replacement_count = 0

# Set copies make every membership test O(1) regardless of whether the word
# collections arrive as lists or sets.
flagged_lookup = set(flagged_words_set)
replacement_lookup = set(replacement_words_set)

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end. Appending to a DataFrame row by row copies the whole frame each time
# (quadratic), and DataFrame.append no longer exists in pandas >= 2.0.
records = []

for review in tqdm(all_reviews):
    # Skip non-text entries (e.g. float NaN placeholders for missing reviews);
    # the sentence tokenizer needs a string.
    if not isinstance(review, str):
        continue

    for sentence in tokenize.sent_tokenize(review):
        # Lower-case first, then lemmatize the lower-cased tokens.
        tokens = [w.lower() for w in tokenize.word_tokenize(sentence)]
        lemmas = [wordnet_lemmatizer.lemmatize(w) for w in tokens]

        for j, lemma in enumerate(lemmas):
            if lemma in flagged_lookup:
                problematic = 1
            elif lemma in replacement_lookup:
                problematic = 0
            else:
                continue
            records.append({
                'review': tokens,
                'flagged_word': tokens[j],
                'flagged_index': j,
                'problematic': problematic,
            })

# Passing `columns` fixes the column order and still yields the expected
# (empty) frame when no sentence matched. Because the frame is built in one
# shot, pandas infers integer dtypes for flagged_index / problematic.
df = pd.DataFrame(records, columns=columns)

In [17]:
# Persist the sentence-level DataFrame as master_df.p in the data directory.
# The context manager closes the file even if pickling raises part-way through.
with open(os.path.join(path, 'master_df.p'), 'wb') as output:
    pkl.dump(df, output)

In [18]:
# Length of every entry in the `review` column, one value per dataframe row.
len_reviews = list(map(len, df["review"]))

## Determining Optimal Sentence Length Cut-off

In [19]:
# Largest value in len_reviews -- an upper bound for any length cut-off.
np.max(len_reviews)

174

In [21]:
# Histogram of the values in len_reviews, drawn on an explicit Axes so the
# figure carries a title and axis labels and can be read on its own.
fig, ax = plt.subplots()
ax.set(
    title="Distribution of sentence lengths",
    xlabel="Sentence length (len_reviews)",
    ylabel="Number of sentences",
)
# Keep the hist() call as the cell's last expression so the bin counts and bin
# edges are still shown as the cell output.
ax.hist(len_reviews, bins=10)

(array([1.14031e+05, 2.27360e+04, 2.11900e+03, 3.48000e+02, 7.00000e+01,
        3.80000e+01, 1.00000e+01, 0.00000e+00, 7.00000e+00, 3.00000e+00]),
 array([  1. ,  18.3,  35.6,  52.9,  70.2,  87.5, 104.8, 122.1, 139.4,
        156.7, 174. ]),
 <a list of 10 Patch objects>)