In [14]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.summarization import keywords, summarize
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
import csv
import multiprocessing
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

product_info = pd.read_csv('../data/CatfoodProductInfo.csv')
reviews = pd.read_csv('../data/CatfoodReviewsInfo.csv')
df = reviews.join(product_info.set_index('product'), on='product',how='left')

In [15]:
df = df.dropna(axis=0,how='any')
brandnames = set(df['brand'].unique())

In [16]:
df.head(5)

Unnamed: 0,product,review_author,rating,review_text,helpful_rank,brand,price,oz_per_can,num_cans,price_per_oz
0,Fancy Feast Grilled Seafood Feast Variety Pack...,Rosie,5,I try and make this last an entire month as my...,1,Fancy Feast,$14.79,3.0,24.0,0.205417
1,Fancy Feast Grilled Seafood Feast Variety Pack...,VETTEGIRL,5,"HE LIKES VARIETY EVERY DAY, SO WE ROTATE FROM ...",2,Fancy Feast,$14.79,3.0,24.0,0.205417
2,Fancy Feast Grilled Seafood Feast Variety Pack...,Lovethebabies,5,My two cats are trying the grilled seafood Fan...,2,Fancy Feast,$14.79,3.0,24.0,0.205417
3,Fancy Feast Grilled Seafood Feast Variety Pack...,Suzi,5,I have a 1 year old cat I got from the Humane ...,2,Fancy Feast,$14.79,3.0,24.0,0.205417
4,Fancy Feast Grilled Seafood Feast Variety Pack...,Nicki,5,The last shipment is exactly right. Minou doe...,0,Fancy Feast,$14.79,3.0,24.0,0.205417


In [17]:
# standardize text
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at")
    df[text_field] = df[text_field].str.lower()
    for brandname in brandnames:
        df[text_field] = df[text_field].str.replace(brandname.lower(),"")
        
    # keywordize into tokens
    try:
        kw = keywords(df[text_field].values[0],ratio=0.5,split=True)
        df[text_field] = kw
    except:
        df[text_field] = np.NaN
        
        
    return df

train_documents = []

test_documents = []
i = 0

# Label encode the names
le = preprocessing.LabelEncoder()
df['product_label']=le.fit_transform(df['product'])

# print(train_documents[0])

df_clean = standardize_text(df,'review_text')


In [18]:
df_clean

Unnamed: 0,product,review_author,rating,review_text,helpful_rank,brand,price,oz_per_can,num_cans,price_per_oz,product_label
0,Fancy Feast Grilled Seafood Feast Variety Pack...,Rosie,5,,1,Fancy Feast,$14.79,3.0,24.0,0.205417,123
1,Fancy Feast Grilled Seafood Feast Variety Pack...,VETTEGIRL,5,,2,Fancy Feast,$14.79,3.0,24.0,0.205417,123
2,Fancy Feast Grilled Seafood Feast Variety Pack...,Lovethebabies,5,,2,Fancy Feast,$14.79,3.0,24.0,0.205417,123
3,Fancy Feast Grilled Seafood Feast Variety Pack...,Suzi,5,,2,Fancy Feast,$14.79,3.0,24.0,0.205417,123
4,Fancy Feast Grilled Seafood Feast Variety Pack...,Nicki,5,,0,Fancy Feast,$14.79,3.0,24.0,0.205417,123
5,Fancy Feast Grilled Seafood Feast Variety Pack...,Eileen,3,,2,Fancy Feast,$14.79,3.0,24.0,0.205417,123
6,Fancy Feast Grilled Seafood Feast Variety Pack...,Vicki,5,,1,Fancy Feast,$14.79,3.0,24.0,0.205417,123
7,Fancy Feast Grilled Seafood Feast Variety Pack...,Josy,5,,1,Fancy Feast,$14.79,3.0,24.0,0.205417,123
8,Fancy Feast Grilled Seafood Feast Variety Pack...,JeebaTheChin215,1,,1,Fancy Feast,$14.79,3.0,24.0,0.205417,123
9,Fancy Feast Grilled Seafood Feast Variety Pack...,Hollybaby,4,,2,Fancy Feast,$14.79,3.0,24.0,0.205417,123
