In [18]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.probability import FreqDist
from nltk.corpus import stopwords


from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier



import warnings
warnings.filterwarnings("ignore") 


pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)


In [2]:
# Load the previously saved table from data/clean_df.csv, using the first CSV
# column as the row index.
clean_df = pd.read_csv("data/clean_df.csv", index_col = 0)

In [3]:
# Preview the first 10 rows to check the columns and the raw text fields.
clean_df.head(10)

Unnamed: 0,item_name,price,overall_rating,num_rating,rev_title,review_text,section,target
0,Wrangler Men's Unlined Shirt Jacket,12.0,4.0,7.0,['Texas winter ready!'],['I bought this last year and am going to buy ...,men's clothing,0
1,Wrangler Men's Relaxed Fit Jeans,12.0,4.5,5840.0,"['Wrangler is the best.', 'My Favorite Jeans',...","['I like wrangler, the pants is exactly for my...",men's clothing,0
2,Wrangler Big Men's Relaxed Fit Jean,15.0,4.3,3257.0,"['Jeans That Fit Just Right', 'Great jeans wou...","[""If you like Big Men's style wearing jeans th...",men's clothing,0
3,Wrangler Men's 5 Star Regular Fit Jean with Flex,12.0,4.5,259.0,"['Committed buyer', 'More Comfortable than Swe...",['My husband loves these jeans and goes throug...,men's clothing,0
4,Wrangler Men's and Big Men's 5 Star Relaxed Fi...,15.0,4.6,1490.0,['Saved money and faster delivery that Amazon'...,['Fit well Great price same jeans I was buying...,men's clothing,0
5,Champion Men's Powerblend Fleece Pullover Hoodie,22.0,4.6,1247.0,"['Perfect!', 'Comfy Time', '2020 tie dye trend...",['They were exactly what I was searching for! ...,men's clothing,1
6,Fruit of the Loom Men's Core Waffle Thermal Top,5.0,3.3,9.0,"['Love them', 'Love them - definitely a classi...",['These are very cozy! Body pair for me and my...,men's clothing,1
7,Champion Men's Powerblend Graphic Crew,22.5,4.7,377.0,"['Great sweater and price!', 'A good purchase'...",['I bought 2 of these for $19 each were anothe...,men's clothing,0
8,Free Assembly Men's Two-Pocket Flannel Shirt,18.0,4.9,22.0,"['Washes well. Soft. Good value.', 'Super su...",['My husband also likes this shirt. I washed a...,men's clothing,1
9,Lee Men’s Big & Tall Regular Fit Jeans,24.9,4.4,913.0,"['Great jeans', 'From 501s to Lee Jeans. Best ...",['I have bought Lee jeans for several years. T...,men's clothing,0


In [4]:
# Inspect one raw rev_title value (row label 4): it is a single string that
# contains a list literal of titles, not an actual Python list.
clean_df.rev_title[4]

'[\'Saved money and faster delivery that Amazon\', \'Very comfortable jeans\', \'Great fit - not too baggy, not too skinny!\', \'Very durable\', \'Pockets are ripped on everyone about eight months\', "Wrangler men\'s relaxed fit jeans", \'Tight in thigh, lighter color in front not solid\', \'Ripped within the first week.\', \'Poor Quality\', \'Mis-manufactured on purpose...STUPID\', \'Bad quality control.\', \'Wrangler jeans. Still the best.\', \'Not What I Wanted\', \'NOT A FAN\', \'Solid. Replaced all my old jeans with these.\', \'These are ok\', \'Good fit\', \'Practical and comfortable at a great price.\', "Wrangler doesn\'t miss", \'How they fit\']'

In [17]:

# Scratch check on one raw rev_title string: replace every character that is not
# an ASCII letter or digit with a space, then collapse runs of whitespace into
# single spaces.
x = '[\'Saved money and faster delivery that Amazon\', \'Very comfortable jeans\', \'Great fit - not too baggy, not too skinny!\', \'Very durable\', \'Pockets are ripped on everyone about eight months\', "Wrangler men\'s relaxed fit jeans", \'Tight in thigh, lighter color in front not solid\', \'Ripped within the first week.\', \'Poor Quality\', \'Mis-manufactured on purpose...STUPID\', \'Bad quality control.\', \'Wrangler jeans. Still the best.\', \'Not What I Wanted\', \'NOT A FAN\', \'Solid. Replaced all my old jeans with these.\', \'These are ok\', \'Good fit\', \'Practical and comfortable at a great price.\', "Wrangler doesn\'t miss", \'How they fit\']'
' '.join(re.sub("([^0-9A-Za-z])"," ",x).split())

'Saved money and faster delivery that Amazon Very comfortable jeans Great fit not too baggy not too skinny Very durable Pockets are ripped on everyone about eight months Wrangler men s relaxed fit jeans Tight in thigh lighter color in front not solid Ripped within the first week Poor Quality Mis manufactured on purpose STUPID Bad quality control Wrangler jeans Still the best Not What I Wanted NOT A FAN Solid Replaced all my old jeans with these These are ok Good fit Practical and comfortable at a great price Wrangler doesn t miss How they fit'

In [6]:
## all functions live in the scripts folder
def clean_text(text):
    """Normalize a raw review string to lowercase alphanumeric words.

    Parameters
    ----------
    text : str
        Raw text, e.g. the string form of a list of review titles.

    Returns
    -------
    str
        ``text`` with every literal backslash-n sequence and every character
        other than an ASCII letter or digit replaced by a space, runs of
        whitespace collapsed to single spaces, and the result lower-cased.
    """
    # str.replace returns a new string, so the result has to be kept. Without
    # the assignment the two-character sequence backslash + "n" survives, the
    # regex below strips only the backslash, and a stray "n" is left glued to
    # the following word.
    text = text.replace("\\n", " ")
    # Anything that is not 0-9 / A-Z / a-z becomes a space; split/join then
    # collapses the resulting whitespace runs and trims both ends.
    text = ' '.join(re.sub("([^0-9A-Za-z])", " ", text).split())
    return text.lower()

In [7]:
# Normalize every review-title string with clean_text, overwriting the column.
clean_df["rev_title"] = clean_df["rev_title"].apply(clean_text)

### Now cleaning the `review_text` column

In [8]:
# Apply the same clean_text normalization to the review bodies, overwriting the column.
clean_df["review_text"] = clean_df["review_text"].apply(clean_text)

In [10]:
# Spot-check one cleaned review_text value (row label 8).
clean_df.review_text[8]

'my husband also likes this shirt i washed and put in dryer before wearing came out almost free of wrinkles no shrinking that he noticed nice soft plaid need more colors the material is super soft and comfortable and the shirt has a good weight to it if i compared it to paper this is the card stock quality got for my husband and he loves it and wears it all the time now great fit soft well made he loves the blue these shirts are a very good quality fits great happily surprised by the high quality of the fabric and make of this shirt soft perfect fit and nice weight perfect for fall and winter super comfy high quality looks great will be ordering more of these shirts great quality fit and design nice quality thick and warm what i thought was a light weight flannel shirt for florida weather i received a thick and heavy flannel jacket a better description is necessary ordered a couple of free assembly flannels and am really pleased with what i received for the price point the design sense

In [11]:
# Preview the frame after both text columns have been cleaned.
clean_df.head(6)

Unnamed: 0,item_name,price,overall_rating,num_rating,rev_title,review_text,section,target
0,Wrangler Men's Unlined Shirt Jacket,12.0,4.0,7.0,texas winter ready,i bought this last year and am going to buy an...,men's clothing,0
1,Wrangler Men's Relaxed Fit Jeans,12.0,4.5,5840.0,wrangler is the best my favorite jeans extreme...,i like wrangler the pants is exactly for my bo...,men's clothing,0
2,Wrangler Big Men's Relaxed Fit Jean,15.0,4.3,3257.0,jeans that fit just right great jeans would bu...,if you like big men s style wearing jeans thes...,men's clothing,0
3,Wrangler Men's 5 Star Regular Fit Jean with Flex,12.0,4.5,259.0,committed buyer more comfortable than sweatpan...,my husband loves these jeans and goes through ...,men's clothing,0
4,Wrangler Men's and Big Men's 5 Star Relaxed Fi...,15.0,4.6,1490.0,saved money and faster delivery that amazon ve...,fit well great price same jeans i was buying o...,men's clothing,0
5,Champion Men's Powerblend Fleece Pullover Hoodie,22.0,4.6,1247.0,perfect comfy time 2020 tie dye trend online o...,they were exactly what i was searching for nic...,men's clothing,1


In [None]:
def lemmatize_text(text):
    """Tokenize ``text`` and return the lemma of each token as a list.

    Depends on two objects named ``tokenizer`` (providing ``tokenize``) and
    ``lemmatizer`` (providing ``lemmatize``) being present in the notebook
    namespace. Neither is created in the cells shown here, so both must be
    defined before this function is called.
    """
    lemmas = []
    for token in tokenizer.tokenize(text):
        # Drop any spaces inside the token before looking up its lemma.
        squeezed = token.replace(" ", "")
        lemmas.append(lemmatizer.lemmatize(squeezed))
    return lemmas

In [12]:
# Persist the frame with the cleaned text columns. The index is written as the
# first CSV column (pandas default), so reload it with index_col=0.
# NOTE(review): this writes to the working directory, whereas the input was read
# from data/ — confirm the intended output location.
clean_df.to_csv("full_clean_df.csv")