# Clothing Reviews

In [1]:
#import nltk and everything else
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from string import punctuation

import pandas as pd

In [2]:
#find and read the file
filepath = "women_clothing_review.csv"
df = pd.read_csv(filepath, encoding="latin-1") #this file is encoded differently

df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
#we need the Sentiment Analyzer to do this
sid = SentimentIntensityAnalyzer()

## Cleaning Dataframe

In [4]:
#Here's all the empty reviews, we don't want these
df.loc[df["Review Text"].isnull() == True]

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
92,92,861,23,,,5,1,0,General Petite,Tops,Knits
93,93,1081,31,,,5,1,0,General,Dresses,Dresses
98,98,1133,50,,,5,1,0,General,Jackets,Outerwear
135,135,861,35,,,4,1,0,General Petite,Tops,Knits
142,142,1126,35,,,5,1,0,General,Jackets,Outerwear
...,...,...,...,...,...,...,...,...,...,...,...
23258,23258,862,35,,,4,1,0,General,Tops,Knits
23301,23301,862,52,,,5,1,0,General,Tops,Knits
23303,23303,823,46,,,5,1,0,General,Tops,Blouses
23470,23470,1104,39,,,5,1,0,General Petite,Dresses,Dresses


In [5]:
#so we take everything else
df2 = df.loc[df["Review Text"].isnull() == False]
df2.count()

Unnamed: 0                 22641
Clothing ID                22641
Age                        22641
Title                      19675
Review Text                22641
Rating                     22641
Recommended IND            22641
Positive Feedback Count    22641
Division Name              22628
Department Name            22628
Class Name                 22628
dtype: int64

In [6]:
#there are this many reviews
df2.shape

(22641, 11)

In [7]:
#just to be totally safe, we make sure everything is a string
df2["Review Text"] = df2["Review Text"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["Review Text"] = df2["Review Text"].astype(str)


## Sentiment Analysis

In [8]:
#this method converts a review, as a string, into a float representing the sentiment, -1 is bad, 1 is good, 0 is neutral
def reviewSentiment(review):
    
    #this is for reference later
    eng_stopwords = stopwords.words('english')
    
    #make the content of the review lowercase
    review = review.lower()
    
    #create a list of word tokens for that review
    tknz_review = word_tokenize(review)
    
    #punctuation aren't words; remove them
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
    
    #this new list will hold clean tokens, that is to say, not filler words
    clean_tokens = []
    
    #remove filler words, using eng_stopwords for reference, placing only useful words in the cleaned tokens
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    #create a new string made up of just the cleaned tokens
    clean_review = ' '.join(clean_tokens)
    
    #get the polarity scores dictionary
    sid_rev = sid.polarity_scores(clean_review)
    
    #get sentiment polarity from the "compound" key in the sid_rev dictionary
    rev_comp = sid_rev['compound']
    
    #return the sentiment value for the review
    return rev_comp

In [9]:
#apply the method to all the reviews in the dataframe
df2['Review Sentiment'] = df2['Review Text'].apply(reviewSentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Review Sentiment'] = df2['Review Text'].apply(reviewSentiment)


In [10]:
#the sentiment scores are on the far right
df2.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Review Sentiment
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,0.8991
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,0.971
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117
