In [31]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt
import re

In [32]:
# English stop words from NLTK's stopwords corpus; used later to filter tokens.
stopwords = nltk.corpus.stopwords.words("english")

# Read the review export, expose `stars_y` under the name `stars`, and drop
# every column the sentiment analysis does not use. What remains is
# name, user_id, text and stars.
# NOTE(review): `stars_x` / `stars_y` look like pandas merge suffixes
# (business rating vs. review rating) -- confirm that `stars_y` is the
# per-review rating.
data = (
    pd.read_csv('all_chains_cs_reviews.csv')
    .assign(stars=lambda frame: frame['stars_y'])
    .drop(
        columns=[
            'Unnamed: 0', 'business_id', 'city', 'address', 'state',
            'postal_code', 'latitude', 'longitude', 'attributes',
            'categories', 'hours', 'review_id', 'useful', 'funny', 'cool',
            'date', 'is_open', 'stars_x', 'stars_y', 'review_count',
        ]
    )
)
data.head()

Unnamed: 0,name,user_id,text,stars
0,Popeyes,cZA_G7kIkyIrR15EKXoVFw,Awful. Waited in line for 15-20 for a chicken...,1.0
1,Popeyes,zKMCLxQnAOXpHJIKMZCI_Q,This is my first time at the Hartsfield Airpor...,1.0
2,Popeyes,CpRBM-El-mqvbv93lYX5QA,They weren't ready for the return of the chick...,4.0
3,Popeyes,ui1vL68Ty9_aeKGtzJNSHg,I have being several times there but they neve...,1.0
4,Popeyes,Lk_clm7vulcrkvcBAyO6fg,I have been trying to get the spicy chicken sa...,1.0


In [33]:
# Function to clean the text
def clean(text):
    """Replace every run of non-letter characters with a single space.

    Only the ASCII letters A-Z and a-z are kept; digits, punctuation,
    whitespace and any other characters are collapsed into one space per run.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The text with each run of non-letters replaced by ' '.
    """
    non_letter_run = '[^A-Za-z]+'
    cleaned = re.sub(non_letter_run, ' ', text)
    return cleaned

# Clean the raw review text. The result goes into a new 'clean_reviews'
# column, so the original 'text' column is left untouched.
data['clean_reviews'] = data['text'].apply(clean)

In [34]:
# Tagging each word with the part of speech
# Maps the first letter of a tag returned by pos_tag to the matching WordNet
# part-of-speech constant. Tags starting with any other letter have no entry.
part_of_speech_dict = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}


def pos_tagger(input):
    """Tokenize and POS-tag a review, dropping English stop words.

    Parameters
    ----------
    input : str
        Cleaned review text. The parameter name shadows the ``input``
        builtin; it is kept so existing keyword callers continue to work.

    Returns
    -------
    list of tuple
        ``(word, wordnet_pos)`` pairs in token order. ``wordnet_pos`` is the
        value from ``part_of_speech_dict`` for the first letter of the tag,
        or ``None`` when that letter has no mapping. Words keep their
        original casing; only the stop-word comparison is lower-cased.
    """
    # `stopwords` is a list, so each `in` test on it is a linear scan.
    # Building a set once per call makes each per-token lookup constant time
    # while still reflecting the current contents of `stopwords`.
    stopword_set = set(stopwords)
    return [
        (word, part_of_speech_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(input))
        if word.lower() not in stopword_set
    ]

# Store, for every cleaned review, the list of (word, WordNet POS) pairs
# produced by pos_tagger (stop words already removed).
data['part_of_speech'] = data['clean_reviews'].apply(pos_tagger)

In [35]:
# Lemmatize the words so that they are in similar formats for analysis.
# A single WordNetLemmatizer instance is created here and reused by
# lemmatize() for every review.
lemmatizer = WordNetLemmatizer()
def lemmatize(part_of_speech_data):
    """Join the lemmas of a tagged review into one space-separated string.

    Parameters
    ----------
    part_of_speech_data : iterable of (str, str or None)
        ``(word, pos)`` pairs as produced by ``pos_tagger``. ``pos`` is a
        WordNet part-of-speech constant, or a falsy value when the tag had
        no WordNet mapping.

    Returns
    -------
    str
        The lemmas separated by single spaces, with no leading or trailing
        whitespace. An empty input yields ``""``.
    """
    # Words without a WordNet POS are passed through unchanged; the rest are
    # lemmatized with their POS. Joining once avoids the two stray leading
    # spaces and the repeated string concatenation of a manual accumulator.
    return " ".join(
        lemmatizer.lemmatize(word, pos=pos) if pos else word
        for word, pos in part_of_speech_data
    )

# Collapse each review's (word, POS) pairs into one string of lemmas.
data['lemma'] = data['part_of_speech'].apply(lemmatize)

In [36]:
# calculate the subjectivity of the review
def getSubjectivity(review):
    """Return TextBlob's subjectivity score for ``review``.

    Per TextBlob's documentation the score lies in [0.0, 1.0], where 0.0 is
    very objective and 1.0 is very subjective.
    """
    sentiment = TextBlob(review).sentiment
    return sentiment.subjectivity

# calculate the polarity of the review (results between -1 and 1)
def getPolarity(review):
    """Return TextBlob's polarity score for ``review``.

    The score is a float between -1 (most negative) and 1 (most positive).
    """
    sentiment = TextBlob(review).sentiment
    return sentiment.polarity

# analyze the polarity of the review (closer to -1 is more negative and closer to 1 is more positive)
def polarity_analysis(score, threshold=0.025):
    """Turn a polarity score into a 'Negative' / 'Neutral' / 'Positive' label.

    Parameters
    ----------
    score : float
        Polarity score, e.g. the value returned by ``getPolarity``.
    threshold : float, optional
        Half-width of the neutral band around zero. Scores strictly below
        ``-threshold`` are 'Negative', scores strictly above ``threshold``
        are 'Positive', and everything else is 'Neutral'. This includes
        scores exactly on either boundary, and NaN, for which both
        comparisons are false. Defaults to 0.025.

    Returns
    -------
    str
        One of 'Negative', 'Positive' or 'Neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, because the negative and positive
        regions would then overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score < -threshold:
        return 'Negative'
    if score > threshold:
        return 'Positive'
    return 'Neutral'

In [38]:
# Score every lemmatized review, then label the score as
# Negative / Neutral / Positive.
data['polarity'] = data['lemma'].apply(getPolarity)
data['analysis'] = data['polarity'].apply(polarity_analysis)

# Keep only the columns that are exported. `.copy()` makes the result an
# independent frame.
# NOTE: after this line `data` no longer has the 'lemma' column, so this cell
# cannot be re-run on its own; re-run the earlier cells first.
data = data[['name', 'stars', 'text', 'polarity', 'analysis']].copy()

# Write next to the input CSV (relative to the working directory) instead of a
# user-specific absolute path, which fails on any other machine.
# NOTE(review): the index is still written as the first column, so the file
# gains an 'Unnamed: 0' column when read back; pass index=False if no
# downstream reader depends on it.
data.to_csv('pos_and_neg_reviews.csv')

# Number of reviews per sentiment label.
tb_counts = data.analysis.value_counts()
print(data.head())
print(tb_counts)

      name  stars                                               text  \
0  Popeyes    1.0  Awful.  Waited in line for 15-20 for a chicken...   
1  Popeyes    1.0  This is my first time at the Hartsfield Airpor...   
2  Popeyes    4.0  They weren't ready for the return of the chick...   
3  Popeyes    1.0  I have being several times there but they neve...   
4  Popeyes    1.0  I have been trying to get the spicy chicken sa...   

                                               lemma  polarity  analysis  
0    Awful Waited line chicken sandwich biscuit g... -0.800000  Negative  
1    first time Hartsfield Airport idea service H... -0.061905  Negative  
2    ready return chicken stop take order restaur...  0.079940  Positive  
3    several time never chicken sandwich problem ...  0.079592  Positive  
4    try get spicy chicken sandwich last three mo... -0.166667  Negative  
      name  stars                                               text  \
0  Popeyes    1.0  Awful.  Waited in line for