# Workflow for Analyzing Common Words in Negative Store Reviews
### Created by Eric Nutt

In [5]:
# Import necessary packages
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.collocations import *

In [47]:
stop_word = stopwords.words('english')
stop_word_upper = []
for word in stop_word:
    stop_word_upper.append(word.upper())

## Define function for reading Google Maps store reviews .csv into a simple pandas dataframe.

In [48]:
# Define function for importing csv to pandas dataframe

def import_review_csv(csv_path):
    """Imports a csv of store reviews, creates a dataframe, adds year column and indexes on it

    Paramaters
    ----------
    file_path : string
        Path to file

    Returns
    -------
    returned_data : dataframe
        Dataframe created from loaded csv
    """
    # Read csv as dataframe
    df = pd.read_csv(csv_path)
    # Convert date coloumn to datetime
    df['review_datetime_utc'] = pd.to_datetime(df['review_datetime_utc'])
    # Remove timestamp from Date and store it in a new column
    df['Date'] = df['review_datetime_utc'].dt.date
    # Set the index to the DATE column
    df2 = df.set_index('Date')
    # Keep only author_title, review_text, review_rating, review_datetime_utc
    df3 = df2[['author_title', 'review_text', 'review_rating']]
    # Drop rows with NaN values (not comment left with rating)
    df4 = df3.dropna(axis=0, how='any', thresh=None,
                     subset=None, inplace=False)
    # Convert index to datetime
    df4.index = pd.to_datetime(df4.index)
    # Parse dates for year index
    df5 = df4.sort_index()
    return df5

### Apply workflow to Dripping Springs HEB store.

In [49]:
# Define path to csv reviews for HEB Dripping Springs
heb_dstx_path = 'heb_dstx_reviews.csv'

heb_dpsp_tx_df = import_review_csv(heb_dstx_path)
heb_dpsp_tx_df.head()

Unnamed: 0_level_0,author_title,review_text,review_rating
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-10-07,Helen Gilliam,"I will give you the good news first, I have be...",2
2016-08-20,MaryClare Porter,I am very disapointed at the customer service ...,2
2017-05-09,michele Peel,FYI...I live in Wimberley but do most of my bu...,1
2017-05-30,Rick Lose,Too small and crowded,1
2017-06-29,billy smith,Really busy,1


### Apply workflow to Waxahachie HEB.

In [50]:
# Define path to csv reviews for HEB Dripping Springs
'''
heb_wax_path = 'heb_waxtx_reviews.csv'

heb_wax_tx_df = import_review_csv(heb_wax_path)
heb_wax_tx_df.head()
'''

"\nheb_wax_path = 'heb_waxtx_reviews.csv'\n\nheb_wax_tx_df = import_review_csv(heb_wax_path)\nheb_wax_tx_df.head()\n"

## Define function for generating common words list.
This function will generate a list of the most common words (ommiting stopwords) with a word count. I've applied this function to the dataframes for two HEB stores: Dripping Springs and Waxahachie.

In [51]:
# Write function to get most common words without stop words

# Define function for getting most common meaningful words
def most_common_words(store_df):
    """Reads review text column and gets most common words without stopwords

    Paramaters
    ----------
    store_df : pandas dataframe
        store dataframe

    Returns
    -------
    returned_data : list
        list of most common words in review text
    """
    # Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
    store_df['review_without_stopwords'] = store_df['review_text'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop_word) and word not in (stop_word_upper)]))
    # Get most common words from review text
    common_words = Counter(" ".join(
        store_df["review_without_stopwords"].str.lower()).split()).most_common(100)
    return common_words

In [52]:
# Test function on Dripping Springs df
heb_dpsp_df = most_common_words(heb_dpsp_tx_df)

In [53]:
# Test function on Waxahachie df
# most_common_words(heb_wax_tx_df)

### Summary & Conclusions
The workflow above reads a .csv of google maps store reviews into a dataframe with the columns: date of review, review author, review text, and review rating (stars). A function is then applied to that new dataframe to get the most common words (ommiting stopwords) for a more efficient analysis of the review text.


#### Dripping Springs, HEB
"Rude" shows up eight times in 46 reviews. One of HEB's most important values is "Heart". At HEB, people matter and are at the heart of every decision made. I've experienced exemplary customer service whenever I go into an HEB, and I know that instances of employees being rude to customers is the exception not the norm. However, I believe any and all customer concerns should be given their due diligence, so one might look further into these reviews. 

Furthermore, "Diane" shows up five times. Without speaking to Diane, I am unable to comment on the situation. That being said, their name showing up five times in one and two star reviews isn't necessarily a good thing. Perhaps reviewing the importance of HEB's values with Diane would be a step in the right direction.


#### Waxahachie, HEB
"Cake/Cakes/Cup" shows up a combined 34 times in 97 reviews. A one-off negative review of the bakery department could be chalked up to the employee or customer having a bad day, and not be indicative of a persistant concern. However, 34 times seems especially high, therefore this could, and probably should, be looked further into. 

In [74]:
# Tokenize reviews (natural language to numerical value)
# Library
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# isolate reviews
def isolate_reviews(store_df):
    # Exclude stopwords
    store_df['review_without_stopwords'] = store_df['review_text'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop_word) and word not in (stop_word_upper)]))
    return store_df

def tokenize_review(review):
    review_sentences = sent_tokenize(review)
    ps = PorterStemmer() # reduce words to their root
    review_stemmed = []
    for word in review_sentences:
        review_stemmed.append(ps.stem(word))
    review_tokenized = []
    for word in review_stemmed:
        review_tokenized.append(word_tokenize(word))
    return review_tokenized

heb_dpsp_reviews = isolate_reviews(heb_dpsp_tx_df)

for review in heb_dpsp_reviews["review_without_stopwords"]:
    token_review = tokenize_review(review)
    print(token_review)
    print("")

[['give', 'good', 'news', 'first', ',', 'delighted', 'heb', 'indripping', 'springs', '.'], ['in', 'last', 'month', 'six', 'weeks', 'seen', 'steady', 'decline', 'produce', 'offered', '.'], ['it', 'appaling', 'went', 'yesterday', '10/6/15', '.'], ['short', 'supplies', ',', 'old', 'wrinkled', 'veggies', ',', 'chaos', 'trying', 'reah', 'wanted', '.'], ['every', 'week', 'go', 'buy', 'apples', ',', 'usually', 'six', 'different', 'variety', 'week', 'last', 'four', 'weeks', 'least', 'half', 'apples', 'bad', 'cut', 'them', '.'], ['they', 'cheap', 'consider', 'expensive', 'deer', 'food', '.'], ['you', 'stopped', 'carrying', 'taylor', 'farms', 'bag', 'salads', 'put', 'nowhere', 'near', 'good', '.'], ['thought', 'let', 'know', 'also', 'let', 'know', 'many', 'others', 'unhappy', '.'], ['we', 'always', 'bragged', 'heb', 'much', 'brag', 'now', '.'], ['another', 'thing', 'noticed', 'well', 'two', 'months', 'can', 'not', 'get', 'party', 'size', 'fritos', ',', 'go', 'walmart', 'those', ',', 'stop', 'com