In [31]:
# This code was created to automatically parse online reviews
# for the Podium company. The code extracts topics of interest
# from the reviews, along with their sentiment.

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim.summarization import summarize, keywords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models.word2vec import Word2Vec
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as mp
from string import punctuation
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import random
import string
import nltk
import re

%matplotlib inline

# Read data from csv file
df = pd.read_csv('/Users/degravek/Insight/project/podium_code/reviews10000.csv', header=0)
df.rename(columns={'Rating': 'rating', 'Review Text': 'text', 'Location Id': 'location',
                    'Publish Date': 'date', 'Industry': 'industry'}, inplace=True)

# For speed purposes, we can cut the dataframe down
df = df[:10000]

# Drop rows with missing values
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

# Remove some punctuation when summarizing reviews
def process(text):
    result = text.replace('/', '').replace('\n', '')
    result = re.sub(r'\.+', '  ', result)
    result = re.sub(r'\!+', '  ', result)
    result = re.sub(r'(.)\1{2,}', r'\1', result)
    result = re.sub(r'\W+', ' ', result).strip()
    result = result + '.'
    return result

# Strip punctuation from the data
def strip_punctuation(text):
    result = ''.join(tmp for tmp in text if tmp not in punctuation)
    result = re.sub(' +',' ', result)
    result = result.lower().strip()
    return result

# Define the Porter stemmer in case we want to use it
porter = PorterStemmer()
def tokenizer_porter(text):
    result = [porter.stem(word) for word in text.split()]
    result = ' '.join(result)
    return result

# Define a function to remove stop words
stop = stopwords.words('english')
def rmstopwords(text):
    result = text.split()
    result = ' '.join(word for word in result if word not in stop)
    return result

# Define a function to lemmatize words
lem = WordNetLemmatizer()
def lemmatize(text):
    result = text.split()
    result = ' '.join(lem.lemmatize(word)for word in result if word not in stop)
    return result

# Define a function to break reviews into individual sentences
def tokenizetext(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [[sent] for sent in sentences]
    return sentences

# Define a function to find n-grams quickly
# If looking for unigrams, make sure they're nouns
def ngrams(text, n):
    result = []
    text = text.split()
    if n==1:
        text = nltk.pos_tag(text)
        result = [word for word, pos in text if pos[0] == 'N']
    else:
        for i in range(len(text)-n+1):
            result.append('_'.join(text[i:i+n]))
    return result

# Define a function to extract noun-phrase chunks of text
# This chunking pattern looks for an optional series of
# adjectives followed by one or more nouns
def extract_candidate_chunks(text, grammar = 'CHUNK: {<JJ.*>*<NN.*>+}'):
    import itertools, nltk, string
    parser = nltk.RegexpParser(grammar)
    tagged_sents = [nltk.pos_tag(nltk.word_tokenize(text))]

    for chunk in tagged_sents:
        if not chunk:
            candidates = []
        else:
            candidates = []
            tree = parser.parse(chunk)
            for subtree in tree.subtrees():
                if subtree.label() == 'CHUNK':
                    candidates.append(('_'.join([word for (word, tag) in subtree.leaves()])))
    candidates = [word for word in candidates if word not in stop]
    return candidates

# Define a function to sort the aspects
# by weighted sum of sentiment
def SortData(input_df, rfilter=None):
    if rfilter:
        input_df = input_df[input_df['rating'].isin(rfilter)].copy()

    input_df['counts'] = input_df.groupby(['aspects'])['sentiment'].transform('count')
    group1 = input_df.groupby(['aspects'])['sentiment'].sum()
    group2 = input_df.groupby(['aspects'])['counts'].mean()
    group3 = input_df.groupby(['aspects'])['sentiment'].mean()
    sorted_df = pd.DataFrame()
    sorted_df['counts']     = group2
    sorted_df['frac']       = np.round(100*(group2/group2.sum()), 2)
    sorted_df['sent_mean']  = np.round(group3, 2)
    sorted_df['importance'] = np.round(group1/(group2**0.1), 2)
    sorted_df = sorted_df.sort_values('importance', ascending=False)
    sorted_df.reset_index(level=0, inplace=True)
    return sorted_df

# Define function to summarize
# reviews about certain aspects
def SummarizeReviews(input_df, aspect_list, n_statements):
    # Try to summarize the aspects
    star_rating, summary = [], []
    for i, aspect in enumerate(aspect_list):
        rating = input_df.groupby('aspects')['sentiment'].mean().sort_values(ascending=False)[aspect]
        star_rating.append((rating - (-1))*(5 - 1)/(1 - (-1)) + 1)

        # Try to process the text a little bit
        corpus = pd.DataFrame()
        corpus['text'] = input_df[(input_df['aspects']==aspect)].sort_values('sentiment', ascending=False)['context']
        #corpus = corpus.head(num).append(corpus.tail(num))
        corpus = corpus.sample(n=n_statements)
        corpus = list(corpus['text'].apply(process))
        print('ASPECT: ', aspect)
        print('STAR: ', star_rating[i])
        print('SUMMARY: ', corpus)
        print('\n')
    return corpus

# Add the path to where RAKE was downloaded
import sys
rake_path = '/Users/degravek/Downloads/RAKE-tutorial-master/'
sys.path.insert(0, rake_path)

# RAKE will look for key phrases with
# at least four characters, composed
# of at most 3 words, appearing in the
# test at least one time
import rake, operator
rake_object = rake.Rake(rake_path + 'SmartStoplist.txt', 4, 3, 1)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
# Define a function to sort the aspects
# by weighted sum of sentiment
def SortData(input_df, rfilter=None):
    if rfilter:
        input_df = input_df[input_df['rating'].isin(rfilter)].copy()

    input_df['counts'] = input_df.groupby(['aspects'])['sentiment'].transform('count')
    group1 = input_df.groupby(['aspects'])['sentiment'].sum()
    group2 = input_df.groupby(['aspects'])['counts'].mean()
    group3 = input_df.groupby(['aspects'])['sentiment'].mean()
    sorted_df = pd.DataFrame()
    sorted_df['counts']     = group2
    sorted_df['frac']       = np.round(100*(group2/group2.sum()), 2)
    sorted_df['sent_mean']  = np.round(group3, 2)
    sorted_df['importance'] = np.round(group1/(group2**0.1), 2)
    sorted_df = sorted_df.sort_values('importance', ascending=False)
    sorted_df.reset_index(level=0, inplace=True)
    return sorted_df

# Define function to summarize
# reviews about certain aspects
def SummarizeReviews(input_df, aspect_list, n_statements):
    # Try to summarize the aspects
    star_rating, summary = [], []
    for i, aspect in enumerate(aspect_list):
        rating = input_df.groupby('aspects')['sentiment'].mean().sort_values(ascending=False)[aspect]
        star_rating.append((rating - (-1))*(5 - 1)/(1 - (-1)) + 1)

        # Try to process the text a little bit
        corpus = pd.DataFrame()
        corpus['text'] = input_df[(input_df['aspects']==aspect)].sort_values('sentiment', ascending=False)['context']
        #corpus = corpus.head(num).append(corpus.tail(num))
        corpus = corpus.sample(n=n_statements)
        corpus = list(corpus['text'].apply(process))
        print('ASPECT: ', aspect)
        print('STAR: ', star_rating[i])
        print('SUMMARY: ', corpus)
        print('\n')
    return corpus

# Add the path to where RAKE was downloaded
import sys
rake_path = '/Users/degravek/Downloads/RAKE-tutorial-master/'
sys.path.insert(0, rake_path)

# RAKE will look for key phrases with
# at least four characters, composed
# of at most 3 words, appearing in the
# test at least one time
import rake, operator
rake_object = rake.Rake(rake_path + 'SmartStoplist.txt', 4, 3, 1)

# Define a function to extract keywords from the reviews.
# This function breaks each review into sentences.
def ProcessReviews(df, ptype):
    parse_type = ptype

    # Divide reviews into individual sentences
    sentences = df['text'].apply(tokenizetext)

    # Stick the sentences back into the dataframe
    df['sentlist'] = sentences
    d1, d2, d3 = [], [], []
    d4, d5, d6 = [], [], []

    # Initialize the sentiment vader analyzer
    sid = SentimentIntensityAnalyzer()

    # Loop over sentences and process them
    for i in range(0, df.shape[0]):
        sent_list = df['sentlist'][i]
        for sentence in sent_list:
            sent_raw = ''.join(sentence)
            sent_pro = strip_punctuation(sent_raw)
            sent_pro = rmstopwords(sent_pro)
            sent_pro = lemmatize(sent_pro)
            sentiment = sid.polarity_scores(sent_raw)['compound']
            if parse_type[0] == 'ngram':
                pos = ngrams(sent_pro, ptype[1])
            elif parse_type == 'chunk':
                pos = extract_candidate_chunks(sent_pro)
            elif parse_type == 'rake':
                pos = rake_object.run(sent_raw)
                pos = ['_'.join(word[0].split()) for word in pos]
            for j in pos:
                d1.append(df['date'][i])
                d2.append(df['location'][i])
                d3.append(df['rating'][i])
                d4.append(j),
                d5.append(sentiment)
                d6.append(sent_raw)

    # Put everything in a dataframe
    processed_df = pd.DataFrame()
    processed_df['date']      = d1
    processed_df['location']  = d2
    processed_df['rating']    = d3
    processed_df['aspects']   = d4
    processed_df['sentiment'] = d5
    processed_df['context']   = d6

    # Remove any entry where the sentence
    # was determined to be neutral
    processed_df = processed_df[(processed_df['sentiment'] != 0)]
    return processed_df

In [34]:
punctuation = '!"#?$%&\'()’—*+,-./:;<=>@[\\]^_`{|}~'
def process_text(text):
    result = text.replace('\n', ' ').replace('\\n', ' ')                 # remove new
    result = re.sub(r'(.x\S+)', '', result)                              # remove hex
    result = re.sub('((www\S+)|(http\S+))', '', result)                  # remove links
    result = re.sub(r'[-+]?\d*[.,]\d+|\d+', 'number', result)            # numbers
    result = re.sub(r'(\w)(\1{2,})', r'\1', result)                      # white space
    result = ''.join(word for word in result if word not in punctuation) # punctuation
    result = re.sub(r' +', ' ', result).lower().strip()                  # lowercase
    return result

In [35]:
df.head()

Unnamed: 0,location,rating,text,date,industry
0,7684,5.0,Sales staff are very professional! Anne Sumle...,2017-01-14 12:00:00,automotive
1,7684,5.0,My experience at Jones PreOwned Superstore was...,2017-01-06 12:00:00,automotive
2,7684,5.0,I worked with Ashley from Internet sales & Joe...,2017-01-05 12:00:00,automotive
3,7684,5.0,Great Experience\nHad an overall great experie...,2016-12-29 0:00:00,automotive
4,7684,4.2,I've been a Jones customer for more than 13 ye...,2016-12-27 12:00:00,automotive


In [45]:
vectorizer = TfidfVectorizer(min_df=10, max_features=10000, ngram_range=(1, 2))
vz = vectorizer.fit_transform(list(df['text_pro']))

In [46]:
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [47]:
tfidf.sort_values(by=['tfidf'], ascending=True).tail(30)

Unnamed: 0,tfidf
specifically,7.645446
everytime,7.645446
far the,7.645446
and tell,7.645446
they knew,7.645446
for vehicle,7.645446
courtesy and,7.645446
their web,7.645446
the rear,7.645446
for something,7.645446


In [48]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(vz)

In [49]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 8461
[t-SNE] Computed conditional probabilities for sample 2000 / 8461
[t-SNE] Computed conditional probabilities for sample 3000 / 8461
[t-SNE] Computed conditional probabilities for sample 4000 / 8461
[t-SNE] Computed conditional probabilities for sample 5000 / 8461
[t-SNE] Computed conditional probabilities for sample 6000 / 8461
[t-SNE] Computed conditional probabilities for sample 7000 / 8461
[t-SNE] Computed conditional probabilities for sample 8000 / 8461
[t-SNE] Computed conditional probabilities for sample 8461 / 8461
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.199312
[t-SNE] Error after 300 iterations: 1.199312


In [50]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [51]:
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="tf-idf clustering of the news",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [52]:
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['text'] = df['text']

In [53]:
from bokeh.models import ColumnDataSource
plot_tfidf.scatter(x='x', y='y', source=ColumnDataSource(tfidf_df))
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"text": "@text"}
show(plot_tfidf)

In [37]:
df = df.loc[df['rating']<=3]

In [38]:
sentences = df['text'].apply(tokenizetext)

In [39]:
a = []
for t in sentences:
    for j in t:
        a.append(''.join(j))

In [40]:
dff = pd.DataFrame()
dff['text'] = a

In [41]:
dff.shape

(8461, 1)

In [42]:
df=dff.copy()

In [43]:
df['text_pro'] = df['text'].apply(process_text)

In [44]:
df.head()

Unnamed: 0,text,text_pro
0,I test drove truck and rear brakes where holdi...,i test drove truck and rear brakes where holdi...
1,We bought a used Passat from Jone's.,we bought a used passat from jones
2,"It was shipped in from a ""buddy's"" dealership ...",it was shipped in from a buddys dealership and...
3,When we picked it up it was night time and I d...,when we picked it up it was night time and i d...
4,I also had transmission issues and a few other...,i also had transmission issues and a few other...
