## Sentiment Analysis for Whispr

1. import data from google sheets
2. clean dataset and create synthetic variables
3. summarize dataset: how many records per category, reviews over time
4. evaluate sentiment of review, give confidence interval
5. calculate summary insights: average sentiment / subjectivity per item, reviews per item
6. compare against manual evaluation
7. export data to google sheets

In [59]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import gspread
import pygsheets
from oauth2client.service_account import ServiceAccountCredentials
from matplotlib import pyplot as plt
import seaborn as sns
import string

import nltk
from nltk import pos_tag_sents, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords, words

%matplotlib inline
sns.set_style('darkgrid')
pd.options.display.max_rows = 100

### 1a. Import data from GS using GSpread
- connect to google sheets API
- create spreadsheet and worksheet objects, explore GSpread library
- create dataframe of reviews

In [152]:
#1 define the scope of your access tokens
scope = ['https://www.googleapis.com/auth/drive','https://spreadsheets.google.com/feeds']

#2 after getting oauth2 credentials in a json, obtain an access token from google authorization server
#by creating serviceaccountcredentials and indicating scope, which controls resources / operations that an
#access token permits
creds = ServiceAccountCredentials.from_json_keyfile_name('client_secret.json', scope)

#3 log into the google API using oauth2 credentials
#returns gspread.Client instance
c = gspread.authorize(creds)

spreadsheet = c.open('UK Sentiment')
worksheet = spreadsheet.worksheet('WHotel_Sentiment')
records = worksheet.get_all_records()
df = pd.DataFrame(records)
df = df[['Contents','Sentiment','Topic','Location','Comment']]

### 1b. Import data from GS using pygsheets

In [187]:
#authorization in one step - read client_secret
gc = pygsheets.authorize(service_file='client_secret.json')
spreadsheet2 = gc.open('UK Sentiment')

#clean up workbook 
for item in spreadsheet2.worksheets():
    title = item.title
    if item.title not in ['UK_Reviews','WHotel_Sentiment','WHOTELS_analyzed']:
        worksheet2 = spreadsheet2.worksheet_by_title(str(item.title))
        spreadsheet2.del_worksheet(worksheet2)
        print('{} sheet deleted'.format(item.title))
        
worksheet2 = spreadsheet2.worksheet_by_title('WHotel_Sentiment')
records2 = worksheet2.get_all_records()
df2 = pd.DataFrame(records2)
df2 = df2[['Contents','Sentiment','Topic','Location','Comment']]

### 2. Simple sentiment analysis

In [63]:
#baseline sentiment analysis - use textblob polarity, compare accuracy
df['Sentiment_Category'] = df['Sentiment'].map({1: 'Positive',2:'Neutral',3:'Negative'})

def pos_neg(polarity):
    if polarity >= 0.1:
        return 'Positive'
    if polarity >= 0 and polarity < 0.1:
        return 'Neutral'
    else:
        return 'Negative'

df['Polarity'] = [TextBlob(x).polarity for x in df['Contents']]
df['Subjectivity'] = [TextBlob(x).subjectivity for x in df['Contents']]
df['Textblob_Score'] = df['Polarity'].apply(pos_neg)

df.groupby(['Sentiment_Category','Textblob_Score'])['Polarity'].agg({'mean':np.mean, 'count':len})

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  app.launch_new_instance()


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Sentiment_Category,Textblob_Score,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,Negative,-0.229419,11.0
Negative,Neutral,0.003046,72.0
Negative,Positive,0.379133,55.0
Neutral,Neutral,0.028125,1.0
Positive,Negative,-0.4,1.0
Positive,Neutral,0.001145,14.0
Positive,Positive,0.425419,20.0


### 3. KNN Sentiment Analysis

In [64]:
#create three checks: stopwords, punctuation, english
mystop = stopwords.words('english')
punctuation = string.punctuation
englishwords = [x.lower() for x in words.words()]

#lemmatize words in comments
wnl = WordNetLemmatizer()
myblob = TextBlob(str(df['Contents'].values.tolist())).tokenize()
lemmatized = [wnl.lemmatize(x).lower() for x in myblob]

#create list of lemmatized words
finalwords = [word for word in lemmatized if word not in punctuation and word not in mystop and word in englishwords]

#for lemmatized words, create counts and polarity scores
counts = {x: finalwords.count(x) for x in finalwords}
word_df = pd.DataFrame(counts.items(), columns = ['word','count']).sort_values('count', ascending = False)
word_df['polarity'] = word_df['word'].apply(lambda x: TextBlob(x).polarity)
positives = word_df[word_df['polarity']>0].sort_values(['count','polarity'], ascending = False)
negatives = word_df[word_df['polarity']<0].sort_values(['count','polarity'], ascending = False)

In [65]:
toptenpos=positives.nlargest(20, columns='count')
toptenneg=negatives.nlargest(20, columns='count')

In [151]:
#lemmatize sentence
#tokenize a sentence, tag it with its pos tags
#for specific letters, convert to wn pos
#lemmatize each word according to its pos
#return a sentence of the lemmatized words
#use this to convert each of the records in 'contents'
#count the frequency of lemmatized words in contents
#evaluate polarity
#count frequency of top 10 lemmatized words in contents

lemmatizer = WordNetLemmatizer()
def nltk2wn(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None        
    
def lemmatize_sent(sentence):
    nltk_tagged = pos_tag([x.lower() for x in nltk.word_tokenize(sentence)])
    converted_tags = [(x[0], nltk2wn(x[1])) for x in nltk_tagged]
    lemmatized_sent = []
    for x in converted_tags:
        if x[1] is None:
            lemmatized_sent.append(x[0])
        else:
            lemmatized_sent.append(lemmatizer.lemmatize(x[0], pos = x[1]))                     
    final_sentence = ' '.join(lemmatized_sent)
    return final_sentence 

sentence = "And then the night comes #bali balitrio #seminyak #baliseminyak #islandofgod #seminyakbali #wbali #woobar #tanskin #beachtravellers #beachlover #beach #beachbabes #beaches #beachvacay #beachlovers #beachwaves #bikini #bikinigirls #wanderlust #wandersoul #woundedsoul #asianhotties #eurasianhotties #eurasianbabes #eurasianhotties #indonesian #indonesia #indonesiangirl #asiangirls"
lemmatize_sent(sentence)


'and then the night come # bali balitrio # seminyak # baliseminyak # islandofgod # seminyakbali # wbali # woobar # tanskin # beachtravellers # beachlover # beach # beachbabes # beach # beachvacay # beachlovers # beachwaves # bikini # bikinigirls # wanderlust # wandersoul # woundedsoul # asianhotties # eurasianhotties # eurasianbabes # eurasianhotties # indonesian # indonesia # indonesiangirl # asiangirls'

In [130]:
spreadsheet.del_worksheet(analysis)
spreadsheet.add_worksheet('WHOTELS_analyzed', rows = 200, cols = 10)
analysis = spreadsheet.worksheet('WHOTELS_analyzed')



In [58]:
from nltk import pos_tag_sents
from nltk.tokenize import word_tokenize, sent_tokenize

text = "The goal was to best the competition. His latest song was a personal best. Hence, he received the best song of the year award. He played best after a couple of martinis."
text_sentence_tokens = sent_tokenize(text)
#print(text_sentence_tokens)

text_word_tokens = []
for sentence_token in text_sentence_tokens:
    text_word_tokens.append(word_tokenize(sentence_token))
#print(text_word_tokens)

text_tagged = pos_tag_sents(text_word_tokens)
print(text_tagged[1])

[('His', 'PRP$'), ('latest', 'JJS'), ('song', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('personal', 'JJ'), ('best', 'NN'), ('.', '.')]
