In [1]:
""" Just a bunch of packages """
import pandas as pd
import numpy as np
import string
from time import time
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import random


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's built-in "fivethirtyeight" style sheet to subsequent plots
plt.style.use("fivethirtyeight")

In [2]:
"""
In this Aggie Invent we only need the text data (the `description` column), so columns like NhtsaID, Make, Model, etc. are not needed here.
"""

# Raise the column display width so pandas does not truncate long text columns.
# The fully qualified option name is used: the abbreviated 'max_colwidth' form
# relies on pattern matching and can break if pandas adds a similarly named option.
pd.set_option('display.max_colwidth', 32000)
# Complaint records (one row per complaint, free text in `description`)
df = pd.read_excel("HondaComplaints.xlsx")
# Sentiment word dictionary (word -> score)
sw = pd.read_excel("afinn_sentiment_words.xlsx")

In [6]:
# Preview the first rows of the complaints dataframe
df.head()

Unnamed: 0,description,NhtsaID,Make,Model,Year,State,abs,cruise,crash,mph,mileage
0,"CONSUMER STATES FIRESTONE TIRE (NO SIZE) EXPERIENCED A LEAK IN THE SIDEWALL ON THE RIGHT FRONT TIRE, CONSUMER IS ANGRY DUE TO GETTING A RUN AROUND FROM HONDA DEALER AND FIRESTONE DEALER, CONSUMER ENDED UP HAVING TO BUY THE TIRE.",560001,HONDA,CIVIC,2001,CA,N,N,N,31,85064.0
1,THE VEHICLE EXPERIENCES EXCESSIVE VIBRATION OF THE FRONT END WHILE DRIVING 50-70 MPH.,561194,HONDA,ACCORD,2001,CA,N,N,N,70,87186.0
2,"CONSUMER IS NOT HAPPY WITH THE DEALER DURING RECALL REPAIR, DEALER REFUSED RENTAL CAR, WHILE VEHICLE WAS BEING SERVICED.",562006,HONDA,CIVIC,2001,WV,N,N,N,31,81110.0
3,"CONSUMER IS NOT HAPPY WITH THE DEALER DURING RECALL REPAIR, DEALER REFUSED A RENTAL CAR, WHILE VEHICLE IS BEING REPAIRED.",562066,HONDA,CIVIC,2001,WV,N,N,N,31,81110.0
4,"CONSUMER STATES WHEN HE PROCEEDED TO BACK UP, THE VEHICLE ACCELERATED RAPIDLY CAUSING CONSUMER TO BACK INTO A CEMENT POLE, AFTER CRASHING, CONSUMER PUT VEHICLE INTO DRIVE AND IT ACCELERATED RAPIDLY, WHICH ALS CAUSED HIM TO REAR END A PARKED VEHICLE.",562091,HONDA,ACCORD,2001,PA,N,N,N,24,96792.0


In [5]:
# Preview the first rows of the sentiment-word dictionary
sw.head()

Unnamed: 0,sentiment,score
0,abandon,-2.0
1,abandoned,-2.0
2,abandons,-2.0
3,abducted,-2.0
4,abduction,-2.0


In [7]:
""" 
*Function Design*

"""
def my_analyzer(s):
    """Turn one raw document string into a list of normalized terms.

    Pipeline: clean the text, tokenize, map synonyms, drop stop words,
    punctuation, one-character tokens and numbers, then lemmatize each term
    with its WordNet part of speech (falling back to Snowball stemming when
    the POS tag has no WordNet equivalent).

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized / stemmed terms in document order.
    """
    # Synonym map. Keys are one or two tokens; values may be several words.
    syns = {'veh': 'vehicle', 'car': 'vehicle', 'chev': 'cheverolet',
            'chevy': 'cheverolet', 'air bag': 'airbag',
            'seat belt': 'seatbelt', "n't": 'not', 'to30': 'to 30',
            'wont': 'would not', 'cant': 'can not', 'cannot': 'can not',
            'couldnt': 'could not', 'shouldnt': 'should not',
            'wouldnt': 'would not', 'straightforward': 'straight forward'}

    # Lower-case and normalize separators / "not" contractions. This is the
    # same cleaning my_preprocessor performs, so reuse it instead of copying it.
    s = my_preprocessor(s)

    # Tokenize and drop quote tokens, masked ('*') tokens and a few literal
    # words (presumably residue of a printed pandas Series -- TODO confirm).
    dropped = {"''", "``", 'description', 'dtype', 'object', "'s"}
    tokens = [word for word in word_tokenize(s)
              if '*' not in word and word not in dropped]

    # Map synonyms. Two-token keys such as 'air bag' are matched against
    # adjacent token pairs (a per-token lookup can never hit them), and
    # multi-word replacements are split back into separate tokens so the
    # stop-word and numeric filters below see real words.
    mapped = []
    i = 0
    while i < len(tokens):
        pair = ' '.join(tokens[i:i + 2])
        if i + 1 < len(tokens) and pair in syns:
            mapped.extend(syns[pair].split())
            i += 2
        else:
            mapped.extend(syns.get(tokens[i], tokens[i]).split())
            i += 1

    # Stop words: NLTK English list + punctuation + pronouns + corpus-specific
    # terms. Every entry in `others` needs its own comma: adjacent string
    # literals are silently concatenated ("vehicle" "dealer" -> "vehicledealer").
    punctuation = list(string.punctuation) + ['..', '...']
    pronouns = ['i', 'he', 'she', 'it', 'him', 'they', 'we', 'us', 'them']
    others = ["'d", "co", "ed", "put", "say", "get", "can", "become",
              "los", "sta", "la", "use", "iii", "else", "honda", "vehicle",
              "dealer", "problem"]
    # A set gives constant-time membership tests in the filter below.
    stop = set(stopwords.words('english') + punctuation + pronouns + others)

    # Keep terms that are not stop words, are longer than one character and
    # are not numbers (allowing one '.' or up to two "'" inside the digits).
    filtered_terms = [word for word in mapped
                      if word not in stop
                      and len(word) > 1
                      and not word.replace('.', '', 1).isnumeric()
                      and not word.replace("'", '', 2).isnumeric()]

    # Lemmatization needs a part of speech, so tag the terms first.
    tagged_words = pos_tag(filtered_terms, lang='eng')
    stemmer = SnowballStemmer("english")
    wnl = WordNetLemmatizer()
    # First letter of the Penn Treebank tag -> WordNet POS constant
    wn_tags = {'N': wn.NOUN, 'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}

    stemmed_tokens = []
    for term, tag in tagged_words:
        wn_pos = wn_tags.get(tag[:1])
        if wn_pos is None:
            # No WordNet POS for this tag: fall back to stemming.
            stemmed_tokens.append(stemmer.stem(term))
        else:
            # Only the missing-POS case falls back; a lemmatizer failure
            # (e.g. WordNet corpus not installed) is no longer silently
            # hidden behind a bare `except`.
            stemmed_tokens.append(wnl.lemmatize(term, pos=wn_pos))
    return stemmed_tokens

def my_preprocessor(s):
    """Lower-case `s` and normalize separators and "not" contractions.

    Hyphens and underscores become spaces, each comma becomes ". ", and the
    contraction fragments "'nt" and "n't" become " not".

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    # (old, new) pairs, applied in exactly this order
    substitutions = (
        ('-', ' '),
        ('_', ' '),
        (',', '. '),
        ("'nt", " not"),
        ("n't", " not"),
    )
    cleaned = s.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [9]:
""" Sentiment Analysis """
print("\n**** Sentiment Analysis ****")

# Build the word -> score lookup from the first two columns of `sw`.
# Column-wise positional access avoids integer lookups on a label-indexed row
# Series; as with a row-by-row fill, a repeated word keeps its last score.
sentiment_dic = dict(zip(sw.iloc[:, 0], sw.iloc[:, 1]))

# Print the first 11 entries as a quick sanity check
for k, v in list(sentiment_dic.items())[:11]:
    print(k, v)

""" Create Term-Frequency Matrix for Sentiment Analysis """
cv = CountVectorizer(max_df=1.0, min_df=1, max_features=None,
                     preprocessor=my_preprocessor, ngram_range=(1, 2))
tf = cv.fit_transform(df['description'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and keep `s_terms` a plain list either way.
if hasattr(cv, "get_feature_names_out"):
    s_terms = list(cv.get_feature_names_out())
else:
    s_terms = cv.get_feature_names()
n_reviews, n_terms = tf.shape
print('{:.<22s}{:>6d}'.format("Number of Reviews", n_reviews))
print('{:.<22s}{:>6d}'.format("Number of Terms", n_terms))

""" Calculate Sentiment for each review """
# Start the running extremes at +5 / -5 so the first qualifying review
# replaces them.
min_sentiment = +5
max_sentiment = -5
avg_sentiment = 0
# Row index of the most negative / most positive review. These replace the
# former `min` / `max` variables, which shadowed the Python builtins for the
# rest of the kernel session.
# NOTE(review): any later cell that read `min` / `max` as row indices should
# use `min_idx` / `max_idx` (equal to min_list[0] / max_list[0]) instead.
min_idx, max_idx = 0, 0
min_list, max_list = [], []
sentiment_score = [0]*n_reviews
for i in range(n_reviews):
    n_sw = 0  # number of sentiment-word occurrences in review i
    # Column indices of the terms present in review i (empty -> loop skipped)
    for t in tf[i].nonzero()[1]:
        score = sentiment_dic.get(s_terms[t])
        if score is not None:
            count = tf[i, t]
            sentiment_score[i] += score * count
            n_sw += count
    # Review score = count-weighted mean score of its sentiment words
    if n_sw > 0:
        sentiment_score[i] = sentiment_score[i]/n_sw
    # Only reviews with 4 or more sentiment words compete for the extremes
    if n_sw > 3:
        if sentiment_score[i] == max_sentiment:
            max_list.append(i)
        elif sentiment_score[i] > max_sentiment:
            max_sentiment = sentiment_score[i]
            max_idx = i
            max_list = [i]
        if sentiment_score[i] == min_sentiment:
            min_list.append(i)
        elif sentiment_score[i] < min_sentiment:
            min_sentiment = sentiment_score[i]
            min_idx = i
            min_list = [i]
    # Reviews without sentiment words contribute 0 to the corpus average
    avg_sentiment += sentiment_score[i]
avg_sentiment = avg_sentiment/n_reviews
print("\nCorpus Average Sentiment:{:>5.2f} ".format(avg_sentiment))
print("\nMost Negative Reviews with 4 or more Sentiment Words:")
for idx in min_list:
    print("{:<s}{:>5d}{:<s}{:>5.2f}".format("   Description", idx,
          " Sentiment is ", min_sentiment))
print("\nMost Positive Reviews with 4 or more Sentiment Words:")
for idx in max_list:
    # Label spelling fixed ("Descrption") and aligned with the negative list
    print("{:<s}{:>5d}{:<s}{:>5.2f}".format("   Description", idx,
          " Sentiment is ", max_sentiment))


**** Sentiment Analysis ****
abandon -2.0
abandoned -2.0
abandons -2.0
abducted -2.0
abduction -2.0
abductions -2.0
abhor -3.0
abhorred -3.0
abhorrent -3.0
abhors -3.0
abilities 2.0
Number of Reviews.....  5330
Number of Terms.......132050

Corpus Average Sentiment:-1.08 

Most Negative Reviews with 4 or more Sentiment Words:
   Description  878 Sentiment is -2.75
   Description 1231 Sentiment is -2.75
   Description 2065 Sentiment is -2.75
   Description 2778 Sentiment is -2.75
   Description 3901 Sentiment is -2.75
   Description 4360 Sentiment is -2.75

Most Positive Reviews with 4 or more Sentiment Words:
    Descrption 4588 Sentiment is  1.89
