## Import Libraries


In [223]:
import nltk
import os
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

## Confirm Directory

In [224]:
# Show the kernel's working directory; the relative path 'hw4data.xlsx' used
# by the data-loading cell is resolved against it.
os.getcwd()

'/Users/gml/Desktop'

## Read Data from an Excel Workbook into a Data Frame

In [225]:
# Load the labelled tweets; the first row of the sheet holds the column names.
source_workbook = 'hw4data.xlsx'
data = pd.read_excel(source_workbook, header=0)

# Quick look at the first rows to confirm the columns came through as expected.
print(data.head())

  Label_Sentiment                 Label_Reason   Label_Airline  \
0        positive                          NaN          United   
1        negative             Cancelled Flight  Virgin America   
2         neutral                          NaN      US Airways   
3        negative  Flight Attendant Complaints      US Airways   
4        negative  Flight Attendant Complaints          United   

                                                Text  
0  @united look at this beauty 😉 dc-10 united air...  
1  @VirginAmerica why Cancelled flight flight VX4...  
2  @USAirways  I'm on flight 623 from DIA to Onta...  
3  @USAirways  paid to upgrade to first class, we...  
4  @united Arriving at the airport 2 hours before...  


## Clean Data

In [226]:
# Rows with no recorded reason (NaN) get an explicit placeholder class so that
# 'Label_Reason' can be used as a classification target.
missing_reason_label = 'No reason (Positive\\Nuetral Tweet)'
data['Label_Reason'] = data['Label_Reason'].replace(np.nan, missing_reason_label)

# Clean-up rules for the tweet text, stored as parallel lists:
# issuesN[i] is replaced by fixesN[i].
#
# Group 1 is applied as regular expressions. Raw strings keep the backslashes
# literal ('\#' and '\S' are not valid Python string escapes and otherwise
# trigger an "invalid escape sequence" warning).
issues1 = [
    r'\#[0-9]+',          # '#' followed by digits
    r'http\S*',           # 'http' plus any trailing non-whitespace (links)
    r'flight[s]?\s?\d+',  # 'flight'/'flights' followed by a number
    r'[A-Z][0-9]+',       # one uppercase letter followed by digits
    r'\.\s',              # a period followed by one whitespace character
]
# NOTE(review): replacing '. ' with '' joins the words on either side of a
# sentence break; confirm that is intended rather than replacing with ' '.
fixes1 = ['', '', 'flight', '', '']

# Group 2 is applied as literal substrings (no regex); every match is deleted.
# These two lists were previously appended to without ever being created,
# which raises NameError when the notebook is run on a fresh kernel.
issues2 = ['&amp;', '-', ',', "'", '&gt;']
fixes2 = [''] * len(issues2)

In [227]:
# Start from the raw tweet text (fourth column of `data`) and apply both rule
# groups in order, each rule to the result of the previous one.
cleandata = pd.Series(data.iloc[:, 3])

# Group 1: regular-expression substitutions.
for pattern, replacement in zip(issues1, fixes1):
    cleandata = cleandata.str.replace(pattern, replacement, regex=True)

# Group 2: literal substring substitutions.
for literal, replacement in zip(issues2, fixes2):
    cleandata = cleandata.str.replace(literal, replacement, regex=False)

print(cleandata.head())

0    @united look at this beauty 😉 dc10 united airl...
1    @VirginAmerica why Cancelled flight flight V? ...
2    @USAirways  Im on flight from DIA to Ontario t...
3    @USAirways  paid to upgrade to first class wen...
4    @united Arriving at the airport 2 hours before...
Name: Text, dtype: object


## Separate out Examples

In [228]:
# One target array per classification task.
y1 = data["Label_Sentiment"].values
y2 = data["Label_Reason"].values
y3 = data["Label_Airline"].values

# Feature input: the cleaned tweet text.
x = cleandata.values

# Readable row label per tweet: its index plus its sentiment label.
RowNames = " tweet#: " + data.index.map(str) + '(' + data["Label_Sentiment"] + ')'

# The same test fraction and seed are used for every task, so all three calls
# partition the rows identically; only the target array differs.
split_options = {"test_size": 0.35, "random_state": 0}
x1_train, x1_test, y1_train, y1_test, idx1_train, idx1_test = train_test_split(
    x, y1, RowNames, **split_options)
x2_train, x2_test, y2_train, y2_test, idx2_train, idx2_test = train_test_split(
    x, y2, RowNames, **split_options)
x3_train, x3_test, y3_train, y3_test, idx3_train, idx3_test = train_test_split(
    x, y3, RowNames, **split_options)

## Create Overall Document Matrix From a Data Frame

In [229]:
# Bag-of-words vectoriser: tokens are runs of non-whitespace characters (so
# '@handles' and '#tags' survive), uni- to tri-grams, English stop words
# dropped, vocabulary capped at 500 features.
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_features=500,
    token_pattern=r'[^\s]+',
)

# NOTE(review): the vocabulary is fitted on every tweet, including the rows
# that are later held out for testing.
X = vectorizer.fit_transform(x)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() keeps ColumnNames a plain list of Python strings, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    ColumnNames = vectorizer.get_feature_names_out().tolist()
else:
    ColumnNames = vectorizer.get_feature_names()

In [230]:
# Dense term-document matrix for the whole corpus: one row per tweet, one
# column per vocabulary entry.
TDM = pd.DataFrame(X.toarray(), columns=ColumnNames, index=RowNames)

# Prepend the three label columns. Duplicates are allowed in case a vocabulary
# term has the same name as a label column.
for position, label_col in enumerate(("Label_Sentiment", "Label_Reason", "Label_Airline")):
    TDM.insert(position, label_col, list(data[label_col]), allow_duplicates=True)

print(TDM)

                        Label_Sentiment                        Label_Reason  \
 tweet#: 0(positive)           positive  No reason (Positive\Nuetral Tweet)   
 tweet#: 1(negative)           negative                    Cancelled Flight   
 tweet#: 2(neutral)             neutral  No reason (Positive\Nuetral Tweet)   
 tweet#: 3(negative)           negative         Flight Attendant Complaints   
 tweet#: 4(negative)           negative         Flight Attendant Complaints   
...                                 ...                                 ...   
 tweet#: 2830(neutral)          neutral  No reason (Positive\Nuetral Tweet)   
 tweet#: 2831(positive)        positive  No reason (Positive\Nuetral Tweet)   
 tweet#: 2832(negative)        negative                         Late Flight   
 tweet#: 2833(positive)        positive  No reason (Positive\Nuetral Tweet)   
 tweet#: 2834(negative)        negative              Customer Service Issue   

                          Label_Airline  #  #fail  

## Create Test and Train TDMs

In [231]:
# Training split encoded with the already-fitted vocabulary.
X_train = vectorizer.transform(x1_train)

# Labelled, dense view of the training matrix (for inspection only).
TDM1 = pd.DataFrame(X_train.toarray(), columns=ColumnNames, index=idx1_train)
train_labels = (
    ("Label_Sentiment", y1_train),
    ("Label_Reason", y2_train),
    ("Label_Airline", y3_train),
)
for position, (label_col, labels) in enumerate(train_labels):
    TDM1.insert(position, label_col, list(labels), allow_duplicates=True)

In [232]:
# Held-out split encoded with the already-fitted vocabulary.
X_test = vectorizer.transform(x1_test)

# Labelled, dense view of the test matrix (for inspection only).
TDM2 = pd.DataFrame(X_test.toarray(), columns=ColumnNames, index=idx1_test)
test_labels = (
    ("Label_Sentiment", y1_test),
    ("Label_Reason", y2_test),
    ("Label_Airline", y3_test),
)
for position, (label_col, labels) in enumerate(test_labels):
    TDM2.insert(position, label_col, list(labels), allow_duplicates=True)

## Naive Bayes Modeling Part 1

In [233]:
# Multinomial Naive Bayes classifier with default settings.
nb_clf = MultinomialNB()

In [234]:
# Rank vocabulary terms by their log-probability under the first class of each
# model (row 0 of feature_log_prob_; presumably the alphabetically first label,
# check `model.classes_` to confirm). The ranking is ascending, so [-20:] are
# the 20 most probable terms and [:20] the 20 least probable.
#
# Each task gets its own estimator. Previously one shared `nb_clf` was refitted
# three times, so `sentiment_model`, `reason_model` and `airline_model` were all
# the same object and only reflected the most recent fit.

print('Worst words for Sentiment Clustinering')
sentiment_model = MultinomialNB().fit(X_train, y1_train)
feature_ranks_sentiment = sorted(zip(sentiment_model.feature_log_prob_[0], ColumnNames))
vn_sentiment_features = feature_ranks_sentiment[-20:]
print(vn_sentiment_features)

print('\n')
print('Worst words for Reason Clustinering')
reason_model = MultinomialNB().fit(X_train, y2_train)
feature_ranks_reason = sorted(zip(reason_model.feature_log_prob_[0], ColumnNames))
vn_reason_features = feature_ranks_reason[-20:]
print(vn_reason_features)

print('\n')
print('Top words for Airline Clustinering')
airline_model = MultinomialNB().fit(X_train, y3_train)
# Read the airline model explicitly and keep its ranking under its own names,
# so the reason ranking above is not overwritten.
# NOTE(review): this slice takes the 20 *least* probable terms, unlike the two
# sections above; confirm that is what "Top words" is meant to show.
feature_ranks_airline = sorted(zip(airline_model.feature_log_prob_[0], ColumnNames))
vn_airline_features = feature_ranks_airline[:20]
print(vn_airline_features)

Worst words for Sentiment Clustinering
[(-5.003817232050475, 'hold'), (-5.003817232050475, 'hour'), (-4.984769037079779, 'flights'), (-4.966076904067627, 'late'), (-4.966076904067627, 'plane'), (-4.912009682797351, 'delayed'), (-4.894617940085483, '2'), (-4.877523506726182, '@jetblue'), (-4.877523506726182, 'time'), (-4.860716388409801, 'hours'), (-4.79617786727223, 'im'), (-4.750368331240936, 'customer'), (-4.664601509483511, 'service'), (-4.6509958574277315, 'just'), (-4.2365620793368075, '@southwestair'), (-4.2365620793368075, 'cancelled'), (-4.142876595259485, '@virginamerica'), (-3.8613107498251393, '@usairways'), (-3.244633933257032, 'flight'), (-2.386983039219926, '@united')]


Worst words for Reason Clustinering
[(-5.434813111136574, 'time'), (-5.434813111136574, 'use'), (-5.434813111136574, 'worst'), (-5.211669559822364, 'airline'), (-5.211669559822364, 'flights'), (-5.211669559822364, 'isnt'), (-5.211669559822364, 'let'), (-5.211669559822364, 'like'), (-5.211669559822364, 'se

In [235]:
# Sentiment task: fit a dedicated estimator once and reuse that single fit for
# both the accuracy score and the predictions. (The shared `nb_clf` was
# previously fitted twice here and is refitted by the other task cells.)
sentiment_model = MultinomialNB().fit(X_train, y1_train)
print(sentiment_model.score(X_test, y1_test))
y1_pred = sentiment_model.predict(X_test)

# Confusion matrix of true (first argument) vs. predicted labels, with rows
# and columns in the order given by `sentiment_labels`.
sentiment_labels = ['positive', 'neutral', 'negative']
cm = confusion_matrix(y1_test, y1_pred, labels=sentiment_labels)
print(cm)
print(*sentiment_labels)

0.7009063444108762
[[ 90  34  44]
 [ 19  84 107]
 [ 35  58 522]]


In [236]:
# Reason task: fit a dedicated estimator once and reuse that single fit for
# both the accuracy score and the predictions. (The shared `nb_clf` was
# previously fitted twice here and is refitted by the other task cells.)
reason_model = MultinomialNB().fit(X_train, y2_train)
print(reason_model.score(X_test, y2_test))
y2_pred = reason_model.predict(X_test)

# Label order = order of first appearance in the training targets. It is
# printed so the rows/columns of the matrix can be identified.
reason_labels = list(dict.fromkeys(y2_train))
cm = confusion_matrix(y2_test, y2_pred, labels=reason_labels)
print(cm)
print(reason_labels)

0.5287009063444109
[[ 95   4   3  49   1   5  14  13   0   3   0]
 [  8  25   0  20   0   1   4   3   0   1   0]
 [  6   1   5   9   1   1   4   4   0   0   0]
 [ 36   4   5 286   5   5  17  12   0   8   0]
 [  7   1   1  23   2   0   7   3   0   2   0]
 [  0   0   1   9   0  37   4   2   0   2   0]
 [ 14   3   3  18   2   2  56   4   0   0   0]
 [  7   1   0  46   1   0   6  11   0   0   0]
 [  3   1   2   0   0   3   2   0   0   0   0]
 [ 17   0   1  11   1   2   3   2   0   8   0]
 [  0   1   0   2   0   0   0   1   0   0   0]]


In [237]:
# Airline task: fit a dedicated estimator once and reuse that single fit for
# both the accuracy score and the predictions. (The shared `nb_clf` was
# previously fitted twice here and is refitted by the other task cells.)
airline_model = MultinomialNB().fit(X_train, y3_train)
print(airline_model.score(X_test, y3_test))
y3_pred = airline_model.predict(X_test)

# Label order = order of first appearance in the training targets. It is
# printed so the rows/columns of the matrix can be identified.
airline_labels = list(dict.fromkeys(y3_train))
cm = confusion_matrix(y3_test, y3_pred, labels=airline_labels)
print(cm)
print(airline_labels)

0.9798590130916415
[[522   1   2   0   2]
 [  1 128   0   0   2]
 [  4   0 162   0   0]
 [  1   1   1  67   1]
 [  3   0   1   0  94]]


## Naive Bayes with Cross-Validation and Pipeline

In [238]:
# Vectoriser settings shared by the three pipelines. Note they differ from the
# stand-alone `vectorizer` above: 250 features and the default token pattern.
vect_options = dict(stop_words='english', ngram_range=(1, 3), max_features=250)

# One independent vectoriser -> Naive Bayes pipeline per task.
sentiment_model_pipe = Pipeline([('vect', CountVectorizer(**vect_options)), ('nb', MultinomialNB())])
reason_model_pipe = Pipeline([('vect', CountVectorizer(**vect_options)), ('nb', MultinomialNB())])
airline_model_pipe = Pipeline([('vect', CountVectorizer(**vect_options)), ('nb', MultinomialNB())])

# 10-fold cross-validation on the raw text; print the mean fold score for
# sentiment, reason and airline, in that order.
for pipe, target in ((sentiment_model_pipe, y1), (reason_model_pipe, y2), (airline_model_pipe, y3)):
    scores = cross_val_score(pipe, x, target, cv=10)
    avg = sum(scores) / len(scores)
    print(avg)

0.713217844042745
0.5546342095024397
0.9901193685858216


## Second time without Twitter Tags

In [239]:
## Regular expression for the extra text removed in this second pass:
## '@' followed by any run of word characters (Twitter handles), deleted outright.
## A raw string is used because '\@' is not a valid Python string escape and
## otherwise triggers an "invalid escape sequence" warning.
issues3 = [r'\@\w*']
fixes3 = ['']

In [240]:
# Apply the second-pass regex rules on top of the already-cleaned text.
# `cleandata` is overwritten, so from here on it no longer contains the handles.
for pattern, replacement in zip(issues3, fixes3):
    cleandata = cleandata.str.replace(pattern, replacement, regex=True)

print(cleandata.head())

0     look at this beauty 😉 dc10 united airlines 😉 ...
1     why Cancelled flight flight V? one sec its de...
2      Im on flight from DIA to Ontario tomorrow mo...
3      paid to upgrade to first class went up to ad...
4     Arriving at the airport 2 hours before depart...
Name: Text, dtype: object


## Separate out Examples

In [241]:
# One target array per classification task.
y1 = data["Label_Sentiment"].values
y2 = data["Label_Reason"].values
y3 = data["Label_Airline"].values

# Feature input: the tweet text with Twitter handles now removed.
x = cleandata.values

# Readable row label per tweet: its index plus its sentiment label.
RowNames = " tweet#: " + data.index.map(str) + '(' + data["Label_Sentiment"] + ')'

# The same test fraction and seed are used for every task, so all three calls
# partition the rows identically; only the target array differs.
split_options = {"test_size": 0.35, "random_state": 0}
x1_train, x1_test, y1_train, y1_test, idx1_train, idx1_test = train_test_split(
    x, y1, RowNames, **split_options)
x2_train, x2_test, y2_train, y2_test, idx2_train, idx2_test = train_test_split(
    x, y2, RowNames, **split_options)
x3_train, x3_test, y3_train, y3_test, idx3_train, idx3_test = train_test_split(
    x, y3, RowNames, **split_options)

## Create Overall Document Matrix From a Data Frame

In [242]:
# Refit the existing vectoriser on the handle-free text; this replaces its
# vocabulary.
# NOTE(review): the vocabulary is fitted on every tweet, including the rows
# that are later held out for testing.
X = vectorizer.fit_transform(x)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() keeps ColumnNames a plain list of Python strings, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    ColumnNames = vectorizer.get_feature_names_out().tolist()
else:
    ColumnNames = vectorizer.get_feature_names()

# Dense term-document matrix for the whole corpus with the three label columns
# prepended (duplicate column names allowed).
TDM = pd.DataFrame(X.toarray(), columns=ColumnNames, index=RowNames)
TDM.insert(0, "Label_Sentiment", list(data["Label_Sentiment"]), True)
TDM.insert(1, "Label_Reason", list(data["Label_Reason"]), True)
TDM.insert(2, "Label_Airline", list(data["Label_Airline"]), True)
print(TDM.head())

                     Label_Sentiment                        Label_Reason  \
 tweet#: 0(positive)        positive  No reason (Positive\Nuetral Tweet)   
 tweet#: 1(negative)        negative                    Cancelled Flight   
 tweet#: 2(neutral)          neutral  No reason (Positive\Nuetral Tweet)   
 tweet#: 3(negative)        negative         Flight Attendant Complaints   
 tweet#: 4(negative)        negative         Flight Attendant Complaints   

                       Label_Airline  !  #  #fail  #united  #unitedairlines  \
 tweet#: 0(positive)          United  0  0      0        0                0   
 tweet#: 1(negative)  Virgin America  0  0      0        0                0   
 tweet#: 2(neutral)       US Airways  0  0      0        0                0   
 tweet#: 3(negative)      US Airways  0  0      0        0                0   
 tweet#: 4(negative)          United  0  0      0        0                0   

                      1  1.5  ...  yall  year  years  yes  yesterday

## Create Test and Train TDMs

In [243]:
# Training split encoded with the refitted (handle-free) vocabulary.
X_train = vectorizer.transform(x1_train)

# Labelled, dense view of the training matrix (for inspection only).
TDM1 = pd.DataFrame(X_train.toarray(), columns=ColumnNames, index=idx1_train)
train_labels = (
    ("Label_Sentiment", y1_train),
    ("Label_Reason", y2_train),
    ("Label_Airline", y3_train),
)
for position, (label_col, labels) in enumerate(train_labels):
    TDM1.insert(position, label_col, list(labels), allow_duplicates=True)

In [244]:
# Held-out split encoded with the refitted (handle-free) vocabulary.
X_test = vectorizer.transform(x1_test)

# Labelled, dense view of the test matrix (for inspection only).
TDM2 = pd.DataFrame(X_test.toarray(), columns=ColumnNames, index=idx1_test)
test_labels = (
    ("Label_Sentiment", y1_test),
    ("Label_Reason", y2_test),
    ("Label_Airline", y3_test),
)
for position, (label_col, labels) in enumerate(test_labels):
    TDM2.insert(position, label_col, list(labels), allow_duplicates=True)

## Naive Bayes Modeling Part 2

In [245]:
# Fresh Multinomial Naive Bayes classifier (default settings) for the second pass.
nb_clf = MultinomialNB()

In [246]:
# Rank vocabulary terms by their log-probability under the first class of each
# model (row 0 of feature_log_prob_; presumably the alphabetically first label,
# check `model.classes_` to confirm). The ranking is ascending, so [-20:] are
# the 20 most probable terms and [:20] the 20 least probable.
#
# Each task gets its own estimator. Previously one shared `nb_clf` was refitted
# three times, so `sentiment_model`, `reason_model` and `airline_model` were all
# the same object and only reflected the most recent fit.

print('Worst words for Sentiment Clustinering')
sentiment_model = MultinomialNB().fit(X_train, y1_train)
feature_ranks_sentiment = sorted(zip(sentiment_model.feature_log_prob_[0], ColumnNames))
vn_sentiment_features = feature_ranks_sentiment[-20:]
print(vn_sentiment_features)

print('\n')
print('Worst words for Reason Clustinering')
reason_model = MultinomialNB().fit(X_train, y2_train)
feature_ranks_reason = sorted(zip(reason_model.feature_log_prob_[0], ColumnNames))
vn_reason_features = feature_ranks_reason[-20:]
print(vn_reason_features)

print('\n')
print('Top words for Airline Clustinering')
airline_model = MultinomialNB().fit(X_train, y3_train)
# Keep the airline ranking under its own names so the reason ranking above is
# not overwritten.
# NOTE(review): this slice takes the 20 *least* probable terms, unlike the two
# sections above; confirm that is what "Top words" is meant to show.
feature_ranks_airline = sorted(zip(airline_model.feature_log_prob_[0], ColumnNames))
vn_airline_features = feature_ranks_airline[:20]
print(vn_airline_features)

Worst words for Sentiment Clustinering
[(-4.957815281854906, 'like'), (-4.936309076633942, 'help'), (-4.91525566743611, 'cancelled flightled'), (-4.894636380233374, 'flightled'), (-4.874433672915854, 'dont'), (-4.835212959762574, 'hold'), (-4.835212959762574, 'hour'), (-4.816164764791878, 'flights'), (-4.797472631779726, 'late'), (-4.797472631779726, 'plane'), (-4.7434054105094505, 'delayed'), (-4.726013667797582, '2'), (-4.708919234438281, 'time'), (-4.6921121161219, 'hours'), (-4.627573594984329, 'im'), (-4.581764058953035, 'customer'), (-4.49599723719561, 'service'), (-4.482391585139831, 'just'), (-4.067957807048907, 'cancelled'), (-3.076029660969131, 'flight')]


Worst words for Reason Clustinering
[(-5.351858133476067, 'night'), (-5.351858133476067, 'paid'), (-5.351858133476067, 'passengers'), (-5.351858133476067, 'really'), (-5.351858133476067, 'time'), (-5.351858133476067, 'use'), (-5.351858133476067, 'worst'), (-5.128714582161857, 'airline'), (-5.128714582161857, 'flights'), (-

In [247]:
# Sentiment task: fit a dedicated estimator once and reuse that single fit for
# both the accuracy score and the predictions. (The shared `nb_clf` was
# previously fitted twice here and is refitted by the other task cells.)
sentiment_model = MultinomialNB().fit(X_train, y1_train)
print(sentiment_model.score(X_test, y1_test))
y1_pred = sentiment_model.predict(X_test)

# Confusion matrix of true (first argument) vs. predicted labels, with rows
# and columns in the order given by `sentiment_labels`.
sentiment_labels = ['positive', 'neutral', 'negative']
cm = confusion_matrix(y1_test, y1_pred, labels=sentiment_labels)
print(cm)
print(*sentiment_labels)

0.6868076535750252
[[ 83  18  67]
 [ 12  71 127]
 [ 36  51 528]]
positive neutral negative


In [248]:
# Reason task: fit a dedicated estimator once and reuse that single fit for
# both the accuracy score and the predictions. (The shared `nb_clf` was
# previously fitted twice here and is refitted by the other task cells.)
reason_model = MultinomialNB().fit(X_train, y2_train)
print(reason_model.score(X_test, y2_test))
y2_pred = reason_model.predict(X_test)

# Label order = order of first appearance in the training targets. It is
# printed so the rows/columns of the matrix can be identified.
reason_labels = list(dict.fromkeys(y2_train))
cm = confusion_matrix(y2_test, y2_pred, labels=reason_labels)
print(cm)
print(reason_labels)

0.5347432024169184
[[ 95   3   4  48   2   7  12  13   0   3   0]
 [  7  26   0  20   0   1   5   1   0   1   1]
 [  7   1   6  12   1   1   3   0   0   0   0]
 [ 40   4   5 293   5   5  13   8   0   5   0]
 [  6   2   1  23   4   0   6   3   0   1   0]
 [  1   0   1  10   0  37   4   1   0   1   0]
 [ 12   3   3  22   2   2  55   2   0   1   0]
 [  5   1   0  47   3   0   7   8   0   1   0]
 [  3   1   2   0   0   3   2   0   0   0   0]
 [ 18   0   1  12   1   1   3   2   0   7   0]
 [  0   1   0   2   0   0   0   1   0   0   0]]
['Customer Service Issue', 'Lost Luggage', 'Flight Attendant Complaints', 'No reason (Positive\\Nuetral Tweet)', 'Bad Flight', 'Cancelled Flight', 'Late Flight', "Can't Tell", 'longlines', 'Flight Booking Problems', 'Damaged Luggage']


In [249]:
# Airline task: fit a dedicated estimator once and reuse that single fit for
# both the accuracy score and the predictions. (The shared `nb_clf` was
# previously fitted twice here and is refitted by the other task cells.)
airline_model = MultinomialNB().fit(X_train, y3_train)
print(airline_model.score(X_test, y3_test))
y3_pred = airline_model.predict(X_test)

# Label order = order of first appearance in the training targets. It is
# printed so the rows/columns of the matrix can be identified.
airline_labels = list(dict.fromkeys(y3_train))
cm = confusion_matrix(y3_test, y3_pred, labels=airline_labels)
print(cm)
print(airline_labels)

0.5236656596173213
[[413  30  58   6  20]
 [ 79  31   5   4  12]
 [ 99   4  53   3   7]
 [ 46   1  14   5   5]
 [ 49  14  16   1  18]]
['United', 'US Airways', 'Virgin America', 'JetBlue', 'Southwest']


## Naive Bayes with Cross-Validation and Pipeline

In [250]:
# Vectoriser settings shared by the three pipelines. Note they differ from the
# stand-alone `vectorizer`: 250 features and the default token pattern.
vect_options = dict(stop_words='english', ngram_range=(1, 3), max_features=250)

# One independent vectoriser -> Naive Bayes pipeline per task.
sentiment_model_pipe = Pipeline([('vect', CountVectorizer(**vect_options)), ('nb', MultinomialNB())])
reason_model_pipe = Pipeline([('vect', CountVectorizer(**vect_options)), ('nb', MultinomialNB())])
airline_model_pipe = Pipeline([('vect', CountVectorizer(**vect_options)), ('nb', MultinomialNB())])

# 10-fold cross-validation on the handle-free text; print the mean fold score
# for sentiment, reason and airline, in that order.
for pipe, target in ((sentiment_model_pipe, y1), (reason_model_pipe, y2), (airline_model_pipe, y3)):
    scores = cross_val_score(pipe, x, target, cv=10)
    avg = sum(scores) / len(scores)
    print(avg)

0.68323345129359
0.552893377393155
0.544597544297776
