In [11]:
import pandas as pd
from textblob import TextBlob

In [12]:
# Location of the raw review CSV that the rest of the notebook loads.
url = (
    'https://raw.githubusercontent.com/grantaguinaldo/final-project/master/data/McClean.csv'
)

In [13]:
# Read the review dataset from `url`, decoding the bytes as latin-1.
df = pd.read_csv(filepath_or_buffer=url, encoding='latin-1')

In [14]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,cleaned_review,rude
0,im not a huge mcds lover but ive been to bette...,1
1,terrible customer service came in at pm and ...,1
2,first they lost my order actually they gave it...,0
3,i see im not the only one giving star only be...,0
4,well its mcdonalds so you know what the food i...,1


In [15]:
# Add empty-string placeholder columns for the features derived later on.
for new_col in ('trim_text', 'review_len', 'sentiment_polarity', 'sentiment_objectivity'):
    df[new_col] = ''

In [16]:
# Confirm the placeholder columns were appended (they are still empty here).
df.head(n=5)

Unnamed: 0,cleaned_review,rude,trim_text,review_len,sentiment_polarity,sentiment_objectivity
0,im not a huge mcds lover but ive been to bette...,1,,,,
1,terrible customer service came in at pm and ...,1,,,,
2,first they lost my order actually they gave it...,0,,,,
3,i see im not the only one giving star only be...,0,,,,
4,well its mcdonalds so you know what the food i...,1,,,,


In [19]:
# Derive per-review features: whitespace-normalised text, word count and
# TextBlob sentiment scores. Rows that cannot be processed (for example a
# missing / non-string review) are collected in `except_list` rather than
# aborting the whole pass.
except_list = []
for index, row in df.iterrows():
    try:
        # Work from a local value. The `row` handed back by iterrows() can be a
        # copy of the data, so values just written into `df` are not visible
        # through it; reading row['trim_text'] here would return the ''
        # placeholder on a fresh run and every feature would be computed on
        # an empty string.
        trimmed = " ".join(row['cleaned_review'].split())
        # Analyse once and reuse the result for both scores.
        sentiment = TextBlob(trimmed).sentiment
    except Exception:
        # Best-effort: keep the offending row for later inspection and move on.
        # (`Exception`, not a bare except, so Ctrl-C still interrupts the loop.)
        except_list.append(row)
    else:
        # Write all four features together so a failed row is never left
        # half-populated. `.at` replaces DataFrame.set_value, which was
        # deprecated and later removed from pandas.
        df.at[index, 'trim_text'] = trimmed
        # split() without an argument reports 0 words for an empty review;
        # for non-empty normalised text it matches split(' ').
        df.at[index, 'review_len'] = len(trimmed.split())
        df.at[index, 'sentiment_polarity'] = sentiment.polarity
        # NOTE: the column is named "objectivity" but holds TextBlob's
        # *subjectivity* score; the name is kept for compatibility.
        df.at[index, 'sentiment_objectivity'] = sentiment.subjectivity

In [20]:
# Inspect the first five rows now that the feature columns are filled in.
df.head(n=5)

Unnamed: 0,cleaned_review,rude,trim_text,review_len,sentiment_polarity,sentiment_objectivity
0,im not a huge mcds lover but ive been to bette...,1,im not a huge mcds lover but ive been to bette...,50,-0.328571,0.828571
1,terrible customer service came in at pm and ...,1,terrible customer service came in at pm and st...,80,-0.257143,0.521429
2,first they lost my order actually they gave it...,0,first they lost my order actually they gave it...,156,0.09375,0.529167
3,i see im not the only one giving star only be...,0,i see im not the only one giving star only bec...,22,0.0,1.0
4,well its mcdonalds so you know what the food i...,1,well its mcdonalds so you know what the food i...,143,-0.00936147,0.437175


In [23]:
# Remove the raw review column; `trim_text` holds its whitespace-normalised form.
# errors='ignore' keeps this cell re-runnable: executing it again after the
# column is already gone is a no-op instead of raising KeyError.
df.drop('cleaned_review', axis=1, inplace=True, errors='ignore')

In [25]:
# Persist the processed frame next to the notebook, without the row index.
df.to_csv(path_or_buf='McClean.csv', index=False)

In [26]:
# Final look at the first five rows of the frame that was just written out.
df.head(n=5)

Unnamed: 0,rude,trim_text,review_len,sentiment_polarity,sentiment_objectivity
0,1,im not a huge mcds lover but ive been to bette...,50,-0.328571,0.828571
1,1,terrible customer service came in at pm and st...,80,-0.257143,0.521429
2,0,first they lost my order actually they gave it...,156,0.09375,0.529167
3,0,i see im not the only one giving star only bec...,22,0.0,1.0
4,1,well its mcdonalds so you know what the food i...,143,-0.00936147,0.437175


In [154]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Naive Bayes classifier with its default settings.
nb = MultinomialNB()

# Document-frequency cut-offs handed to the vectorizer below.
min_df = 0.01
max_df = 0.95

# Bag-of-words vectorizer: English stop words removed, unigrams and bigrams.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=min_df,
    max_df=max_df,
)

In [155]:
# Model input is the normalised review text; the target is the `rude` column.
X, y = df['trim_text'], df['rude']

In [156]:
# Sanity check: print the shape of the features, then of the labels.
for series in (X, y):
    print(series.shape)

(1525,)
(1525,)


In [157]:
# Learn the vocabulary and build the document-term matrix in one step.
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split, so vocabulary selection also sees the held-out rows. Consider fitting
# on the training split only — TODO confirm whether that is intended.
X_dtm = vect.fit_transform(raw_documents=X)
print(X_dtm.shape)

(1525, 758)


In [158]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_dtm,
    y,
    test_size=0.30,
    random_state=42,
)

In [159]:
# Train the classifier on the training split.
nb.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [160]:
# Predicted class label for every held-out review.
y_pred_class = nb.predict(X=X_test)

In [161]:
# Confusion matrix of true labels against predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[253,  55],
       [ 44, 106]])

In [166]:
# Display names for the two classes; presumably label 0 is "not rude" and
# label 1 is "rude" — verify against the `rude` column encoding.
target_names = ['not rude', 'rude']
report = classification_report(y_test, y_pred_class, target_names=target_names)
print(report)

             precision    recall  f1-score   support

   not rude       0.85      0.82      0.84       308
       rude       0.66      0.71      0.68       150

avg / total       0.79      0.78      0.79       458



In [167]:
# Vocabulary learned by the vectorizer, one entry per matrix column.
# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); prefer the new method when it exists and fall back
# to the old one so the cell runs on either version.
_get_feature_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
X_tokens = list(_get_feature_names())

# Per-class token counts accumulated during fitting. Rows follow the fitted
# class order; index 1 is taken as "rude" and index 0 as "not rude"
# (assumes labels are 0/1 — TODO confirm against nb.classes_).
rude_features = nb.feature_count_[1, :]
not_rude_features = nb.feature_count_[0, :]

In [168]:
# One row per token with its count in each class, indexed by the token itself.
token_counts = {
    'tokens': X_tokens,
    'rude_total': rude_features,
    'not_rude_total': not_rude_features,
}
token_df = pd.DataFrame(token_counts).set_index('tokens')

In [170]:
# Five tokens with the highest count in the "not rude" class.
token_df.sort_values('not_rude_total', ascending=False).head(n=5)

Unnamed: 0_level_0,not_rude_total,rude_total
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1
mcdonalds,613.0,316.0
food,363.0,220.0
order,269.0,282.0
just,248.0,152.0
place,226.0,105.0
