### NLP Sentiment Analysis Exercise

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import string
import nltk

In [81]:
# load data
fname = './data/Tweets.csv'
df = pd.read_csv(fname)

**Task:** Print the top 5 rows.

In [82]:
df.head(100)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,569910981868060673,negative,1.0000,Customer Service Issue,0.6863,Virgin America,,MerchEngines,,0,"@VirginAmerica Is it me, or is your website do...",,2015-02-23 09:25:41 -0800,"Los Angeles, CA",Arizona
96,569909224521641984,negative,1.0000,Customer Service Issue,0.6771,Virgin America,,ColorCartel,,0,@VirginAmerica I can't check in or add a bag. ...,,2015-02-23 09:18:42 -0800,"Austin, TX",Mountain Time (US & Canada)
97,569907336485019648,negative,1.0000,Can't Tell,0.6590,Virgin America,,MustBeSpoken,,0,@VirginAmerica - Let 2 scanned in passengers l...,,2015-02-23 09:11:12 -0800,,
98,569896805611089920,negative,1.0000,Flight Booking Problems,0.6714,Virgin America,,mattbunk,,0,@virginamerica What is your phone number. I ca...,,2015-02-23 08:29:21 -0800,"Sterling Heights, MI",Eastern Time (US & Canada)


**Task:** Use the `'text'` column to create an array with the name `'features'`.



In [83]:
features = df['text']

**Task:** Use `'airline_sentiment'` column to create an array with the name `'labels'`.

In [84]:
labels = df['airline_sentiment']

**Task:** Clean the text data in the `'features'` array.

    - Remove all the special characters.
    - Remove all single characters.
    - Remove single characters from the start.
    - Substituting multiple spaces with single space.
    - Converting all text to lowercase.

In [85]:
features = features.str.replace(r'[^a-zA-Z\d ]+', '', regex=True)
features = features.str.replace(r'(^| ).( |$)', '', regex=True)
features = features.str.replace(r' +', ' ', regex=True)
features = features.str.lower()

In [86]:
features.head()

0                     virginamerica what dhepburn said
1    virginamerica plus youve added commercials to ...
2    virginamericadidnt today must meanneed to take...
3    virginamerica its really aggressive to blast o...
4    virginamerica and itsreally big bad thing abou...
Name: text, dtype: object

**Task:** Instatiate TfidfVectorizer with following parameters:

    - max_features = 2500
    - min_df = 7
    - max_df = 0.8
    - stop_words = stopwords.words('english')
    
    


In [87]:
tfidf = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

**Task:** Transform features with vectorizer. 

In [95]:
sparse_mat = tfidf.fit_transform(features)

**Task:**  Split the data.

In [96]:
X_train, X_test, y_train, y_test = train_test_split(sparse_mat, labels, test_size=0.2, random_state=4, stratify=labels)

In [97]:
rfc = RandomForestClassifier()
lgr = LogisticRegression()
nb = GaussianNB()

**Task:** Fit your classifier to data.

In [98]:
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [99]:
lgr.fit(X_train, y_train)

LogisticRegression()

In [101]:
nb.fit(X_train.toarray(), y_train)

GaussianNB()

**Task:** Predict X_test.

In [103]:
rfc.score(X_test, y_test)

0.7551229508196722

**Task:** Print confusion matrix.

In [105]:
metrics.confusion_matrix(y_test, rfc.predict(X_test))

array([[1708,   79,   48],
       [ 326,  248,   46],
       [ 162,   56,  255]])

In [106]:
metrics.confusion_matrix(y_test, lgr.predict(X_test))

array([[1701,  103,   31],
       [ 293,  285,   42],
       [ 120,   65,  288]])

In [108]:
metrics.confusion_matrix(y_test, nb.predict(X_test.toarray()))

array([[684, 473, 678],
       [ 52, 200, 368],
       [ 39,  61, 373]])

**Task:** Print accaccuracy_score.

In [111]:
print(metrics.classification_report(y_test, lgr.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.80      0.93      0.86      1835
     neutral       0.63      0.46      0.53       620
    positive       0.80      0.61      0.69       473

    accuracy                           0.78      2928
   macro avg       0.74      0.67      0.69      2928
weighted avg       0.77      0.78      0.76      2928

