# US Airline Sentiment Analysis using Twitter Data

<b> Project Description</b>

<em>Twitter data was scraped in February 2015, and contributors were asked to classify each tweet as positive, negative, or neutral. We then train a model on this data so that future tweets can be classified into the same categories. As part of this project, we tokenize the tweets into meaningful words and train the model on those words.</em>

# PHASE 1

In [1]:
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import re
#nltk.download()
# Load the labelled tweets. Forward slashes keep the relative path usable on
# every platform and avoid the invalid "\D" / "\T" escape sequences that the
# backslash-separated literal relied on.
data = pd.read_csv('./Data/Tweets.csv')
# Data Preprocessing
# Drop one leading "@mention" together with the single whitespace character
# that follows it; mentions elsewhere in the tweet are left untouched.
data['text'] = data['text'].map(lambda x: re.sub(r"^@[^\s]+\s", "", x))
def getHashtag(x):
    """Return the text of the first hashtag in ``x``, without the leading ``#``.

    The tag is the run of non-whitespace characters that follows a ``#``.
    A ``#`` that is immediately followed by whitespace (or by the end of the
    string) is not a tag and is skipped.

    Parameters
    ----------
    x : str
        Tweet text.

    Returns
    -------
    str
        The first hashtag body, or ``""`` when the text contains none.
    """
    # re.search (rather than an anchored re.match that demands leading
    # non-'#' characters) also finds a hashtag at the very start of the text.
    found = re.search(r"#(\S+)", x)
    return found.group(1) if found else ""

# Extract the hashtag of every tweet and store it lower-cased in one pass.
data['hashtags'] = data['text'].map(getHashtag).str.lower()

from nltk.corpus import stopwords
def data_preprocess( raw_review ):
    """Reduce one raw tweet to a space-joined string of lower-case words.

    Processing steps, in order:
      1. every token starting with ``www`` or ``http`` is replaced by a space;
      2. every character that is not an ASCII letter is replaced by a space;
      3. the text is lower-cased and split on whitespace;
      4. NLTK English stop words are removed.

    Parameters
    ----------
    raw_review : str
        The tweet text to clean.

    Returns
    -------
    str
        The remaining words joined by single spaces (may be empty).
    """
    # Raw string literals: "\S" inside a normal literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    review_text = re.sub(r'((www\S+)|(http\S+))', " ", raw_review)

    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()

    # A set gives constant-time membership tests in the filter below.
    stops = set(stopwords.words("english"))

    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)

# Clean every tweet exactly once. Iterating the column directly avoids
# rebuilding the complete Python list on each loop iteration, which the
# previous `data['text'].tolist()[i]` lookup did.
Reviews = [data_preprocess(tweet) for tweet in data['text']]

# Bag-of-words term counts; the vectorizer applies its own English stop-word
# list on top of the cleaning done in data_preprocess.
vectorizer = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(Reviews)

# Vocabulary terms as an array so they can be indexed with integer arrays.
# Newer scikit-learn releases only provide get_feature_names_out(); older
# ones only provide get_feature_names(), so support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = np.asarray(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())

# PHASE 2 : Project - Evaluation

# PERFORMANCE MEASURE : ACCURACY

We selected accuracy as the performance measure for all classifiers: this is a multiclass problem, and we consider accuracy the most suitable measure for it.

In [2]:
# X.shape is the (rows, columns) pair, i.e. (tweets, vocabulary terms), so it
# can feed both %d placeholders directly.
print('DATA SHAPE : vectorized %d tweets(instances). found %d terms(features).' % X.shape)

DATA SHAPE : vectorized 14640 tweets(instances). found 11626 terms(features).


In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

Baseline : DummyClassifier() 

In [4]:
# Two reference points every real model has to beat:
#   * most_frequent - always predict the majority class
#   * stratified    - predict at random following the class distribution
# Both are scored with 'accuracy' so the printed numbers are directly
# comparable with the model experiments, which all use the same scorer.
baseline_target = data['airline_sentiment']
for baseline_strategy, baseline_label in (
    ('most_frequent', 'predicting majority class'),
    ('stratified', 'random prediction'),
):
    base_clf = DummyClassifier(strategy=baseline_strategy, random_state=0)
    Acc_Score = cross_val_score(base_clf, X, baseline_target, scoring='accuracy', cv=10)
    print("Accuracy for %s %0.3f" % (baseline_label, np.mean(Acc_Score)))

Accuracy for predicting majority class 0.627
Accuracy for random prediction 0.460


# MODEL EVALUATION

Model 1 : LogisticRegression

In [5]:
Y = data['airline_sentiment']

# One entry per experiment, evaluated and reported in this order.
lr_experiments = [
    dict(random_state=2),
    dict(penalty='l2', C=1, solver='lbfgs', multi_class='multinomial'),
    dict(penalty='l2', C=0.5, solver='lbfgs', multi_class='multinomial', warm_start=True),
    dict(penalty='l2', C=10, solver='lbfgs', multi_class='multinomial'),
    dict(penalty='l2', C=0.5, solver='newton-cg', multi_class='multinomial'),
    dict(penalty='l2', C=8, solver='newton-cg', multi_class='multinomial'),
    dict(penalty='l2', C=0.5, solver='sag', multi_class='multinomial'),
    dict(penalty='l2', C=0.01, solver='sag', multi_class='multinomial'),
    dict(penalty='l2', C=1.5, solver='sag', multi_class='multinomial', max_iter=200),
]

print("LOGISTIC REGRESSION ACCURACIES FOR VARIOUS EXPERIMENTS:")
for lr_params in lr_experiments:
    lr_clf = LogisticRegression(**lr_params)
    # cross_val_score fits its own clone of the estimator on every fold, so a
    # separate fit on the full data set beforehand would only cost time.
    # lr_clf is therefore left unfitted here.
    Acc_Score = cross_val_score(lr_clf, X, Y, scoring='accuracy', cv=10)
    print("%0.3f" % np.mean(Acc_Score))

LOGISTIC REGRESSION ACCURACIES FOR VARIOUS EXPERIMENTS:
0.762
0.757
0.762
0.737
0.761
0.739
0.762
0.689
0.754


Model 2 : MultinomialNB

In [6]:
# One entry per experiment, evaluated and reported in this order; the empty
# dict is MultinomialNB with its default settings.
nb_experiments = [
    dict(),
    dict(alpha=0.5),
    dict(alpha=0.5, fit_prior=False),
    dict(alpha=0.1),
    dict(alpha=10),
]

print("MULTINOMIAL NB ACCURACIES FOR VARIOUS EXPERIMENTS:")
for nb_params in nb_experiments:
    nb_clf = MultinomialNB(**nb_params)
    # cross_val_score fits its own clone of the estimator on every fold, so a
    # separate fit on the full data set beforehand would only cost time.
    # nb_clf is therefore left unfitted here.
    Acc_Score = cross_val_score(nb_clf, X, Y, scoring='accuracy', cv=10)
    print("%0.3f" % np.mean(Acc_Score))

MULTINOMIAL NB ACCURACIES FOR VARIOUS EXPERIMENTS:
0.741
0.740
0.721
0.729
0.661


Model 3 : DecisionTree

In [7]:
# One entry per experiment, evaluated and reported in this order; the empty
# dict is DecisionTreeClassifier with no parameters set.
# No random_state is passed, so the configurations that randomise feature or
# split selection can give slightly different scores from run to run.
dt_experiments = [
    dict(),
    dict(criterion="entropy"),
    dict(criterion="entropy", max_features=2500),
    dict(criterion="entropy", splitter="random", max_depth=15),
    dict(criterion="gini", splitter="random", max_depth=20),
    dict(criterion="entropy", max_depth=10),
]

print("DECISION TREE ACCURACIES FOR VARIOUS EXPERIMENTS:")
for dt_params in dt_experiments:
    dt_clf = DecisionTreeClassifier(**dt_params)
    # cross_val_score fits its own clone of the estimator on every fold, so a
    # separate fit on the full data set beforehand would only cost time.
    # dt_clf is therefore left unfitted here.
    dt_acc_score = cross_val_score(dt_clf, X, Y, scoring='accuracy', cv=10)
    print("%0.3f" % np.mean(dt_acc_score))

DECISION TREE ACCURACIES FOR VARIOUS EXPERIMENTS:
0.685
0.679
0.679
0.687
0.691
0.683


Since the model with the best performance measure was logistic regression with the parameter random_state = 2, we use it for predicting test data and for finding the top features for each class label:

In [8]:
#Chosen Model With Best Performance
lr_clf = LogisticRegression(random_state = 2)
# This fit on the full data set is required: the following cells read the
# learned coefficients from lr_clf.coef_.
lr_clf.fit(X,Y)
print("ACCURACY OF BEST MODEL:")
Acc_Score = cross_val_score(lr_clf,X,Y, scoring='accuracy',cv=10)
# Report the mean 10-fold accuracy announced by the header above.
print("%0.3f" % np.mean(Acc_Score))

ACCURANCY OF BEST MODEL:


In [9]:
#Splitting Data For Predicting Target Label
# NOTE(review): every statement in this cell is commented out, so nothing here
# runs. It sketches a hold-out evaluation: a train/test split with
# test_size=0.33, a LogisticRegression(random_state = 2) fitted on the
# training part, and predictions on the test part.
# NOTE(review): if this is re-enabled, the `y_pred.toarray()` line looks
# suspect - presumably predict() already returns a dense array; confirm.
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
#lr_clf_pred = LogisticRegression(random_state = 2)
#lr_clf_pred.fit(X_train,y_train)
#y_pred = lr_clf_pred.predict(X_test)
#y_pred = y_pred.toarray()
#print("Predicted labels are", y_pred)
#print (y_test.astype(np.array))
#print("Actual labels are", )


In [10]:
# Ten highest-weighted vocabulary terms for the positive class.

# Row 2 of the coefficient matrix is read as the positive class
# (assumes the fitted class order puts "positive" last - TODO confirm via lr_clf.classes_).
coef = lr_clf.coef_[2]
# Ascending argsort reversed gives largest-first; keep the first ten positions.
top_coef_ind = np.argsort(coef)[::-1][:10]
# Look up the term and the weight at each selected position.
top_coef_terms, top_coef = vocab[top_coef_ind], coef[top_coef_ind]
# Show the (term, weight) pairs as the cell result.
print('top weighted terms for positive class:')
list(zip(top_coef_terms, top_coef))

top weighted terms for positive class:


[('thank', 3.3030894349474957),
 ('awesome', 3.1160818253206344),
 ('excellent', 2.9245982834469455),
 ('thanks', 2.8090404191408966),
 ('kudos', 2.6521922550212698),
 ('amazing', 2.553455131579506),
 ('great', 2.5506392969808065),
 ('wonderful', 2.3234464565207706),
 ('exceptional', 2.203178962889397),
 ('thx', 2.2018409229666926)]

In [11]:
# Ten highest-weighted vocabulary terms for the neutral class.

# Row 1 of the coefficient matrix is read as the neutral class
# (assumes the fitted class order puts "neutral" second - TODO confirm via lr_clf.classes_).
coef = lr_clf.coef_[1]
# Ascending argsort reversed gives largest-first; keep the first ten positions.
top_coef_ind = np.argsort(coef)[::-1][:10]
# Look up the term and the weight at each selected position.
top_coef_terms, top_coef = vocab[top_coef_ind], coef[top_coef_ind]
# Show the (term, weight) pairs as the cell result.
print('top weighted terms for neutral class:')
list(zip(top_coef_terms, top_coef))

top weighted terms for neutral class:


[('hungupnohelp', 1.8175029455304572),
 ('apple', 1.6053684587693655),
 ('hi', 1.5983790478535986),
 ('resolutions', 1.4990238478486217),
 ('australia', 1.4408634022980562),
 ('mexico', 1.4273061599504395),
 ('discounts', 1.4272609002383718),
 ('anytime', 1.3498770261932362),
 ('ceo', 1.3407933916221932),
 ('winter', 1.3358910955288446)]

In [None]:
# Ten highest-weighted vocabulary terms for the negative class.

# Row 0 of the coefficient matrix is read as the negative class
# (assumes the fitted class order puts "negative" first - TODO confirm via lr_clf.classes_).
coef = lr_clf.coef_[0]
# Ascending argsort reversed gives largest-first; keep the first ten positions.
top_coef_ind = np.argsort(coef)[::-1][:10]
# Look up the term and the weight at each selected position.
top_coef_terms, top_coef = vocab[top_coef_ind], coef[top_coef_ind]
# Show the (term, weight) pairs as the cell result.
print('top weighted terms for negative class:')
list(zip(top_coef_terms, top_coef))

top weighted terms for negative class:


[('worst', 3.4091999520825467),
 ('ridiculous', 2.3527754645333374),
 ('fail', 2.1527729410743266),
 ('disappointed', 2.0916681301544298),
 ('rude', 2.0605332325537442),
 ('unacceptable', 2.0434687712633135),
 ('worse', 2.0416875833448049),
 ('hrs', 2.0276281913827545),
 ('terrible', 1.9728353921483781),
 ('sucks', 1.9203371760627059)]

In [None]:
# Multi-layer perceptron with a fixed seed, fitted on the full data set
# (fit returns the estimator itself, so construction and fit can be chained).
mlp_clf = MLPClassifier(random_state=2).fit(X, Y)
# Mean accuracy over 5 cross-validation folds.
mlp_acc_score = cross_val_score(mlp_clf, X, Y, scoring='accuracy', cv=5)
print("%0.3f" % mlp_acc_score.mean())