In [3]:
# Importing required libraries
import string
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Loading training and testing data
train = pd.read_csv('./training_data.csv')
test = pd.read_csv('./test_data.csv')

# Data Preprocessing

In [5]:
train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [6]:
train.isnull().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

In [7]:
train_data = np.array(train['text'])
train_output = np.array(train['airline_sentiment'])
test_data = np.array(test['text'])

In [8]:
# Using Lemmatizer to group togetger forms of inflected forms
lemmatizer = WordNetLemmatizer()

In [9]:
# To remove Airline names from tweets
not_reqd = set(train['airline'])
not_reqd

{'American', 'Delta', 'Southwest', 'US Airways', 'United', 'Virgin America'}

In [10]:
# Importing stopwords using nltk and adding punctuations as well as few more words
stops = set(stopwords.words("english"))
punctuations = string.punctuation
stops = list(stops) + list(punctuations) + list(not_reqd) + ['@','http','americanair', 'JetBlue','SouthwestAir', 'USAirways']

In [11]:
# Function to return POS in the format required by lemmatizer
def simple_pos(word):
    if word.startswith('J'):
        return wordnet.ADJ
    if word.startswith("V"):
        return wordnet.VERB
    if word.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Function to remove stops(stopwords) from the tweets
def clean_word(words):
    output_words = []
    for w in words:
        if (w.lower() not in stops) and (w.isnumeric() == False):
            pos = pos_tag(w)
            clean_word = lemmatizer.lemmatize(w, pos = simple_pos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [12]:
# Joining the words in sentence after cleaning the words
train = [' '.join(clean_word(word_tokenize(word))) for word in train_data]

In [13]:
train

['SouthwestAir scheduled morning days fact yes..not sure evening flight one Cancelled Flightled',
 'SouthwestAir seeing worker time time going beyond love flying guy Thank',
 'united Flew ORD Miami back great crew service leg THANKS',
 "SouthwestAir dultch97 's horse radish 😤🐴",
 'united flight ORD delayed Air Force One last flight SBN 8:20 min landed',
 'united load us flying sardine knew pilot hour Late Flight incompetent beyond belief',
 "JetBlue stock response Delays frustrating poor cust serv amp told ppl wait amp 'd come back",
 "JetBlue 'd nice Hoping rack enough mile take trip Seattle enjoy perfect latte city coffee",
 'united frankly worse customer service ever Problems happen deal defines company Never United',
 "SouthwestAir yeah haha Never one 's expensive 😂😂 much fun destinationdragons",
 "SouthwestAir MCO- gt DCA flight almost full people screwed MSY-DCA Cancelled Flightation united USAirways n't Cancelled Flight SWA=mistake",
 "JetBlue 's easiest way get ticket receipt g

In [14]:
# Splitting the data
x_train1, x_train2, y_train1, y_train2 = train_test_split(train, train_output, random_state = 0)

In [15]:
# Applying Count Vectorizer on training data
count_vec = CountVectorizer(max_features=4000)
x_train1_features = count_vec.fit_transform(x_train1)
x_train1_features

<8235x4000 sparse matrix of type '<class 'numpy.int64'>'
	with 73143 stored elements in Compressed Sparse Row format>

In [16]:
# Applying Count Vectorizer on testing data
x_train2_features = count_vec.transform(x_train2)
x_train2_features

<2745x4000 sparse matrix of type '<class 'numpy.int64'>'
	with 24215 stored elements in Compressed Sparse Row format>

In [17]:
# List of features selected
count_vec.get_feature_names()

['00',
 '000',
 '02',
 '03',
 '05',
 '10',
 '100',
 '10a',
 '10hrs',
 '10pm',
 '11',
 '11am',
 '11th',
 '12',
 '130',
 '13th',
 '14',
 '15',
 '15th',
 '16',
 '17',
 '19',
 '1hr',
 '1k',
 '1pm',
 '1st',
 '20',
 '200',
 '2015',
 '20min',
 '21',
 '21st',
 '22',
 '23',
 '24',
 '24hrs',
 '24th',
 '25',
 '26',
 '27',
 '28',
 '2d',
 '2day',
 '2days',
 '2hr',
 '2hrs',
 '2littlebirds',
 '2nd',
 '2pm',
 '2x',
 '2xdaily',
 '2y',
 '30',
 '300',
 '30a',
 '30am',
 '30k',
 '30min',
 '30mins',
 '30pm',
 '31',
 '320008a',
 '35',
 '35pm',
 '35x',
 '36',
 '3am',
 '3d',
 '3fq3xelbon',
 '3hrs',
 '3pm',
 '3rd',
 '3thparty',
 '3x',
 '3yr',
 '40',
 '40mins',
 '40pm',
 '40th',
 '41g',
 '42',
 '428',
 '4322',
 '433',
 '45',
 '45am',
 '45min',
 '45mins',
 '45pm',
 '47',
 '4am',
 '4hrs',
 '4ojrsdwpkk',
 '4pm',
 '4th',
 '4ward',
 '4x',
 '50',
 '500',
 '50am',
 '50k',
 '50pm',
 '51',
 '55',
 '55am',
 '55pm',
 '58',
 '5am',
 '5hrs',
 '5pm',
 '5th',
 '60',
 '64gb',
 '65',
 '6am',
 '6hrs',
 '6pm',
 '6th',
 '70',
 '700

In [72]:
# Applying SVM classifier on data
svc = SVC()
svc.fit(x_train1_features, y_train1)
svc.score(x_train2_features, y_train2)

0.7635701275045538

In [73]:
# Applying Multinomial Naive Bayes classifier on data
nb = MultinomialNB()
nb.fit(x_train1_features, y_train1)
nb.score(x_train2_features, y_train2)

0.7595628415300546

In [74]:
# Applying Random Forest classifier on data
rf = RandomForestClassifier()
rf.fit(x_train1_features, y_train1)
rf.score(x_train2_features, y_train2)

0.7442622950819672

In [75]:
# Applying Logistic Regression classifier on data
lr = LogisticRegression(max_iter=200)
lr.fit(x_train1_features, y_train1)
lr.score(x_train2_features, y_train2)

0.7777777777777778

In [18]:
# Applying Count Vectorizer on test data
count_vec = CountVectorizer(max_features=4000)
x_train_features = count_vec.fit_transform(train)
x_test_features = count_vec.transform(test_data)

In [52]:
svc = SVC()
svc.fit(x_train_features, train_output)
y_pred = svc.predict(x_test_features)

In [79]:
lr = LogisticRegression(max_iter=250)
lr.fit(x_train_features, train_output)
y_pred = lr.predict(x_test_features)

In [80]:
len(y_pred)

3660

In [81]:
y_pred

array(['negative', 'negative', 'negative', ..., 'negative', 'positive',
       'neutral'], dtype=object)

In [56]:
np.savetxt("Predictions-1.csv", y_pred, fmt='%s', delimiter=',')

# Using Neural Network

In [62]:
# Importing Libraries
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [48]:
# Converting the data into labels
label = LabelEncoder()
y_train = label.fit_transform(train_output)

In [49]:
y_train.shape

(10980,)

In [50]:
x_train_features.shape

(10980, 4000)

In [51]:
# Encoding the data into categorical form
y_encoded = to_categorical(np.array(y_train))

In [52]:
# Adding hidden and output layers to the model
model = Sequential()
model.add(Dense(units=3000, activation='relu', input_dim=4000))
model.add(Dense(units=2000, activation='relu'))
model.add(Dense(units=1000, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [53]:
# Specifying and compiling the loss function and optimizer
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [54]:
model.fit(x_train_features, y_encoded, epochs=10, batch_size=250)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x28142def108>

In [55]:
# Predicting the values for test data
predictions = model.predict(x_test_features)

In [63]:
# Picking only the max value for each datarow
predict = tf.argmax(predictions,1)

In [72]:
sess = tf.Session()
arr = np.array(sess.run(predict))

In [73]:
arr

array([0, 0, 0, ..., 0, 2, 1], dtype=int64)

In [76]:
# Converting the data into original categories
final = []
for i in arr:
    if i == 0:
        final.append('negative')
    elif i==1:
        final.append('neutral')
    else:
        final.append('positive')

In [77]:
final

['negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'neutral',
 'positive',
 'negative',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'neutral',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negat

In [82]:
np.savetxt("PredictionsNN.csv", final, fmt='%s', delimiter=',')