In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag
from nltk.corpus import stopwords
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

## Loading Training and Testing Data and converting it to X and Y

In [12]:
training_df = pd.read_csv("twitter_x_y_train.csv")

In [13]:
training_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [40]:
testing_df = pd.read_csv("twitter_x_test.csv")
testing_df.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [31]:
training_df = training_df[['text','airline_sentiment']]
training_df.iloc[0,:]

text                 @SouthwestAir I am scheduled for the morning, ...
airline_sentiment                                             negative
Name: 0, dtype: object

In [32]:
print(training_df.shape)

(10980, 2)


In [34]:
training_data = []
for i in range(len(training_df)):
    training_data.append([training_df['text'][i], training_df['airline_sentiment'][i]])
print(training_data[0])

['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled', 'negative']


In [35]:
print(len(training_data))

10980


In [37]:
X = [i[0] for i in training_data]
Y = [i[1] for i in training_data]

In [38]:
print(len(X), X[0])
print(len(Y),Y[0])

10980 @SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled
10980 negative


In [42]:
testing_data = []
for i in range(len(testing_df)):
    testing_data.append(testing_df['text'][i])
print(testing_data[0])
print(len(testing_data))

@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?
3660


## Cleaning the training and testing data to remove stop words and punctuations and also lemmatize the words

In [46]:
stop_words = set(stopwords.words('english'))
punc = string.punctuation
stop_words.update(punc)

In [51]:
X_train_words = [nltk.word_tokenize(i) for i in X]
X_test_words = [nltk.word_tokenize(i) for i in testing_data]

In [52]:
print(len(X_train_words), len(X_test_words))

10980 3660


In [54]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [53]:
lemmatizer = WordNetLemmatizer()

In [55]:
def clean_data(words):
    output_words = []
    for w in words:
        if w.lower() not in stop_words:
            pos = pos_tag(w)
            clean_word = lemmatizer.lemmatize(w, pos = get_wordnet_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [56]:
X_train_words = [clean_data(words) for words in X_train_words]
X_test_words = [clean_data(words) for words in X_test_words]

In [58]:
print(len(X_train_words), len(X_test_words))

10980 3660


In [59]:
X_train_cleaned = [str(' '.join(word)) for word in X_train_words]
X_test_cleaned = [str(' '.join(word)) for word in X_test_words]
print(len(X_train_cleaned),len(X_test_cleaned))

10980 3660


In [62]:
print(X_train_cleaned[0],"\n",X_test_cleaned[0])

southwestair scheduled morning 2 days fact yes..not sure evening flight one cancelled flightled 
 americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat 2


In [259]:
# A = X_train_cleaned
# b = Y

## Using Vectorizer

In [299]:
x_train, x_test, y_train, y_test = train_test_split(X_train_cleaned, Y, random_state = 0)

In [300]:
count_vectorizer = CountVectorizer(max_features = 3000)
tfidf_vectorizer = TfidfVectorizer(max_features = 3000)

In [301]:
x_train_cv = count_vectorizer.fit_transform(x_train)
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_cv = count_vectorizer.transform(x_test)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [289]:
# a = count_vectorizer.fit_transform(A)

In [302]:
X_test_cleaned_tfidf = tfidf_vectorizer.transform(X_test_cleaned)
X_test_cleaned_cv = count_vectorizer.transform(X_test_cleaned)

## SVM

In [303]:
svc = SVC()

In [304]:
svc.fit(x_train_tfidf,y_train)
svc.score(x_test_tfidf,y_test)

0.7719489981785064

In [217]:
grid = {'C' : [1e2, 1e3, 5e3, 1e4, 5e4, 1e5],
       'gamma' : [1e-3, 5e-4, 1e-4, 5e-3]}
abc = GridSearchCV(svc, grid)
abc.fit(x_train_tfidf,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [100.0, 1000.0, 5000.0, 10000.0, 50000.0,
                               100000.0],
                         'gamma': [0.001, 0.0005, 0.0001, 0.005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [218]:
abc.best_estimator_

SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.005, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [305]:
y_predict = svc.predict(X_test_cleaned_tfidf)

## Linear SVC

In [239]:
lsvc = SVC(kernel = "linear", C=1)

In [278]:
lsvc.fit(x_train_tfidf,y_train)
lsvc.score(x_test_tfidf,y_test)

0.7770491803278688

In [231]:
grid1 = {'C' : [1e-4, 1e-3, 1e-2, 0.1, 1,1e2, 1e3, 5e3, 1e4, 5e4, 1e5]}
abc1 = GridSearchCV(lsvc, grid1)
abc1.fit(x_train_tfidf,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 100.0, 1000.0,
                               5000.0, 10000.0, 50000.0, 100000.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [232]:
abc1.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [276]:
y_predict = lsvc.predict(X_test_cleaned_tfidf)

## Multinomial NB

In [294]:
mnb = MultinomialNB(alpha = 1)
mnb.fit(x_train_cv,y_train)
mnb.score(x_test_cv,y_test)

0.7515482695810565

In [292]:
y_predict = mnb.predict(X_test_cleaned_cv)

## Logistic Regression

In [127]:
lr = LogisticRegression(max_iter = 1000)

In [298]:
lr.fit(x_train_cv,y_train)
lr.score(x_test_cv,y_test)

0.775591985428051

In [296]:
y_predict = lr.predict(X_test_cleaned_cv)

## KNN

In [134]:
knn = KNeighborsClassifier()

In [162]:
knn.fit(x_train_cv,y_train)
knn.score(x_test_cv,y_test)

0.47978142076502733

## Random Forest

In [136]:
rf = RandomForestClassifier()

In [163]:
rf.fit(x_train_cv,y_train)
rf.score(x_test_cv,y_test)

0.7406193078324226

## Save Output

In [306]:
opdf = pd.DataFrame(y_predict)
opdf.to_csv('predictions.csv', index = False, header = False)