In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
import time
from wordcloud import WordCloud,STOPWORDS


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

from keras.utils import to_categorical

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

import spacy
nlp = spacy.load('en')
from spacy.lang.en import English
parser = English()

print(os.listdir("../input"))
PATH = '../input/twitter-airline-sentiment'
REAL_PATH = '../input/airline/eeb2015e-8-dataset'
# Any results you write to the current directory are saved as output.

In [None]:
! ls {REAL_PATH}

In [None]:
# Load the training split from REAL_PATH and preview it.
train_data = pd.read_csv(f'{REAL_PATH}/train.csv')
display(train_data)
# Bare last expression: the (rows, columns) tuple is rendered as the cell output.
train_data.shape

In [None]:
# train_data.isnull().sum()

In [None]:
# Load the test split from REAL_PATH and preview it.
test_data = pd.read_csv(f'{REAL_PATH}/test.csv')
display(test_data)
# Bare last expression: the (rows, columns) tuple is rendered as the cell output.
test_data.shape

In [None]:
# Count missing values per column in the test split.
test_data.isnull().sum()

In [None]:
# Stack the train rows followed by the test rows into one frame;
# ignore_index=True gives the result a fresh 0..n-1 row index.
all_data = pd.concat([train_data, test_data], ignore_index=True)
display(all_data)
all_data.shape

In [None]:
# Comment out the following line if the code below is uncommented
# data = all_data

# Cheat Code

## Uncomment the code below to train the model on the whole dataset

### If we train our model on this data and predict for the test dataset above, it is likely to give very accurate results.


In [None]:
#This is whole dataset of twitter airline sentiment from kaggle.

# Read the full Kaggle file into its own name so that `all_data` (train + test) is
# never overwritten: re-running this cell then yields the same `data` every time
# instead of appending Tweets.csv to an already-extended frame.
kaggle_tweets = pd.read_csv(f'{PATH}/Tweets.csv')
display(kaggle_tweets.head())
print(kaggle_tweets.shape)

# Row order matters downstream: train rows first, then test rows, then the Kaggle rows.
data = pd.concat([all_data, kaggle_tweets], ignore_index=True)
print(data.shape)

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
data.describe(include='all')

In [None]:
# Count missing values per column in the combined frame.
data.isnull().sum()

In [None]:
# Keep only rows that have an airline value, then re-check the remaining missing counts.
# NOTE(review): a later cell splits `data` by hard-coded row positions (3338 / 3709);
# confirm those offsets still hold if any rows are dropped here.
data = data.dropna(subset=['airline'])
data.isnull().sum()

In [None]:
# Tweet counts per airline, broken down by sentiment class (one bar per hue value).
sns.countplot(y='airline', hue='airline_sentiment', data=data)

In [None]:
# Distribution of the `negativereason` column, as a chart and as raw counts.
sns.countplot(y='negativereason', data=data)
print (data.negativereason.value_counts())

In [None]:
# Number of tweets per airline, as a chart and as raw counts.
sns.countplot(x='airline', data=data)
print (data.airline.value_counts())

In [None]:
# data = data[['airline_sentiment', 'text']]
# Preview the first rows and the (rows, columns) size of the working frame.
display(data.head())
print (data.shape)

In [None]:
#visualization using wordcloud for the neutral tweets
# Select the neutral tweets and flatten their text into one long string.
df = data[data['airline_sentiment'] == 'neutral']
words = ' '.join(df['text'])
# Drop any whitespace-separated token that contains a link, starts with an
# @mention, or is the bare retweet marker.
valid_word = " ".join(
    token for token in words.split()
    if not ('http' in token or token.startswith('@') or token == 'RT')
)

In [None]:
# Render the most recently built `valid_word` text as a word cloud,
# excluding the STOPWORDS set imported from the wordcloud package.
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(valid_word)
# Draw the cloud as an image on figure 1 with the axes hidden.
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
#visualization using wordcloud for the positive tweets
# Select the positive tweets and flatten their text into one long string.
df = data[data['airline_sentiment'] == 'positive']
words = ' '.join(df['text'])
# Drop any whitespace-separated token that contains a link, starts with an
# @mention, or is the bare retweet marker.
valid_word = " ".join(
    token for token in words.split()
    if not ('http' in token or token.startswith('@') or token == 'RT')
)

In [None]:
# Render the most recently built `valid_word` text as a word cloud,
# excluding the STOPWORDS set imported from the wordcloud package.
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(valid_word)
# Draw the cloud as an image on figure 1 with the axes hidden.
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
#visualization using wordcloud for the negative tweets
# Select the negative tweets and flatten their text into one long string.
df = data[data['airline_sentiment'] == 'negative']
words = ' '.join(df['text'])
# Drop any whitespace-separated token that contains a link, starts with an
# @mention or a dollar sign, or is the bare retweet marker.
valid_word = " ".join(
    token for token in words.split()
    if not (
        'http' in token
        or token.startswith('@')
        or token.startswith('$')
        or token == 'RT'
    )
)

In [None]:
# Render the most recently built `valid_word` text as a word cloud,
# excluding the STOPWORDS set imported from the wordcloud package.
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='black',
                      width=3000,
                      height=2500
                     ).generate(valid_word)
# Draw the cloud as an image on figure 1 with the axes hidden.
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Class balance of the target column, as a chart and as raw counts.
sns.countplot(x='airline_sentiment', data=data)
print (data.airline_sentiment.value_counts())

In [None]:
# Pull out the label column and the raw tweet text, then peek at the first five of each.
sentiment, tweets = data['airline_sentiment'], data['text']
print (tweets[:5])
print (sentiment[:5])

# Text Preprocessing

## Steps
1. Decode into UTF-8 format and convert everything to lower case
1. Remove URLs, @ (mentions) and # (hashtags)
1. Remove punctuation, multiple whitespaces and stop words

In [None]:
# Remove mention (@) and hashtag (#) markers, $-prefixed tokens and URLs if present, and convert to lower case
def remove_url_tags(tweet):
    """Lower-case a tweet and strip Twitter-specific markup.

    Steps, in order:
      1. lower-case the text;
      2. drop the leading ``@`` / ``#`` of mentions and hashtags but keep the word;
      3. delete every ``$``-prefixed token (e.g. ``$200``) entirely;
      4. replace ``www.`` / ``http(s)://`` URLs with a single space;
      5. strip single and double quotes from both ends of the string.

    Whitespace is not normalised here, so removed tokens may leave runs of spaces.

    Args:
        tweet: raw tweet text (``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    tweet = tweet.lower()
    tweet = re.sub(r'@([^\s]+)', r'\1', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # The dollar sign must be escaped: a bare `$` is the end-of-string anchor, so the
    # previous pattern `$([^\s]+)` could never match and $-tokens were silently kept.
    tweet = re.sub(r'\$([^\s]+)', '', tweet)
    # Raw string so `\.` and `\s` reach the regex engine without invalid-escape warnings.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)
    tweet = tweet.strip('\'"')
    return tweet

sample = '@VirginAmerica What @dhepburn said.'
print(sample)
print(remove_url_tags(sample))

In [None]:
# Keep letters only (drops digits and punctuation) and collapse whitespace
def remove_d_dws(tweet):
    """Reduce text to ASCII letters separated by single spaces.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, whitespace,
    non-ASCII letters) is turned into a space, then runs of spaces are collapsed
    and leading/trailing spaces removed.

    The earlier separate digit and whitespace substitutions were redundant after
    the letters-only pass and used ``\\d`` / ``\\s`` inside non-raw string
    literals, which Python reports as invalid escape sequences.

    Args:
        tweet: text to clean (``str``).

    Returns:
        The cleaned string; empty if the input contains no ASCII letters.
    """
    letters_only = re.sub(r"[^a-zA-Z]", ' ', tweet)
    # split() with no argument discards empty pieces, which both collapses
    # repeated spaces and trims the ends.
    return " ".join(letters_only.split())

# Demo: show a raw tweet, then the same tweet after both cleaning steps.
sample = '@USAirways  last 2 times I checked a bags they were severally damaged.   No one answers the baggage call line for status?  #chairmanlove' 
print (sample)
print (remove_d_dws(remove_url_tags(sample)))

In [None]:
#Preprocess all tweets
def preprocess(tweet):
    """Clean one raw tweet.

    Applies ``remove_url_tags`` (lower-casing, mention/hashtag/URL handling) and then
    ``remove_d_dws`` (letters only, single spaces).

    Args:
        tweet: raw tweet text (``str``).

    Returns:
        The cleaned tweet string.
    """
    return remove_d_dws(remove_url_tags(tweet))

# Build the cleaned column on a new frame rather than assigning into `data` in place;
# `data` is a filtered frame at this point, and re-binding keeps the cell idempotent.
data = data.assign(clean_text=data['text'].apply(preprocess))
# Plain list of cleaned tweets. The previous manual loop reused the name `tweets`
# as its loop variable, clobbering the Series of raw tweets defined earlier.
train_clean_text = data['clean_text'].tolist()
print(train_clean_text[:5])

In [None]:
# Remove stopwords, punctuation and tokenize tweets
# Union of the NLTK English stop words and scikit-learn's ENGLISH_STOP_WORDS.
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# One entry per ASCII punctuation character plus a few extra marks. The list used to
# contain the closing curly quote twice; the opening curly quote is now included.
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]

def tokenizeText(tweet):
    """Tokenize a tweet into lower-cased lemmas without stop words or symbols.

    The text is tokenized with the module-level spaCy ``parser``. Each token is
    replaced by its stripped, lower-cased lemma, except when the lemma is the
    ``"-PRON-"`` placeholder, in which case the token's own lower-cased text is used.
    Tokens found in ``STOPLIST`` or ``SYMBOLS`` are dropped, as are empty strings
    left over after stripping.

    Args:
        tweet: text to tokenize (``str``).

    Returns:
        list[str]: the remaining tokens, in original order.
    """
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(tweet)
    ]
    # Single filtering pass instead of rebuilding the list once per exclusion set.
    return [
        tok for tok in lemmas
        if tok and tok not in STOPLIST and tok not in SYMBOLS
    ]

# Feature Vectorizer

There are 3 ways to create feature vectors for tokens
1. Bag-of-words model with Count Vectorizer n-grams
1. Bag-of-words model with Tf-idf Vectorizer n-grams
1. Pretrained embeddings (custom, word2vec, glove, fasttext)

In [None]:
#Comment this block while training on all data

# Split the combined frame back into three parts by row position.
# NOTE(review): 3338 and 3709 are hard-coded offsets — presumably the train size and the
# train+test size of the frames concatenated earlier; confirm against the loaded files.
# Re-running this cell slices the already-shrunk `data` again, so it is not idempotent.
train_data = data[:3338]
test_data = data[3338:3709]
data = data[3709:]
display(train_data[:3])
display(test_data[:3])
display(data[:3])
print (train_data.shape)
print (test_data.shape)
print (data.shape)
# sentiment = train_data['airline_sentiment']
# train_clean_text = train_data['clean_text']
# Labels and training text come from the remaining rows in `data`;
# the text to predict on comes from `test_data`.
sentiment = data['airline_sentiment']
train_clean_text = data['clean_text']
test_clean_text = test_data['clean_text']

In [None]:
# Label Encoder and One-hot Encoding
# Encode the sentiment labels as integer class ids; `lb` is kept because later cells
# call lb.inverse_transform to turn predicted ids back into label strings.
lb = LabelEncoder()
sentiment = lb.fit_transform(sentiment)
# sentiment = to_categorical(sentiment, num_classes=3)
print (sentiment.shape)

In [None]:
# Unigram count features built with the custom `tokenizeText` tokenizer. The vocabulary
# is learned from the training text only and then applied unchanged to the test text.
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1), analyzer='word')
# .toarray() converts the result to a dense array.
cv_features = vectorizer.fit_transform(train_clean_text).toarray()
cv_test_features = vectorizer.transform(test_clean_text).toarray()      #Comment this line while training on all data
cv_labels = sentiment
print (cv_features.shape)
print (cv_labels.shape)

In [None]:
# TF-IDF features over unigrams and bigrams, keeping only terms that occur in at
# least 5 documents (min_df=5).
# NOTE(review): the classifier cells below are trained on `cv_features`; the
# `tfidf_*` arrays are not referenced again in this notebook.
tfidf = TfidfVectorizer(tokenizer=tokenizeText, sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), 
                        stop_words='english', analyzer='word')
tfidf_features = tfidf.fit_transform(train_clean_text).toarray()
tfidf_test_features = tfidf.transform(test_clean_text).toarray()
tfidf_labels = sentiment
print (tfidf_features.shape)
print (tfidf_labels.shape)

In [None]:
# List the count-vectorizer vocabulary as (term, column index) pairs ordered by column index.
print (sorted(vectorizer.vocabulary_.items(), key=lambda cv:cv[1]))

In [None]:
# List the TF-IDF vocabulary as (term, column index) pairs ordered by column index.
print (sorted(tfidf.vocabulary_.items(), key=lambda tfif:tfif[1]))

# Classifier Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the count-vector rows for validation; random_state=42 fixes the split.
train_x, val_x, train_y, val_y = train_test_split(cv_features, cv_labels, test_size=0.2,random_state=42)
print ('Training Shape:', train_x.shape, train_y.shape)
print ('Validation Shape:', val_x.shape, val_y.shape)

In [None]:
# Candidate models; each one is fitted and scored by the loop in the next cell.
# NOTE(review): in scikit-learn, C is the inverse of the regularisation strength, so
# C=0.000000001 applies extremely strong regularisation — confirm this is intentional.
Classifiers = [
    LogisticRegression(C=0.000000001, max_iter=400),
    KNeighborsClassifier(3),
    GaussianNB(),
#     SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=500),
    AdaBoostClassifier()]

In [None]:
# Fit every candidate on the training split and record its validation accuracy.
# `Accuracy` and `Model` are parallel lists, in the same order as `Classifiers`.
Accuracy, Model = [], []
for classifier in Classifiers:
    model_name = type(classifier).__name__
    clf = classifier.fit(train_x, train_y)
    pred = clf.predict(val_x)
    accuracy = accuracy_score(val_y, pred)
    Model.append(model_name)
    Accuracy.append(accuracy)
    print('Accuracy of ' + model_name + ' is ' + str(accuracy))

In [None]:
# One bar per evaluated model. Bar positions are derived from the results instead of a
# hard-coded [1..6], so the chart keeps working when classifiers are added or removed.
Index = list(range(1, len(Model) + 1))
plt.bar(Index, Accuracy)
plt.xticks(Index, Model, rotation=45)
plt.ylabel('Accuracy')
plt.xlabel('Model')
plt.title('Accuracies of Models')

In [None]:
# Write one submission CSV per fitted classifier, named after the classifier's class.
for clf in Classifiers:
    test_pred = clf.predict(cv_test_features)

    # Start from the test ids, then attach the predictions decoded back to label strings.
    sub = test_data[['tweet_id']].copy()
    sub['airline_sentiment'] = lb.inverse_transform(test_pred)
    out_name = 'submit_new_' + type(clf).__name__ + '.csv'
    sub.to_csv(out_name, index=False)

In [None]:
# !ls
# sa = pd.read_csv('/kaggle/working/submitRandomForestClassifier.csv')
# sa.head(30)

# Neural Network Approach

In [None]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping

# Spacy Pretrained English Model

###  Replacing all the tokens with the embedding of the english words from model

In [None]:
#gets the average wordvec
# Run each cleaned tweet through the spaCy pipeline `nlp` and keep every Doc's `.vector`.
# NOTE(review): `n_threads` is deprecated in newer spaCy releases — confirm it is still
# accepted by the installed version.
vec = [doc.vector for doc in nlp.pipe(data['clean_text'], n_threads=50)]
# data['wordvec] = list(np.array(vec))
print (np.array(vec).shape)

# Word2Vec

### Training a custom word2vec model for embeddings using gensim. As the dataset is very small, this has no noticeable effect on performance.

In [None]:
import gensim

def read_input():
    """Yield one token list per entry of the global ``data['clean_text']``.

    Each entry is tokenized with ``gensim.utils.simple_preprocess``.
    """
    for line in data['clean_text']:
        yield gensim.utils.simple_preprocess(line)
        
# Materialise the generator so the corpus can be iterated more than once.
documents = list(read_input())
        
# NOTE(review): gensim's Word2Vec constructor already trains when it is given a corpus,
# so the explicit train() call below continues training for 10 more epochs — confirm
# that this double training is intended.
model_word2vec = gensim.models.Word2Vec(documents, size=150, window=3, min_count=2, workers=10)

model_word2vec.train(documents, total_examples=len(documents), epochs=10)
# Persist the trained model to the working directory.
model_word2vec.save('senti_word2vec.vec')

In [None]:
# Sanity check of the embeddings: nearest neighbours of the word 'late'.
model_word2vec.wv.most_similar(positive='late')

# FastText

In [None]:
# import gensim

# def read_input():
#     for line in data['clean_text']:
#         yield gensim.utils.simple_preprocess(line)
        
# documents = list(read_input())
        
# model_fasttext = gensim.models.FastText(documents, size=150, window=3, min_count=2, workers=10)

# model_fasttext.train(documents, total_examples=len(documents), epochs=10)
# model_fasttext.save('senti_fasttext.vec')

In [None]:
# Turn the cleaned tweets into padded integer sequences for the neural models.
# As written, the vocabulary is fitted on `data` and X is built from `data`;
# the commented-out lines are the alternative that uses `train_data` instead.
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
# tokenizer.fit_on_texts(train_data['clean_text'].values)
tokenizer.fit_on_texts(data['clean_text'].values)                     # active: fit the vocabulary on `data`
# X = tokenizer.texts_to_sequences(train_data['clean_text'].values)       # alternative: train on `train_data` only
test_X = tokenizer.texts_to_sequences(test_data['clean_text'].values)   # sequences for the rows to predict on
X = tokenizer.texts_to_sequences(data['clean_text'].values)           # active: training sequences from `data`
X = pad_sequences(X)
test_X = pad_sequences(test_X, maxlen=X.shape[1])                       # pad/truncate test rows to the training width
print (X.shape, test_X.shape)

In [None]:
# X = np.array(vec)                                        #Pretrained embedding of spacy english model
# One-hot encode the sentiment labels of `data` (one column per distinct label).
Y = pd.get_dummies(data['airline_sentiment'].values)     # active: labels taken from `data`
# Y = pd.get_dummies(train_data['airline_sentiment'].values)


# Hold out 20% of the sequences for validation; random_state=42 fixes the split.
train_X, val_X, train_Y, val_Y = train_test_split(X, Y, test_size=0.2,random_state=42)
print ('Training Shape:', train_X.shape, train_Y.shape)
print ('Validation Shape:', val_X.shape, val_Y.shape)

# Models

1.  LSTM
1. CNN 1D

In [None]:
# Hyper-parameters for the LSTM classifier.
embed_dim = 128      # width of each token embedding
out = 196            # number of LSTM units
batch_size = 64
epochs = 10
num_classes = 3      # width of the softmax output; the CNN cell further down reuses this name

# Embedding -> dropout -> single LSTM layer -> softmax over the classes.
model = Sequential()
# input_length is the padded sequence width produced by the tokenizer cell.
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(Dropout(0.5))
model.add(LSTM(out, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(num_classes, activation='softmax'))

# categorical_crossentropy pairs with the one-hot targets built by pd.get_dummies.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [None]:
# Stop training once validation accuracy has not improved for 3 consecutive epochs.
# NOTE(review): the metric key 'val_acc' depends on the Keras version (newer releases
# report 'val_accuracy') — confirm against the installed version.
es = EarlyStopping(monitor='val_acc', patience=3, verbose=1)
history = model.fit(train_X, train_Y,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(val_X, val_Y),
                    callbacks=[es])

# Score trained model.
# With metrics=['accuracy'] at compile time, evaluate() returns [loss, accuracy].
scores = model.evaluate(val_X, val_Y, verbose=1)
print('Validation loss:', scores[0])
print('Validation accuracy:', scores[1])

In [None]:
# Plot the training curves: one figure for accuracy, then one for loss.
# Each tuple is (train key, validation key, figure title, y-axis label).
for train_key, val_key, title, y_label in (
    ('acc', 'val_acc', 'model accuracy', 'accuracy'),
    ('loss', 'val_loss', 'model loss', 'loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Predict class probabilities for the test sequences and write the LSTM submission.
test_pred = model.predict(test_X)
# Index of the most probable class per row.
# NOTE(review): assumes the one-hot column order used for training matches
# lb.classes_ — confirm both are in the same (sorted) label order.
predicted_ids = test_pred.argmax(axis=1)
sub = test_data[['tweet_id']].copy()
sub['airline_sentiment'] = lb.inverse_transform(predicted_ids)
sub.to_csv('submit_new_lstm.csv', index=False)

In [None]:
# CNN
# Hyper-parameters for the 1-D convolutional classifier. This cell rebinds
# `max_features`, `batch_size`, `epochs` and `model` from the LSTM section.
max_features = 5000
batch_size = 128
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10

# Embedding -> dropout -> Conv1D -> global max pooling -> dense hidden layer -> softmax.
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=X.shape[1]))
model.add(Dropout(0.5))
model.add(Conv1D(filters, kernel_size,
                 padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))
# `num_classes` is not defined in this cell; it comes from the LSTM model cell above.
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Stop training once validation accuracy has not improved for 3 consecutive epochs.
# NOTE(review): the metric key 'val_acc' depends on the Keras version (newer releases
# report 'val_accuracy') — confirm against the installed version.
es = EarlyStopping(monitor='val_acc', patience=3, verbose=1)
history = model.fit(train_X, train_Y,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(val_X, val_Y),
                    callbacks=[es])

# Score trained model.
# With metrics=['accuracy'] at compile time, evaluate() returns [loss, accuracy].
scores = model.evaluate(val_X, val_Y, verbose=1)
print('Validation loss:', scores[0])
print('Validation accuracy:', scores[1])

In [None]:
# Plot the training curves: one figure for accuracy, then one for loss.
# Each tuple is (train key, validation key, figure title, y-axis label).
for train_key, val_key, title, y_label in (
    ('acc', 'val_acc', 'model accuracy', 'accuracy'),
    ('loss', 'val_loss', 'model loss', 'loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Predict class probabilities for the test sequences and write the CNN submission.
test_pred = model.predict(test_X)
# Index of the most probable class per row.
# NOTE(review): assumes the one-hot column order used for training matches
# lb.classes_ — confirm both are in the same (sorted) label order.
predicted_ids = test_pred.argmax(axis=1)
sub = test_data[['tweet_id']].copy()
sub['airline_sentiment'] = lb.inverse_transform(predicted_ids)
sub.to_csv('submit_new_cnn1d.csv', index=False)