In [50]:
import numpy as np
import pandas as pd
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('vader_lexicon')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import xgboost as xgb

## Preprocessing

In [51]:
df = pd.read_csv('Conversations.csv'); df.head()

Unnamed: 0,Conversations
0,['Hey @British_Airways our @easyJet flight Ven...
1,['@easyJet thanks for looking after my luggage...
2,"[""@British_Airways I booked via Expedia. I'm n..."
3,['@easyJet is there anyway to travel with 2 ca...
4,['@easyJet how nice of you not to reply to my ...


In [70]:
class PreProcessor:
    '''
    Easily performs all the standard preprocessing steps
    like removing stopwords, stemming, etc.
    Only input that you need to provide is the dataframe
    '''
    def __init__(self, df, column_name):
        self.data = df
        self.conversations = self.data[column_name]
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.preprocessed = []
        
    def tokenize(self, sentence):
        '''
        Splits up words and makes a list of all words in the tweet
        '''
        tokenized_sentence = word_tokenize(sentence)
        return tokenized_sentence
            
    def remove_stopwords(self, sentence):
        '''Removes stopwords like 'a', 'the', 'and', etc.'''
        filtered_sentence = []
        for w in sentence:
            if w not in self.stopwords and len(w) > 1 and w[:2] != '//' and w != 'https': 
                filtered_sentence.append(w)
        return filtered_sentence
    
    def stem(self, sentence):
        '''
        Stems certain words to their root form.
        For example, words like 'computer', 'computation'
        all get trunacated to 'comput'
        '''
        return [self.stemmer.stem(word) for word in sentence]
    
    def join_to_string(self, sentence):
        '''
        Joins the tokenized words to one string.
        '''
        return ' '.join(sentence)
    
    def full_preprocess(self, n_rows=None):
        '''
        Preprocess a selected number of rows and
        connects them back to strings
        '''
        # If nothing is given do it for the whole dataset
        if n_rows == None:
            n_rows = len(self.data)
            
        # Perform preprocessing
        for i in range(n_rows):
            tweet = self.conversations[i]
            tokenized = self.tokenize(tweet)
            cleaned = self.remove_stopwords(tokenized)
            stemmed = self.stem(cleaned)
            joined = self.join_to_string(stemmed)
            self.preprocessed.append(joined)
        return self.get_data()
            
    def list_preprocess(self, n_rows=None):
        '''
        Preprocess everything but keep the
        tokenized version (list of words)
        '''
        # If nothing is given do it for the whole dataset
        if n_rows == None:
            n_rows = len(self.data)
            
        # Perform preprocessing
        for i in range(n_rows):
            tweet = self.conversations[i]
            tokenized = self.tokenize(tweet)
            cleaned = self.remove_stopwords(tokenized)
            stemmed = self.stem(cleaned)
            self.preprocessed.append(stemmed)
        return self.get_data()
    
    def get_data(self):
        '''
        Get the result of the preprocessing
        '''
        return self.preprocessed

In [73]:
preprocessor = PreProcessor(df, column_name='Conversations')
preprocessor.full_preprocess(100)
data = preprocessor.get_data()

In [54]:
#tweet = df['Conversations'][1]; tweet
# Tokenize words in string
#example = word_tokenize(tweet)
#print(example)

In [55]:
# Remove stop words and links
#stopWords = set(stopwords.words('english'))
#wordsFiltered = []
#for w in example:
#    if w not in stopWords and len(w) > 1 and w[:2] != '//' and w != 'https': 
#        wordsFiltered.append(w)
#print(wordsFiltered)

In [56]:
# Stem words
#stemmer = SnowballStemmer("english")
#wordsFilteredandStemmed = [stemmer.stem(word) for word in wordsFiltered]
#wordsFilteredandStemmed

## Sentiment Analysis

In [57]:
# Basic Analysis with textblob
#from textblob import TextBlob # Basic Tweet analysis
#tweet = ' '.join(wordsFilteredandStemmed)
#print(tweet)
#analysis = TextBlob(tweet)
#print(f'Polarity: {analysis.sentiment[0]}, Subjectivity: {analysis.sentiment[1]}')

In [58]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
neg_sent = []
neu_sent = []
pos_sent = []
for sentence in data:
    ss = sid.polarity_scores(sentence)
    neg_sent.append(ss['neg'])
    neu_sent.append(ss['neu'])
    pos_sent.append(ss['pos'])
    for k in sorted(ss):
        pass
        #print('{0}: {1}, '.format(k, ss[k]), end='')    


print(np.mean(pos_sent))
print(np.mean(neu_sent))
print(np.mean(neg_sent))

0.13278
0.8060699999999998
0.061189999999999994


In [59]:
#import pickle
#xgb_model = pickle.load(open('xgboost_1.dat', 'rb'))

In [83]:
df_tweets = pd.read_csv('tweets.csv')
preprocessor = PreProcessor(df_tweets, column_name='text')
df_tweets['cleaned_text'] = preprocessor.full_preprocess(len(df_tweets))
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_tweets['cleaned_text'])
X_test = vectorizer.transform(data)
print(f'Training size: {X_train.shape[0]} tweets\n\
Test size: {X_test.shape[0]} tweets\n\
Amount of words (columns): {X_train.shape[1]} words')
# Get training and validation data
y_train = df_tweets['airline_sentiment']

Training size: 14640 tweets
Test size: 100 tweets
Amount of words (columns): 11400 words


In [86]:
# Hyperparameters that you can tweak
# There are a lot more tweakable hyperparameters that you can find at 
# https://xgboost.readthedocs.io/en/latest/parameter.html
xgb_params = {'objective' : 'multi:softmax',
              'eval_metric' : 'mlogloss',
              'eta' : 0.1,
              'max_depth' : 6,
              'num_class' : 3,
              'lambda' : 0.8,
              'estimators' : 200,
              'seed' : 1234
              
}

# Transform categories into numbers
# negative = 0, neutral = 1 and positive = 2
target_train = y_train.astype('category').cat.codes

# Transform data into a matrix so that we can use XGBoost
d_train = xgb.DMatrix(X_train, label = target_train)

# Fit XGBoost
watchlist = [(d_train, 'train')]
bst = xgb.train(xgb_params, 
                d_train, 
                400,  
                watchlist,
                early_stopping_rounds = 50, 
                verbose_eval = 0)

In [92]:
d_test = xgb.DMatrix(X_test)
df_pred = pd.DataFrame({'tweet' : data, 
                        'xgboost_pred' : bst.predict(d_test)})

In [94]:
df_pred.head()

Unnamed: 0,tweet,xgboost_pred
0,hey british_airway easyjet flight venice-gatwi...,0.0
1,easyjet thank look luggag nice edinburgh fligh...,2.0
2,`` british_airway book via expedia 'm complain...,2.0
3,easyjet anyway travel cabin bag ryan_2807 hey ...,0.0
4,easyjet nice repli tweet thank easyjet `` west...,2.0
