# Sentiment analysis Twitter Airline dataset with SVM optimized model 

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('twitter-airline-sentiment.csv')
dataset

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0000,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0000,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.6340,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
8,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0000,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


Split dataset in positive - negative - neutral

In [3]:
positive = dataset.query('airline_sentiment == "positive"')['text']
negative = dataset.query('airline_sentiment == "negative"')['text']
neutral = dataset.query('airline_sentiment == "neutral"')['text']

Statistics

In [4]:
print('# Positives: ' + str((len(positive))))
print('# Negatives: ' + str((len(negative))))
print('# Neutral: ' + str((len(neutral))))

# Positives: 2363
# Negatives: 9178
# Neutral: 3099


Very unbalanced classes. take the class with less samples (positives) and makes it the "benchmark", then split into train and test data

In [5]:
np_positive = positive.values
np_negative = negative.values
np_neutral = neutral.values

# shuffle
np.random.seed(10)
np.random.shuffle(np_positive)
np.random.shuffle(np_negative)
np.random.shuffle(np_neutral)

# take the same number of sentences
np_positive = np_positive[:min(len(np_positive), len(np_negative), len(np_neutral))]
np_negative = np_negative[:min(len(np_positive), len(np_negative), len(np_neutral))]
np_neutral = np_neutral[:min(len(np_positive), len(np_negative), len(np_neutral))]

Preprocessing. Just TF-IDF with word representation

In [6]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet

def preprocess_word(word):
    # remove punctation
    word = word.strip('\'"?!,.():;*')
    # more than 3 letter repetition removed
    word = re.sub(r'(.)\1\1+', r'\1\1\1', word)
    # remove - & '
    word = word.strip('-&\'')
    return word

def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

def preprocess_tweet(tweet, use_stemmer=False, use_lemmatizer=False):
    # convert tweet to lowercase
    tweet = tweet.lower()
    # replace urls with 'URL'
    tweet = re.sub(r'((www.[\S]+)|(https?://.[\S]+))', 'URL', tweet)
    # replace user mentions @user with 'USER_MENTION'
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # replace #hashtag with hastag
    tweet = re.sub(r'#(\S+)', r' \1', tweet)
    # remove retweet RT
    tweet = re.sub(r'\brt\b', '', tweet)
    # replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # remove space, " and ' 
    tweet.strip('" \'')
    # handle emojis. Use only EMO_POS and EMO_NEG
    tweet = handle_emojis(tweet)
    # replace multiple spaces with only one space
    tweet = re.sub(r'\s+', ' ', tweet)
    # preprocess words
    words = tweet.split()

    processed_words = []
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                # use stemmer
                word = str(porter_stemmer.stem(word))
            elif use_lemmatizer:
                word = str(wordnet_lemmatizer.lemmatize(word))
            processed_words.append(word)
    return ' '.join(processed_words)

Example of preprocessing

In [7]:
print('######## POSITIVE ########')
print(np_positive[0])
print(preprocess_tweet(np_positive[0]))
print('######## NEGATIVE ########')
print(np_negative[0])
print(preprocess_tweet(np_negative[0]))
print('######## NEUTRAL ########')
print(np_neutral[0])
print(preprocess_tweet(np_neutral[0]))

######## POSITIVE ########
@VirginAmerica Thanks for a great flight from LA to Boston! Pilots did a great job landing in the snow. Can we go back to LA now? #seriously
USER_MENTION thanks for a great flight from la to boston pilots did a great job landing in the snow can we go back to la now seriously
######## NEGATIVE ########
@USAirways and @FlyKnoxville - bad weather should not be an excuse for not following established luggage procedures. #NoClothesNoInfo
USER_MENTION and USER_MENTION bad weather should not be an excuse for not following established luggage procedures noclothesnoinfo
######## NEUTRAL ########
@united Is it possible to redeem miles for one part of a round trip itinerary without having to book two separate reservations?
USER_MENTION is it possible to redeem miles for one part of a round trip itinerary without having to book two separate reservations


Preprocess the whole dataset

In [8]:
# positive
np_positive_preprocess = []
for tweet in np_positive:
    np_positive_preprocess.append(preprocess_tweet(tweet))
# negative
np_negative_preprocess = []
for tweet in np_negative:
    np_negative_preprocess.append(preprocess_tweet(tweet))
# neutral
np_neutral_preprocess = []
for tweet in np_neutral:
    np_neutral_preprocess.append(preprocess_tweet(tweet))

Feature vectors

In [131]:
from nltk import FreqDist
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer

class FeatureVectorGenerator():
    
    def __init__(self, max_words, max_bigrams):
        self.all_words = []
        self.all_bigrams = []
        self.MAX_WORDS = max_words
        self.MAX_BIGRAMS = max_bigrams
        self.VOCAB_SIZE = self.MAX_WORDS + self.MAX_BIGRAMS

    def get_features(self, tweet, words_set, bigrams_set):
        uni_feat = []
        big_feat = []
        words = tweet.split()
        for i in range(len(words)-1):
            if words[i] in words_set:
                uni_feat.append(words[i])
            if (words[i], words[i+1]) in bigrams_set:
                big_feat.append((words[i], words[i+1]))
        if len(words) >= 1 and words[len(words)-1] in words_set:
            uni_feat.append(words[len(words)-1])
        return uni_feat, big_feat

    def extract_features(self, tweets, test=False):
        features = lil_matrix((len(tweets), self.VOCAB_SIZE))
        for j, tweet in enumerate(tweets):
            unigrams, bigrams = self.get_features(tweet, self.all_words_set, self.all_bigrams_set)
            for word in unigrams:
                idx = self.dict_word.get(word)
                if idx:
                    features[j, idx] += 1
            for bigram in bigrams:
                idx = self.dict_bigram.get(bigram)
                if idx:
                    features[j, self.MAX_WORDS + idx] += 1
        # apply tf-idf
        if not test:
            transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
            self.tfidf = transformer.fit(features)
        features = self.tfidf.transform(features)
        return features
    
    def fit(self, train_preprocessed_tweets):
        # Count the words and calculate the frequency distribution
        for tweet in train_preprocessed_tweets:
            split = tweet.split()
            for i in range(len(split)-1):
                w = split[i]
                b = (split[i], split[i+1])
                self.all_words.append(w)
                self.all_bigrams.append(b)
            if len(split) >= 1:
                self.all_words.append(split[len(split)-1])
        # Different words
        self.all_words_set = set(self.all_words)
        self.all_bigrams_set = set(self.all_bigrams)
        # Some counts
        print('Number total words: ' + str(len(self.all_words)))
        print('Number total bigrams: ' + str(len(self.all_bigrams)))
        print('Number different words: ' + str(len(self.all_words_set)))
        print('Number different bigrams: ' + str(len(self.all_bigrams_set)))
        # Calculate frequencies
        self.word_freq_dist = FreqDist(self.all_words)
        self.bigram_freq_dist = FreqDist(self.all_bigrams)
        # take the most common words and bigrams
        self.dict_word = {x[0] : i for i, x in enumerate(self.word_freq_dist.most_common(self.MAX_WORDS))}
        self.dict_bigram = {x[0] : i for i, x in enumerate(self.bigram_freq_dist.most_common(self.MAX_BIGRAMS))}

Split datset into train and test

In [132]:
# 0 = neutral,  1 = negative,  2 = positive
TRAIN_SIZE = 2000
train_x = []
train_y = []
test_x = []
test_y = []
train_x.extend(np_positive_preprocess[:TRAIN_SIZE])
train_x.extend(np_negative_preprocess[:TRAIN_SIZE])
train_x.extend(np_neutral_preprocess[:TRAIN_SIZE])
test_x.extend(np_positive_preprocess[TRAIN_SIZE:])
test_x.extend(np_negative_preprocess[TRAIN_SIZE:])
test_x.extend(np_neutral_preprocess[TRAIN_SIZE:])
train_y.extend(np.ones(TRAIN_SIZE) * 2)
train_y.extend(np.ones(TRAIN_SIZE) * 1)
train_y.extend(np.ones(TRAIN_SIZE) * 0)
test_y.extend(np.ones(len(np_positive_preprocess) - TRAIN_SIZE) * 2)
test_y.extend(np.ones(len(np_negative_preprocess) - TRAIN_SIZE) * 2)
test_y.extend(np.ones(len(np_neutral_preprocess) - TRAIN_SIZE) * 2)
# Shuffle train and test datasets
np.random.seed(10)
np.random.shuffle(train_x)
np.random.seed(10)
np.random.shuffle(train_y)
np.random.seed(20)
np.random.shuffle(test_x)
np.random.seed(20)
np.random.shuffle(test_y)

In [139]:
fvg = FeatureVectorGenerator(max_words = 3000, max_bigrams = 4000)
fvg.fit(train_x)
f_train = fvg.extract_features(train_x, test=False)
f_test = fvg.extract_features(test_x, test=True)

Number total words: 90751
Number total bigrams: 84751
Number different words: 7142
Number different bigrams: 42311


In [141]:
print(f_train)
print(f_test)

  (0, 5334)	0.3025114409688067
  (0, 4839)	0.29569404950915074
  (0, 3578)	0.2640117772712248
  (0, 3447)	0.25675187022585827
  (0, 3259)	0.24544894439217405
  (0, 3059)	0.20277809557479476
  (0, 3014)	0.17927307849634247
  (0, 3011)	0.17131906060042792
  (0, 1625)	0.3025114409688067
  (0, 1009)	0.2805328452400966
  (0, 679)	0.2614319868113187
  (0, 413)	0.2380935273827909
  (0, 388)	0.23678138817009337
  (0, 315)	0.22974971457339274
  (0, 258)	0.21716830076424481
  (0, 77)	0.17176957364752768
  (0, 10)	0.10409811929094685
  (0, 8)	0.09904023541401964
  (0, 7)	0.09677716385555829
  (0, 2)	0.08085118537008555
  (0, 1)	0.07254324796189825
  (1, 6215)	0.3101435884147759
  (1, 5335)	0.30181887775180516
  (1, 3448)	0.2561640679905269
  (1, 3347)	0.2501010513478471
  :	:
  (5998, 5)	0.12607081094648673
  (5998, 4)	0.1221503327239067
  (5999, 4546)	0.2679228546479589
  (5999, 4355)	0.2633088273914603
  (5999, 3171)	0.2123105357043957
  (5999, 3153)	0.20857133685954105
  (5999, 3074)	0.1909203

Train SVM model

In [219]:
from sklearn import svm
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from time import time
import pprint
from scipy.stats import randint
from scipy.stats import uniform
from skopt.space import Real, Categorical, Integer

In [185]:
def report_perf(optimizer, X, y, title):
    """
    A wrapper for measuring time and performances of different optmizers
    
    optimizer = a sklearn or a skopt optimizer
    X = the training set 
    y = our target
    title = a string label for the experiment
    """
    start = time()
    optimizer.fit(X, y)
    best_score = optimizer.best_score_
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           +u"\u00B1"+" %.3f") % (time() - start, 
                                  len(optimizer.cv_results_['params']),
                                  best_score,
                                  best_score_std))    
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [178]:
# naive svm classifier
clf_svm = svm.SVC(C=0.1, max_iter=1000)
clf_svm.fit(f_train, train_y)



SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [179]:
# evaluate naive model
svm_prediction = clf_svm.predict(f_test)
accuracy_score(test_y, svm_prediction)

0.6593204775022957

Now try to improve the model estimating hyperparameters

In [201]:
# Converting average precision score into a scorer suitable for model selection
acc_scr = make_scorer(accuracy_score, greater_is_better=True, needs_proba=True)

In [202]:
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [203]:
clf_svm = svm.SVC(max_iter=10000, probability=True)

Grid Search

In [206]:
# GridSearchCV needs a predefined plan of the experiments
grid_search = GridSearchCV(clf_svm, 
                           param_grid={"C": [0.1, 1, 10],
                                       "kernel": ["linear", "poly", "rbf", "sigmoid"],
                                       "degree": [2, 3, 5],
                                       "decision_function_shape": ["ovo", "ovr"]
                                       },
                           n_jobs=-1,
                           cv=skf,
                           scoring='accuracy',
                           iid=False, # just return the average score across folds
                           return_train_score=False)

best_params = report_perf(grid_search, f_train, train_y, 'GridSearchCV')

GridSearchCV took 102.64 seconds,  candidates checked: 1, best CV score: 0.691 ± 0.018
Best parameters:
{'C': 0.1, 'decision_function_shape': 'ovo', 'degree': 2, 'kernel': 'linear'}



Random Search

In [213]:
# RandomizedSearchCV needs the distribution of the experiments to be tested
# If you can provide the right distribution, the sampling will lead to faster and better results.
random_search = RandomizedSearchCV(clf_svm, 
                                   param_distributions={"C": uniform(0.01, 10),
                                                        "kernel": ["linear", "poly", "rbf", "sigmoid"],
                                                        "degree": [2, 3, 5],
                                                        "decision_function_shape": ["ovo", "ovr"]
                                                       },
                                   n_iter=10,
                                   n_jobs=-1,
                                   cv=skf,
                                   scoring='accuracy',
                                   iid=False, # just return the average score across folds
                                   return_train_score=False,
                                   random_state=0)

best_params = report_perf(random_search, f_train, train_y, 'RandomizedSearchCV')



RandomizedSearchCV took 634.18 seconds,  candidates checked: 10, best CV score: 0.720 ± 0.012
Best parameters:
{'C': 6.468941130666561,
 'decision_function_shape': 'ovo',
 'degree': 5,
 'kernel': 'linear'}





Bayesian Search

In [222]:
search_spaces = {"C": Real(0.01, 1.0),
                 "kernel": ["linear", "poly", "rbf", "sigmoid"],
                 "degree": [2, 3, 5],
                 "decision_function_shape": ["ovo", "ovr"]
                }

for baseEstimator in ['GP', 'RF', 'ET', 'GBRT']:
    opt = BayesSearchCV(clf_svm,
                        search_spaces,
                        scoring='accuracy',
                        cv=skf,
                        n_iter=10,
                        n_jobs=-1,
                        return_train_score=False,
                        optimizer_kwargs={'base_estimator': baseEstimator},
                        random_state=4)
    
    best_params = report_perf(opt, f_train, train_y,'BayesSearchCV_'+baseEstimator)

KeyboardInterrupt: 