In [1]:
# Load the shared notebook styling. `formats` is imported only after the
# working directory has been changed to ../../notebook_format, so the order
# of the statements in this cell matters.
import os

# path : remember the current working directory so the next cell can switch
# back to it with os.chdir(path)
path = os.getcwd()
os.chdir('../../notebook_format')
from formats import load_style
load_style()

In [2]:
# switch back to the working directory that was saved in `path` before the
# styling cell changed into ../../notebook_format
os.chdir(path)

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Sentiment Analysis

Kaggle knowledge competition: movie review sentiment analysis. [homepage](https://www.kaggle.com/c/word2vec-nlp-tutorial)

In [3]:
# read the tab-separated training and test files, then preview the first
# rows of the training set
train = pd.read_csv('labeledTrainData.tsv', sep='\t')
test = pd.read_csv('testData.tsv', sep='\t')
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [72]:
# remove tags or markup (e.g. <br />) from the raw review text. The parser is
# named explicitly: without it BeautifulSoup picks whichever parser happens to
# be installed (lxml, html5lib, ...) and warns that no parser was specified,
# so the cleaned text could differ between machines. 'html.parser' ships with
# Python, so no extra dependency is needed.
train['review'] = train['review'].apply( lambda s: BeautifulSoup( s, 'html.parser' ).get_text() )
test['review']  = test['review'].apply( lambda s: BeautifulSoup( s, 'html.parser' ).get_text() )

# extract the training and testing's data/label
X_train = train['review']
y_train = train['sentiment']
X_test  = test['review']

## First Try 

Use the simplest approach: remove English stop words, build a bag-of-words representation, and fit logistic regression and naive Bayes models with their default settings.

In [5]:
# bag of words: the vocabulary is learned from the training reviews only and
# then reused to encode the test reviews into a document-term matrix
vect1 = CountVectorizer(stop_words='english')
X_train_dtm = vect1.fit_transform(X_train)
X_test_dtm = vect1.transform(X_test)

# fit both baseline classifiers with their default settings; the final
# expression is kept as a bare fit call so the cell still displays the
# fitted logistic regression
nb = MultinomialNB().fit(X_train_dtm, y_train)
logreg = LogisticRegression()
logreg.fit(X_train_dtm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# naive bayes 0.89415
# keep the probability of the positive class (column 1) and pair it with
# the test ids to form the submission file
pred_nb = nb.predict_proba(X_test_dtm)[:, 1]
output = test[['id']].assign(sentiment=pred_nb)
output.to_csv('nb.csv', index=False)

# logistic regression 0.92943
pred_logreg = logreg.predict_proba(X_test_dtm)[:, 1]
output = test[['id']].assign(sentiment=pred_logreg)
output.to_csv('logreg.csv', index=False)

## Second Try

Add n-grams (unigrams and bigrams, to be exact) and stemming (e.g. "running" is converted to "run"), and use cross validation to choose the best regularization strength for logistic regression.

In [7]:
porter = PorterStemmer()

def tokenizer_porter(text):
    """
    Split `text` on whitespace and Porter-stem every resulting token;
    intended to be passed as the `tokenizer` argument of
    CountVectorizer / TfidfVectorizer

    Parameters
    ----------
    text : str
        a single document

    Returns
    -------
    stemmed : list of str
        the stemmed tokens, in their original order
    """
    return list(map(porter.stem, text.split()))

In [8]:
def create_model( X_train, y_train, C_grid = ( 0.001, 0.01, 0.1, 1, 10, 25 ),
                  cv = 10, scoring = 'roc_auc', n_jobs = -1, verbose = 1 ):
    """
    Pass in the training data and its label, train a logistic regression and
    naive bayes model, where cross validation is used to determine the best
    regularization for logistic regression. The naive bayes model is fitted
    once with its default settings.

    TODO: this should probably return a dictionary or a list or a namedtuple
    of trained models

    Parameters
    ----------
    X_train :
        training features (e.g. a document-term matrix)

    y_train :
        training labels

    C_grid : iterable of numbers, default ( 0.001, 0.01, 0.1, 1, 10, 25 )
        candidate values of logistic regression's `C` tried by the grid search

    cv : default 10
        forwarded to GridSearchCV's `cv` argument

    scoring : default 'roc_auc'
        forwarded to GridSearchCV's `scoring` argument

    n_jobs : int, default -1
        forwarded to GridSearchCV's `n_jobs` argument

    verbose : int, default 1
        forwarded to GridSearchCV's `verbose` argument

    Returns
    -------
    grid_logreg : GridSearchCV
        the fitted grid search wrapped around LogisticRegression

    nb : MultinomialNB
        the fitted naive bayes model
    """
    # materialize the candidates as a list so any iterable can be passed in
    param_grid = { 'C': list(C_grid) }
    grid_logreg = GridSearchCV(
        LogisticRegression(),
        param_grid = param_grid,
        cv = cv,
        scoring = scoring,
        n_jobs = n_jobs,
        verbose = verbose
    )
    grid_logreg.fit( X_train, y_train )

    nb = MultinomialNB()
    nb.fit( X_train, y_train )

    return grid_logreg, nb

In [9]:
# second-try features: bag of words over stemmed tokens (tokenizer_porter),
# using both 1-grams and 2-grams ( ngram_range = ( 1, 2 ) ) and a
# document-frequency cutoff of min_df = 2
# NOTE(review): the built-in english stop word list is combined with a
# stemming tokenizer here; stemmed tokens may no longer match the unstemmed
# stop word list -- confirm the stop words are filtered as intended
vect2 = CountVectorizer( 
    stop_words = 'english', 
    tokenizer = tokenizer_porter, 
    ngram_range = ( 1, 2 ),
    min_df = 2
)
# fit the vocabulary on the training text only, then reuse it for the test text
X_train_dtm = vect2.fit_transform(X_train)
X_test_dtm  = vect2.transform(X_test)

In [10]:
# second try: grid-searched logistic regression and naive bayes, both fitted
# on the current X_train_dtm (the stemmed 1/2-gram count matrix in cell order)
grid_logreg1, nb1 = create_model( X_train = X_train_dtm, y_train = y_train )

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.8min finished


In [11]:
# best cross-validated score found by the grid search and the parameters
# that produced it
print(grid_logreg1.best_score_)
print(grid_logreg1.best_params_)

# 0.94474
# positive-class probability from the best logistic regression
pred_logreg = grid_logreg1.best_estimator_.predict_proba(X_test_dtm)[:, 1]
output = test[['id']].assign(sentiment=pred_logreg)
output.to_csv('logreg.csv', index=False)

# 0.92292
# positive-class probability from naive bayes
pred2 = nb1.predict_proba(X_test_dtm)[:, 1]
output = test[['id']].assign(sentiment=pred2)
output.to_csv('nb.csv', index=False)

0.948118976
{'C': 1}


## Third Try

Use tf-idf instead of bag of words.

In [12]:
# third-try features: same tokenizer, n-gram range and min_df as the second
# try's CountVectorizer, but weighted with tf-idf instead of raw counts
# NOTE(review): the english stop word list is again combined with the
# stemming tokenizer -- confirm stemmed stop words are filtered as intended
tf_vect1 = TfidfVectorizer( 
    stop_words = 'english', 
    tokenizer = tokenizer_porter, 
    ngram_range = ( 1, 2 ),
    min_df = 2
)
# fit on the training text only, then reuse the fitted vectorizer for the test text
X_train_dtm = tf_vect1.fit_transform(X_train)
X_test_dtm  = tf_vect1.transform(X_test)

In [13]:
# third try: grid-searched logistic regression and naive bayes, both fitted
# on the current X_train_dtm (the tf-idf matrix in cell order)
grid_logreg2, nb2 = create_model( X_train = X_train_dtm, y_train = y_train )

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   32.1s finished


In [14]:
# best cross-validated score found by the grid search and the parameters
# that produced it
print(grid_logreg2.best_score_)
print(grid_logreg2.best_params_)

# 0.95386
# positive-class probability from the best logistic regression
pred1 = grid_logreg2.best_estimator_.predict_proba(X_test_dtm)[:, 1]
output = test[['id']].assign(sentiment=pred1)
output.to_csv('logreg.csv', index=False)

# 0.93842
# positive-class probability from naive bayes
pred2 = nb2.predict_proba(X_test_dtm)[:, 1]
output = test[['id']].assign(sentiment=pred2)
output.to_csv('nb.csv', index=False)

0.958878528
{'C': 25}


In [100]:
def predict_probabilities( models, X_test ):
    """
    For each model, predict the probability of the test set
    and store the result in one single dataframe
    TODO : make it work with models from other commonly used library

    Parameters
    ----------
    models : list
        already fitted models. Each one must expose `predict_proba`; a
        hyper-parameter search object (anything exposing both
        `best_estimator_` and `best_params_`, e.g. GridSearchCV or
        RandomizedSearchCV) is unwrapped to its best estimator

    X_test :
        test data of the fitted models, must expose `.shape`

    Returns
    -------
    predictions : DataFrame
        each column is the predicted probability (second column of
        `predict_proba`) of the test data for each model. Plain models are
        named after their class; search objects are named
        '<BestEstimatorClass>_<param>-<value>_...' with the parameters
        listed in sorted order
    """
    col_names = []
    predictions = np.zeros( ( X_test.shape[0], len(models) ) )

    for index, model in enumerate(models):

        # detect search objects by the attributes that are actually used
        # rather than by class name, so every search class that follows the
        # same interface is handled, not only the two named ones
        if hasattr( model, 'best_estimator_' ) and hasattr( model, 'best_params_' ):
            estimator = model.best_estimator_

            # sort the parameters so the column name does not depend on
            # the dictionary's iteration order
            param_strings = '_'.join(
                str(key) + '-' + str(value)
                for key, value in sorted( model.best_params_.items() )
            )

            # concatenate the model's name and the best parameter
            col_name = estimator.__class__.__name__ + '_' + param_strings
        else:
            estimator = model
            col_name = model.__class__.__name__

        # column 1 of predict_proba is kept as the model's score
        predictions[ :, index ] = estimator.predict_proba(X_test)[ :, 1 ]
        col_names.append(col_name)

    return pd.DataFrame( predictions, columns = col_names )

In [16]:
# collect the test-set probabilities of the third-try models in one frame;
# the columns follow the order of `models` (logistic regression, naive bayes)
models = [ grid_logreg2, nb2 ]
predictions = predict_probabilities( models = models, X_test = X_test_dtm )

In [17]:
# manually weighted average : 0.95508
# the weights line up with the columns of `predictions`: 0.7 for the first
# column and 0.3 for the second
pred = np.average(predictions.values, weights=(0.7, 0.3), axis=1)
output = test[['id']].assign(sentiment=pred)
output.to_csv('test.csv', index=False)

## Adding More Data

In [90]:
tweets = pd.read_csv('Tweets.csv')

# retain only the positive and negative text, and rename the columns to match
# the original data. Selecting rows and columns in a single .loc call and
# chaining rename / reset_index yields a fresh frame, which avoids the
# chained-indexing copies (and the SettingWithCopyWarning they can trigger on
# the assignment below) as well as the inplace mutation
sub_tweets = (
    tweets
    .loc[ tweets['airline_sentiment'].isin( [ 'positive', 'negative' ] ),
          [ 'text', 'airline_sentiment' ] ]
    .rename( columns = { 'text': 'review', 'airline_sentiment': 'sentiment' } )
    .reset_index( drop = True )
)

# change the output label to match the original data ( 1 = positive, 0 = negative )
sub_tweets['sentiment'] = sub_tweets['sentiment'].map({ 'positive': 1, 'negative': 0 })

In [91]:
import re

def remove_hashtag( text, prefixes = ( '@', ) ):
    """
    Drop every whitespace-separated token of `text` that starts with one of
    `prefixes` and join the remaining tokens back with single spaces

    NOTE : despite the function's name, the default only removes tokens
    starting with '@' (user mentions); '#' hashtags are kept. Pass
    prefixes = ( '@', '#' ) to remove both.

    Parameters
    ----------
    text : str
        the raw text

    prefixes : iterable of str, default ( '@', )
        tokens starting with any of these strings are removed

    Returns
    -------
    clean_text : str
        the remaining tokens separated by single spaces (runs of
        whitespace in the input are therefore collapsed)
    """
    # str.startswith only accepts a str or a tuple of str
    prefixes = tuple(prefixes)
    return ' '.join( token for token in text.split() if not token.startswith(prefixes) )

In [92]:
# clean every tweet with remove_hashtag (which drops the tokens that start
# with '@')
sub_tweets['review'] = sub_tweets['review'].apply(remove_hashtag)

In [93]:
# stack the airline tweets underneath the movie reviews to enlarge the
# training set. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; ignore_index gives the combined data
# a clean 0..n-1 index instead of repeating the row labels of both sources
X_train = pd.concat( [ train['review'], sub_tweets['review'] ], ignore_index = True )
y_train = pd.concat( [ train['sentiment'], sub_tweets['sentiment'] ], ignore_index = True )
X_test  = test['review']

In [94]:
# refit the tf-idf vectorizer on the enlarged training text (reviews plus
# tweets); this replaces what tf_vect1 learned from the reviews alone
X_train_dtm = tf_vect1.fit_transform(X_train)
X_test_dtm  = tf_vect1.transform(X_test)

In [97]:
# same two models as before, now fitted on the current X_train_dtm / y_train
# (reviews plus tweets in cell order)
grid_logreg3, nb3 = create_model( X_train = X_train_dtm, y_train = y_train )

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   32.6s finished


In [98]:
# collect the test-set probabilities of the models trained on the enlarged
# data; the columns follow the order of `models` (logistic regression, naive bayes)
models = [ grid_logreg3, nb3 ]
predictions = predict_probabilities( models = models, X_test = X_test_dtm )

In [99]:
# positive-class probability from the best logistic regression of this section
# NOTE(review): no score is recorded for this submission
pred1 = grid_logreg3.best_estimator_.predict_proba(X_test_dtm)[ :, 1 ]
output = pd.DataFrame({ 'id': test['id'], 'sentiment': pred1 })
output.to_csv( 'logreg.csv', index = False )

# 0.93842
# NOTE(review): this is the same number recorded next to nb2 in the third
# try -- possibly a copy-paste leftover; confirm it is really nb3's score
pred2 = nb3.predict_proba(X_test_dtm)[ :, 1 ]
output = pd.DataFrame({ 'id': test['id'], 'sentiment': pred2 })
output.to_csv( 'nb.csv', index = False )

In [None]:
# weighted average of the columns of `predictions`: 0.7 for the first
# column and 0.3 for the second
pred = np.average(predictions.values, weights=(0.7, 0.3), axis=1)
output = test[['id']].assign(sentiment=pred)
output.to_csv('test.csv', index=False)

In [19]:
# from sklearn import metrics
# from sklearn.cross_validation import train_test_split

## Reference

- [Kaggle Use Google's Word2Vec for movie reviews](https://www.kaggle.com/c/word2vec-nlp-tutorial)
- [Blog on Natural Language Processing in a Kaggle Competition for Movie Reviews](https://jessesw.com/NLP-Movie-Reviews/)

https://www.kaggle.com/c/word2vec-nlp-tutorial/forums/t/14966/post-competition-solutions

https://github.com/mesnilgr/nbsvm

https://github.com/vivekn/sentiment

https://github.com/vsl9/Sentiment-Analysis-with-Convolutional-Networks