In [1]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
path = os.getcwd()
# switch into the shared formatting directory so that the local `formats`
# module found there can be imported
os.chdir('../../notebook_format')
from formats import load_style
# NOTE(review): presumably applies the shared notebook styling; `formats` is
# not visible from this file, confirm what `load_style` returns/displays
load_style()

In [2]:
# return to the working directory that was saved in `path` before the
# notebook style was loaded, so relative data paths resolve from here
os.chdir(path)

import shutil
from datetime import datetime

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# `sklearn.grid_search` was removed in scikit-learn 0.20; prefer its
# replacement module and fall back to the old location on older installations
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Sentiment Analysis

Kaggle knowledge competition: movie review sentiment analysis. [homepage](https://www.kaggle.com/c/word2vec-nlp-tutorial)

In [3]:
# read the labeled training reviews and the unlabeled test reviews; both are
# tab-separated files looked up relative to the current working directory
train = pd.read_csv( 'labeledTrainData.tsv', delimiter = '\t' )
test  = pd.read_csv( 'testData.tsv', delimiter = '\t' )
# preview the first rows of the training set
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
# remove tags or markup (e.g. <br> </br>) and keep only the visible text
# the parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so the cleaned text
# could differ from one machine to another
train['review'] = train['review'].apply( lambda s: BeautifulSoup( s, 'html.parser' ).get_text() )
test['review']  = test['review'].apply(  lambda s: BeautifulSoup( s, 'html.parser' ).get_text() )

# extract the training and testing's data/label
# the test set only provides the reviews, there is no label column to extract
X_train = train['review']
y_train = train['sentiment']
X_test  = test['review']

## First Try 

Use the simplest approach: remove English stop words, build a bag-of-words representation, then fit naive Bayes and logistic regression (with the regularization strength `C` chosen by grid search).

In [6]:
# convert both sets' text column to document-term matrix
# the vectorizer is fitted on the training reviews only; the test reviews are
# then encoded with that same fitted vectorizer
vect1 = CountVectorizer( stop_words = 'english' )
X_train_dtm = vect1.fit_transform(X_train)
X_test_dtm  = vect1.transform(X_test)

In [None]:
# set up two popular models for text classification
# 1. multinomial naive bayes model
# 2. logistic regression, 10-fold cross validation to determine 
# the best regularization `C`

# candidate values of the regularization/error term for logistic regression
param_dict = { 'C': [ 0.001, 0.01, 0.1, 1, 10, 25 ] }
    
# wrap the logistic regression in a grid search over `param_dict`;
# candidates are compared by ROC AUC, n_jobs = -1 runs the fits in parallel
logreg = LogisticRegression()    
grid_logreg = GridSearchCV( 
    logreg, 
    param_grid = param_dict, 
    cv = 10,
    scoring = 'roc_auc',
    n_jobs = -1,
    verbose = 1
)
    
# naive bayes is used with its default parameters
nb = MultinomialNB()

# nothing is fitted here, the models are only collected for later training
models = [ nb, grid_logreg ]

In [8]:
def _model_name( model ):
    """
    Build the dictionary key for a fitted model: the class name, and for
    fitted search objects the class name of the best estimator followed by
    its best parameters, e.g. LogisticRegression-C_0.1
    """
    best_params = getattr( model, 'best_params_', None )
    best_model  = getattr( model, 'best_estimator_', None )

    # plain estimators (or searches that did not refit a best estimator)
    # are simply identified by their own class name
    if best_params is None or best_model is None:
        return model.__class__.__name__

    # sort the parameters so the generated name does not depend on the
    # iteration order of the dictionary
    tags = [ '{}_{}'.format( key, value ) for key, value in sorted( best_params.items() ) ]
    return '-'.join( [ best_model.__class__.__name__ ] + tags )


def train_models( models, X_train, y_train ):
    """
    Train a bunch of models
       
    TODO : add more models
    
    Parameters
    ----------
    models : list
        a list of specified models to train, every element must expose
        a scikit-learn style `.fit( X, y )` method
    
    X_train, y_train : numpy array
        training data and its label
        
    Returns
    -------
    models_dict : dictionary
        keys are the model's name, values are the corresponding trained models object.
        To be exact, if the model uses default parameters, then the key will only be 
        the model's name, but if the model uses grid/random 
        search to find the best possible parameter, then the best parameters will
        also be tagged along with the model's name. e.g. LogisticRegression-C_0.1,
        means the best parameter is C 0.1 for the logistic regression. Several
        tuned parameters are appended in alphabetical order, each separated by '-'.
        The value stored for a search is the fitted search object itself.
        Models that end up with the same name overwrite each other, the last one wins.
    """
    models_dict = {}
    for model in models:
        # for scikit learn model, it's simply .fit
        model.fit( X_train, y_train )

        # the name can only be built after fitting, since the best
        # parameters of a search are not known before that
        models_dict[ _model_name(model) ] = model

    return models_dict

In [9]:
# fit every model in `models` on the bag-of-words features, then display
# the resulting name -> fitted model dictionary
models_dict = train_models( models = models, X_train = X_train_dtm, y_train = y_train )
models_dict

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


In [10]:
def _safe_filename( name ):
    """
    Replace every character that is not a letter, digit, '-', '_' or '.'
    with '_' so that a model's name can be used as a file name on any OS
    """
    return ''.join( char if char.isalnum() or char in '-_.' else '_' for char in str(name) )


def predict_probabilities( models_dict, X_test, write_submission = False,
                           test_id = None, column_names = [ 'id', 'predictions' ] ):
    """
    For each model, predict the probability of the test set
    and store the result in one single dataframe. In the mean time, you
    can also specify whether to write the prediction result to .csv files
    
    TODO 1: make it work with models from other commonly used library
    TODO 2: extend it to multi-class problems
    
    Parameters
    ----------
    models_dict : dictionary
        dictionary of already fitted models, where the keys are the model's name
        value is the already trained model object, which must expose `predict_proba`
        
    X_test : np-array
        test data of the fitted models
        
    write_submission : boolean (default: False)
        if True, then each model's prediction will be written to a .csv file,
        all of which are stored in a 'submission' + timestamp folder 
        (formatted as YYYYmmdd_HHMMSS_microseconds; overwritten if the folder
        already exists). Each .csv file is named after the model (characters that
        are not portable in file names are replaced by '_') and will contain two 
        column, the first being the test set's id and the second being the 
        model's predicted probability
    
    test_id : np-array
        specify test set's id if write_submission is True
    
    column_names : list/tuple, default: [ 'id', 'predictions' ]
        column name for the submission, must contain exactly two names
        
    Returns
    -------
    predictions : DataFrame
        each column is the predicted probability of the test data for each model,
        taken from the second column of the model's `predict_proba` output

    Raises
    ------
    ValueError
        if write_submission is True and `test_id` is missing or 
        `column_names` does not contain exactly two names
    """
    
    if write_submission:
        # validate before anything is created on disk
        if test_id is None:
            raise ValueError( '`test_id` must be specified when `write_submission` is True' )
        if len(column_names) != 2:
            raise ValueError( '`column_names` must contain exactly two names' )

        # create a folder name 'submission' + timestamp to store the predicted result
        # it'll over-write the existing folder if it exists; the timestamp is 
        # formatted without spaces or colons so the name is valid on every OS
        folder = 'submission' + datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)
    
    predictions = np.zeros( ( X_test.shape[0], len(models_dict) ) )
    for index, ( name, model ) in enumerate( models_dict.items() ):
        # keep only the second probability column
        pred = model.predict_proba(X_test)[ :, 1 ]
        predictions[ :, index ] = pred
        
        if write_submission:
            # fix the column order explicitly before renaming, so the rename
            # does not depend on how the dictionary happens to be ordered
            output = pd.DataFrame( 
                { 'id': test_id, 'prediction': pred }, 
                columns = [ 'id', 'prediction' ] 
            )
            output.columns = list(column_names)
            output.to_csv( os.path.join( folder, _safe_filename(name) + '.csv' ), index = False )
    
    predictions = pd.DataFrame( predictions, columns = list( models_dict.keys() ) )
    return predictions

In [11]:
# NOTE(review): the two numbers below are presumably the scores obtained by
# submitting each model's file to the competition, confirm with the author
# logistic regression 0.94127
# naive bayes 0.89415

# if `write_submission` is False then you do not need to specify
# the `test_id` nor `column_names`
predictions = predict_probabilities( 
    models_dict = models_dict, 
    X_test = X_test_dtm,
    write_submission = True,
    test_id = test['id'],
    column_names = [ 'id', 'sentiment' ]
)
# preview the predicted probabilities, one column per model
predictions.head(3)

Unnamed: 0,MultinomialNB,LogisticRegression-C_0.1
0,1.0,0.98077
1,8.334912e-10,0.004839
2,0.03846241,0.62137


## Second Try

Include n-grams (1-grams and 2-grams, to be exact) and stemming (e.g. "running" is converted to "run"), and use cross-validation to determine the best regularization for logistic regression.

In [12]:
porter = PorterStemmer()
def tokenizer_porter(text):
    """
    Split `text` on whitespace and return the list of stemmed pieces,
    meant to be passed as the `tokenizer` argument of CountVectorizer 
    or TfidfVectorizer
    """
    return list( map( porter.stem, text.split() ) )

In [13]:
# bag of words over stemmed tokens, counting both single tokens and pairs of
# consecutive tokens; min_df = 2 drops terms found in fewer than 2 reviews
# NOTE(review): the stop word list is not stemmed while the tokens are, so
# some stop words may slip through, confirm whether this matters here
vect2 = CountVectorizer( 
    stop_words = 'english', 
    tokenizer = tokenizer_porter, 
    ngram_range = ( 1, 2 ),
    min_df = 2
)
# this replaces the document-term matrices of the first try
X_train_dtm = vect2.fit_transform(X_train)
X_test_dtm  = vect2.transform(X_test)

In [14]:
# refit the models on the new features; `models` has to be passed explicitly 
# since `train_models` has no default for it (omitting it raises a TypeError)
models_dict = train_models( models = models, X_train = X_train_dtm, y_train = y_train )

# logistic regression 0.94474
# naive bayes 0.92292
predictions = predict_probabilities( 
    models_dict = models_dict, 
    X_test = X_test_dtm,
    write_submission = True,
    test_id = test['id'],
    column_names = [ 'id', 'sentiment' ]
)
predictions.head(3)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.0min finished


Unnamed: 0,MultinomialNB,LogisticRegression-C_1
0,1.0,0.999129
1,6.183532e-13,0.005406
2,0.9991036,0.980235


## Third Try

Use tf-idf instead of bag of words.

In [15]:
# same tokenizer, n-gram range and min_df as the second try, but the terms
# are weighted by tf-idf instead of raw counts
tf_vect1 = TfidfVectorizer( 
    stop_words = 'english', 
    tokenizer = tokenizer_porter, 
    ngram_range = ( 1, 2 ),
    min_df = 2
)
# this replaces the document-term matrices of the second try
X_train_dtm = tf_vect1.fit_transform(X_train)
X_test_dtm  = tf_vect1.transform(X_test)

In [16]:
# refit the models on the tf-idf features; `models` has to be passed explicitly 
# since `train_models` has no default for it (omitting it raises a TypeError)
models_dict = train_models( models = models, X_train = X_train_dtm, y_train = y_train )

# logistic regression 0.95386
# naive bayes 0.93842
predictions = predict_probabilities( 
    models_dict = models_dict, 
    X_test = X_test_dtm,
    write_submission = True,
    test_id = test['id'],
    column_names = [ 'id', 'sentiment' ]
)
predictions.head(3)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   33.3s finished


Unnamed: 0,MultinomialNB,LogisticRegression-C_25
0,0.86418,0.996484
1,0.208051,0.037363
2,0.605011,0.902219


In [None]:
# TODO: stacking?

In [24]:
# manually weighted average : 0.95501
# blend the models' probabilities row by row; the weights follow the column
# order of `predictions` (0.4 for the first column, 0.6 for the second)
pred = np.average( predictions.values, axis = 1, weights = ( 0.4, 0.6 ) )
# write the blended probabilities with the same id/sentiment columns that
# were used for the per-model submissions
output = pd.DataFrame({ 'id': test['id'], 'sentiment': pred })
output.to_csv( 'test.csv', index = False )

In [18]:
# from sklearn import metrics
# from sklearn.cross_validation import train_test_split

## Reference

- [Kaggle Use Google's Word2Vec for movie reviews](https://www.kaggle.com/c/word2vec-nlp-tutorial)
- [Blog on Natural Language Processing in a Kaggle Competition for Movie Reviews](https://jessesw.com/NLP-Movie-Reviews/)

- [Kaggle forum: post-competition solutions](https://www.kaggle.com/c/word2vec-nlp-tutorial/forums/t/14966/post-competition-solutions)