# Sentiment analysis

## Links 
https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py

In [2]:
import pathlib
import numpy as np

import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from pandas.api.types import CategoricalDtype

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Get the dataset

Display the directory containing the movie reviews data (one sub-folder per sentiment)

In [3]:
!ls data/movie_reviews/txt_sentoken/

neg  pos


Set up the path to the movie reviews data folder

In [4]:
# Relative location of the corpus: one sub-folder per sentiment class lives under it.
movie_reviews_data_folder = pathlib.Path('data', 'movie_reviews', 'txt_sentoken')

Load the dataset in the scikit format (parent directory with each category in child directory)

In [5]:
# Read every review under the data folder; each child directory (neg, pos)
# becomes one target category of the resulting dataset.
dataset = load_files(container_path=movie_reviews_data_folder)

Display information about the dataset

In [6]:
# Summarise the corpus: how many documents were loaded and which sentiment labels exist.
dataset_summary = {
    "number_of_documents": len(dataset.data),
    "sentiment": dataset.target_names,
}
pd.Series(dataset_summary)

number_of_documents          2000
sentiment              [neg, pos]
dtype: object

Display the number of documents for each sentiment

In [8]:
# Map each integer target code to its sentiment name (position in target_names -> name).
sentiment_map = dict(enumerate(dataset.target_names))
# Relabel the integer targets with the sentiment names, then count documents per label.
sentiment_labels = pd.Categorical(dataset.target).rename_categories(sentiment_map)
sentiment_labels.value_counts()

neg    1000
pos    1000
dtype: int64

Display an example of a negative movie review

In [9]:
# Position of the first document whose target code is 0 (the 'neg' category).
neg_idx = np.flatnonzero(dataset.target == 0)[0]
# Documents are stored as bytes: decode, then keep only the first 200 characters.
neg_excerpt = dataset.data[neg_idx].decode('ascii')[:200]
print(neg_excerpt)

arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . 
it's hard seeing arnold as mr .


Display an example of a positive movie review

In [10]:
# Position of the first document whose target code is 1 (the 'pos' category).
pos_idx = np.flatnonzero(dataset.target == 1)[0]
# Documents are stored as bytes: decode, then keep only the first 200 characters.
pos_excerpt = dataset.data[pos_idx].decode('ascii')[:200]
print(pos_excerpt)

good films are hard to find these days . 
great films are beyond rare . 
proof of life , russell crowe's one-two punch of a deft kidnap and rescue thriller , is one of those rare gems . 
a taut drama 


Split the data into training and test set

In [11]:
# Hold out 25% of the documents for evaluation and train on the remaining 75%.
# A fixed random_state makes the shuffle deterministic, so the split (and every
# metric computed from it) is identical after "Restart Kernel -> Run All".
# NOTE: outputs recorded with the previous unseeded split (random_state=None)
# will differ slightly from what a re-run with this seed produces.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=42)

## Create the text features

Use a tf-idf vectorizer that drops terms that are too rare or too common

In [12]:
# Turn raw documents into tf-idf weighted term features, pruning the vocabulary
# at both ends of the document-frequency range.
vectorizer = TfidfVectorizer(
    min_df=3,     # ignore terms found in fewer than 3 documents (too rare)
    max_df=0.95,  # ignore terms found in more than 95% of documents (too common)
)

Create a classifier pipeline

In [14]:
# Linear support-vector classifier applied to the tf-idf features.
classifier = LinearSVC(C=1000)
# Chain feature extraction and classification; the step names ('vect', 'clf')
# are the prefixes used to address each step's parameters (e.g. 'vect__...').
pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', classifier),
])

Set up the grid search and fit it on the training set

In [31]:
# Candidate settings to compare: unigrams only versus unigrams plus bigrams.
# The 'vect__' prefix addresses the 'vect' step of the pipeline.
parameters = dict(vect__ngram_range=[(1, 1), (1, 2)])
# Cross-validated search over the candidates, scored by accuracy, in a single process.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    n_jobs=1,
)
# Fit on the training documents only; the test set stays untouched for evaluation.
grid_search.fit(docs_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

Predict the sentiment of the documents in the test dataset

In [32]:
# Predict a sentiment code for every held-out test document with the fitted search.
y_predicted = grid_search.predict(X=docs_test)

## Print the classification report

In [33]:
# Per-sentiment precision / recall / f1 on the test set, labelled with the category names.
report = metrics.classification_report(
    y_true=y_test,
    y_pred=y_predicted,
    target_names=dataset.target_names,
)
print(report)

             precision    recall  f1-score   support

        neg       0.92      0.85      0.88       265
        pos       0.85      0.91      0.88       235

avg / total       0.88      0.88      0.88       500



Print the confusion matrix

In [34]:
# Rows are the true sentiment codes, columns the predicted ones.
metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)

array([[226,  39],
       [ 20, 215]])

Display the mean and standard deviation of the cross-validated accuracy for each parameter candidate

In [35]:
# Walk through the grid-search results one parameter candidate at a time and
# print its settings together with the mean and std of its cross-validated score.
cv_results = grid_search.cv_results_
n_candidates = len(cv_results['params'])
candidate_rows = zip(
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
)
for i, (candidate_params, mean_score, std_score) in enumerate(candidate_rows):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
          % (candidate_params, mean_score, std_score))

0 params - {'vect__ngram_range': (1, 1)}; mean - 0.84; std - 0.01
1 params - {'vect__ngram_range': (1, 2)}; mean - 0.86; std - 0.01
