In [21]:
import os
import pandas as pd
import numpy as np
# from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sentiment.util import print_short_eval, eval
# Importing some classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [22]:
def load_datasets_unlabeled_test():
    dataset = load_files('datasets/review_polarity_competition/reviews_sentoken', shuffle=False)
    docs_train, docs_dev, y_train, y_dev = train_test_split(
        dataset.data, dataset.target, test_size=0.10, random_state=42)
    dirname = "datasets/review_polarity_competition/test_reviews_sentoken"
    test = []
    # I do this to keep the files in numeric order
    for fname in range(len(os.listdir(dirname))):
        fname = str(fname) + ".txt"
        with open(os.path.join(dirname, fname)) as fd:
            test.append(fd.read())
    train = docs_train, y_train
    dev = docs_dev, y_dev
    return train, dev, test

def save_results(fname, labels):
    with open(fname, 'w') as f:
        f.write("Id,Category\n")
        for i,l in enumerate(labels):
            f.write(str(i) + ".txt," + str(l) + "\n")

### Loading Data

In [23]:
train, dev, test = load_datasets_unlabeled_test()

In [32]:
display((len(train), len(train[0])))
display((len(dev), len(dev[0])))
display((len(test)))
train_df = pd.DataFrame({'data': train[0], 'target': train[1]})
dev_df = pd.DataFrame({'data': dev[0], 'target': dev[1]})

(2, 963)

(2, 107)

500

### Dataset description

In [25]:
display(train_df.groupby('target').describe())
display(dev_df.groupby('target').describe())

Unnamed: 0_level_0,data,data,data,data
Unnamed: 0_level_1,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,481,445,"b""The special effects were okay , but the stor...",3
1,482,481,"b""wayne is in fine form as hondo lane , rider ...",2


Unnamed: 0_level_0,data,data,data,data
Unnamed: 0_level_1,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,54,54,b'Which is not saying much since this along wi...,1
1,53,53,"b""First of all , you ca n't watch this DVD if ...",1


In [34]:
X_train, y_train = train
X_dev, y_dev = dev
X_test = test

### Setting up classifiers

In [27]:
param_grid = {
    'vect__binary': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
    'vect__min_df': [1, 3, 5, 7],
    'vect__max_df': [0.95, 0.9, 0.7],
    'clf__random_state': [0],
}

params_list = list(ParameterGrid(param_grid))

vects = [
    CountVectorizer(),
    TfidfVectorizer()
]

clfs = [
#     KNeighborsClassifier(),
#     MultinomialNB(),
#     DecisionTreeClassifier(),
    LogisticRegression(),
    LinearSVC(),
#     SVC(),
#     RandomForestClassifier(),
]

In [28]:
results = []

for vect in vects:
    print(str(vect.__class__))
    for clf in clfs:
        print(str(clf.__class__))
        pipeline = Pipeline([
            ('vect', vect),
            ('clf', clf),
        ])
        for params in params_list:
            print(params)
            pipeline.set_params(**params)
            pipeline.fit(X_train, y_train)
            print_short_eval(pipeline, X_dev, y_dev)
            result0 = eval(pipeline, X_dev, y_dev)
            results.append({
                **result0,
                **params,
            })

<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}
accuracy	0.86	macro f1	0.86
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
accuracy	0.87	macro f1	0.87
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 3)}
accuracy	0.87	macro f1	0.87
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 4)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 5)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 3, 'vect__ngram_range': (1, 1)}
accuracy	0.87	macro f1	0.87
{'clf__random

accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 3)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 4)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 5)}
accuracy	0.88	macro f1	0.88
<class 'sklearn.svm.classes.LinearSVC'>
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 1)}
accuracy	0.83	macro f1	0.83
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.95, 'vect__min_df': 1, 'vect__ngram_range': (1, 3)}
accuracy	0.86	macro f1	0.86
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.

accuracy	0.89	macro f1	0.89
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 5)}
accuracy	0.89	macro f1	0.89
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 1)}
accuracy	0.84	macro f1	0.84
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 2)}
accuracy	0.89	macro f1	0.89
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 3)}
accuracy	0.87	macro f1	0.87
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 4)}
accuracy	0.87	macro f1	0.87
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 5)}
accuracy	0.87	macro f1	0.87
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.linear_model.logistic.LogisticRegr

accuracy	0.87	macro f1	0.87
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 1)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 2)}
accuracy	0.89	macro f1	0.89
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 4)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 5)}
accuracy	0.88	macro f1	0.88
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': (1, 1)}
accuracy	0.87	macro f1	0.87
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 7, 'vect__ngram_range': 

accuracy	0.90	macro f1	0.90
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 3, 'vect__ngram_range': (1, 3)}
accuracy	0.90	macro f1	0.90
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 3, 'vect__ngram_range': (1, 4)}
accuracy	0.90	macro f1	0.90
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 3, 'vect__ngram_range': (1, 5)}
accuracy	0.90	macro f1	0.90
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 1)}
accuracy	0.85	macro f1	0.85
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 2)}
accuracy	0.89	macro f1	0.89
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
accuracy	0.92	macro f1	0.92
{'clf__random_state': 0, 'vect__binary': True, 'vect__max_df': 0.7, 'vect__min_df': 5, 'vect__ngram_range': 

In [29]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__random_state,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
192,0.915888,0,0.915888,True,0.95,5,"(1, 3)"
193,0.915888,0,0.915888,True,0.95,5,"(1, 4)"
194,0.915888,0,0.915888,True,0.95,5,"(1, 5)"
212,0.915888,0,0.915888,True,0.9,5,"(1, 3)"
213,0.915888,0,0.915888,True,0.9,5,"(1, 4)"
214,0.915888,0,0.915888,True,0.9,5,"(1, 5)"
232,0.915888,0,0.915888,True,0.7,5,"(1, 3)"
233,0.915888,0,0.915888,True,0.7,5,"(1, 4)"
234,0.915888,0,0.915888,True,0.7,5,"(1, 5)"
191,0.906542,0,0.906534,True,0.95,5,"(1, 2)"


### Trying the best param configuration found

In [30]:
pipeline = Pipeline([
    ('vect', CountVectorizer(binary=True, ngram_range=(1, 3), max_df=0.95, min_df=5)),
    ('clf', LogisticRegression(random_state=0))
])
pipeline.fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	1.00	macro f1	1.00
accuracy	0.91	macro f1	0.91


In [31]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(binary=True, ngram_range=(1, 3), max_df=0.95, min_df=5)),
    ('clf', LinearSVC(random_state=0))
])
pipeline.fit(X_train, y_train)
print_short_eval(pipeline, X_train, y_train)
print_short_eval(pipeline, X_dev, y_dev)

accuracy	1.00	macro f1	1.00
accuracy	0.92	macro f1	0.92


#### The best model found is a LinearSVC using the TfidfVectorizer, we get an accuracy of 0.92

### Create file for final submission

In [35]:
y_test_pred = pipeline.predict(X_test)
submission = y_test_pred
fname = 'datasets/review_polarity_competition/submission.csv'
save_results(fname, submission)