# Sentiment analysis

In [1]:
import time

import numpy as np
import torchtext
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the IMDB reviews. Both fields use an identity tokenizer, so no
# tokenization happens at this stage.
train_data, test_data = torchtext.datasets.IMDB.splits(
    torchtext.data.Field(tokenize=lambda s: s),
    torchtext.data.Field(tokenize=lambda s: s),
    root='../data/imdb')

# Hold out 10% of the training reviews for validation. Labels are encoded
# as 1 for 'pos' and 0 for anything else. random_state is fixed so that the
# partition -- and every number derived from it -- is identical on each
# Restart & Run All instead of changing with every shuffle.
train_texts, val_texts, train_y, val_y = train_test_split(
    [s.text for s in train_data],
    np.array([1 if s.label == 'pos' else 0 for s in train_data]),
    test_size=0.1,
    random_state=42)

# The official test split is kept whole, with the same label encoding.
test_texts = [s.text for s in test_data]
test_y = np.array([1 if s.label == 'pos' else 0 for s in test_data])

In [3]:
# Number of reviews in the train, validation and test splits.
tuple(len(split) for split in (train_texts, val_texts, test_texts))

(22500, 2500, 25000)

In [4]:
# Shapes of the label arrays for the train, validation and test splits.
tuple(labels.shape for labels in (train_y, val_y, test_y))

((22500,), (2500,), (25000,))

In [5]:
# Preview the label arrays of the three splits.
train_y, val_y, test_y

(array([1, 0, 1, ..., 0, 0, 0]),
 array([1, 1, 1, ..., 1, 0, 0]),
 array([1, 1, 1, ..., 0, 0, 0]))

In [6]:
# Distinct label values and how often each occurs, per split
# (train, validation, test).
tuple(
    np.unique(labels, return_counts=True)
    for labels in (train_y, val_y, test_y)
)

((array([0, 1]), array([11235, 11265])),
 (array([0, 1]), array([1265, 1235])),
 (array([0, 1]), array([12500, 12500])))

In [7]:
class SKLearnClassifier:
    """Text classifier that pairs a bag-of-words vectorizer with a model.

    Texts are turned into token-count features by a ``CountVectorizer``
    and then handed to the wrapped ``model``, which must provide
    ``fit(x, y)`` and ``predict(x)``.
    """

    def __init__(self, model):
        """Store ``model`` and create the vectorizer used for all texts.

        Args:
            model: estimator object exposing ``fit(x, y)`` and ``predict(x)``.
        """
        self.model = model
        self.vectorizer = CountVectorizer(strip_accents='ascii', lowercase=True)

    def fit(self, train_texts, train_y):
        """Fit the vectorizer and the model on the training data.

        Prints the shape of the vectorized training matrix and the elapsed
        training time.

        Args:
            train_texts: training documents passed to the vectorizer.
            train_y: target labels, one per document.
        """
        # perf_counter is a monotonic clock intended for measuring durations;
        # unlike time.time() it cannot jump if the system clock is adjusted.
        start = time.perf_counter()
        train_x = self.vectorizer.fit_transform(train_texts)
        print(train_x.shape)
        self.model.fit(train_x, train_y)
        print(f'Finished training in {time.perf_counter() - start:.2f}s')

    def predict(self, texts):
        """Vectorize ``texts`` with the fitted vectorizer and predict labels.

        Args:
            texts: documents to classify.

        Returns:
            Whatever the wrapped model's ``predict`` returns for the
            vectorized texts.
        """
        x = self.vectorizer.transform(texts)
        return self.model.predict(x)

In [8]:
# random_state is fixed so the forest, and therefore the accuracies printed
# below, are the same on every run instead of varying with the estimator's
# internal randomness.
model = SKLearnClassifier(
    RandomForestClassifier(
        n_estimators=150, max_depth=25, n_jobs=-1, random_state=42))
model.fit(train_texts, train_y)

# Accuracy on the data the model was trained on.
train_preds = model.predict(train_texts)
print(accuracy_score(train_y, train_preds))

# Accuracy on the test split.
test_preds = model.predict(test_texts)
print(accuracy_score(test_y, test_preds))

(22500, 71742)
Finished training in 4.82s
0.9547555555555556
0.84948
