# Sentiment analysis

In [1]:
import re
import time

import numpy as np
import torchtext
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
# Load the IMDB train/test splits. Both fields are given an identity
# ``tokenize`` callable; the cells below treat ``.text`` as one raw string
# and compare ``.label`` against 'pos'.
text_field = torchtext.data.Field(tokenize=lambda raw: raw)
label_field = torchtext.data.Field(tokenize=lambda raw: raw)
train_data, test_data = torchtext.datasets.IMDB.splits(
    text_field, label_field, root='../data/imdb')

# Review texts for each split.
train_texts = [example.text for example in train_data]
test_texts = [example.text for example in test_data]

# Binary targets: 1 when the label is 'pos', 0 otherwise.
train_y = np.array([int(example.label == 'pos') for example in train_data])
test_y = np.array([int(example.label == 'pos') for example in test_data])

In [3]:
# Number of texts in the training and test lists.
len(train_texts), len(test_texts)

(25000, 25000)

In [4]:
# Shapes of the label arrays; each should have one entry per text.
train_y.shape, test_y.shape

((25000,), (25000,))

In [5]:
# Peek at the label arrays (1 = 'pos', 0 = anything else).
train_y, test_y

(array([1, 1, 1, ..., 0, 0, 0]), array([1, 1, 1, ..., 0, 0, 0]))

In [6]:
class SKLearnClassifier:
    """Bag-of-words text classifier built around a scikit-learn style estimator.

    Each text is cleaned with :meth:`preprocess`, converted to a feature
    matrix by ``self.vectorizer`` and then passed to ``self.model``.
    """

    # Patterns used by ``preprocess``, compiled once when the class is created.
    _BR_TAG = re.compile(r'<br />')
    _PUNCTUATION = re.compile(r'[\'\-\]!"#()*+,./:;<=>?[^_`{|}~]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, model, vectorizer=None):
        """Store the estimator and the text vectorizer.

        Args:
            model: Estimator exposing ``fit(x, y)`` and ``predict(x)``.
            vectorizer: Optional object exposing ``fit_transform(texts)`` and
                ``transform(texts)``. When omitted, a default
                ``CountVectorizer()`` is created.
        """
        self.model = model
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer

    def fit(self, train_texts, train_y):
        """Fit the vectorizer and the estimator on the training texts.

        Prints the shape of the training feature matrix and the elapsed time.

        Args:
            train_texts: Iterable of raw training strings.
            train_y: Targets aligned with ``train_texts``.

        Returns:
            ``self``, so calls can be chained (scikit-learn convention).
        """
        # perf_counter is monotonic, unlike time.time(), so the reported
        # duration cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        cleaned = [self.preprocess(t) for t in train_texts]
        train_x = self.vectorizer.fit_transform(cleaned)
        print(train_x.shape)
        self.model.fit(train_x, train_y)
        print(f'Finished training in {time.perf_counter() - start:.2f}s')
        return self

    def predict(self, texts):
        """Return the estimator's predictions for ``texts``.

        The texts go through the same ``preprocess`` step as in ``fit`` and
        are transformed with the already fitted vectorizer.
        """
        cleaned = [self.preprocess(t) for t in texts]
        x = self.vectorizer.transform(cleaned)
        return self.model.predict(x)

    @staticmethod
    def preprocess(text):
        """Normalize a raw text string.

        Replaces ``<br />`` tags and the listed punctuation characters with
        spaces, collapses whitespace runs into a single space and lowercases
        the result. Leading/trailing spaces are not stripped, and characters
        outside the punctuation set (e.g. ``$ % & @``) are kept.
        """
        text = SKLearnClassifier._BR_TAG.sub(' ', text)
        text = SKLearnClassifier._PUNCTUATION.sub(' ', text)
        text = SKLearnClassifier._WHITESPACE.sub(' ', text)
        return text.lower()

In [7]:
# Show the first training text before any cleaning.
print(train_texts[0])

The emotional powers and characters of Dominick and Eugene are the things that Hollywood doesn't make anymore. This is one of the most emotional, sensitive, and heart-felt movies that I have ever seen! Roy Liotta, Tom Hulce, and supporting actress Jamie Lee Curtis, deliver Oscar Winning caliber performances! There are not enough words to express how great this movie is. Sure, people who are not into sentimental movies may not care as much as the rest of us about Dominick and Eugene, but for the rest of us, this movie goes right to the heart and sole of compassion and humanity. You will never forget this film, EVER!<br /><br />*****SPOILERS BELOW*****<br /><br />The simple yet eloquent story is masterfully told. Eugene is a med-school intern who faces long hours and a demanding work load at the hospital. His fraternal twin brother Dominick (born 12 minutes earlier) is a little slow and awkward because of brain damage due to a victim of abuse by their father. (A heartbreaking moment when

In [8]:
# Same text after SKLearnClassifier.preprocess, for a side-by-side sanity check.
print(SKLearnClassifier.preprocess(train_texts[0]))

the emotional powers and characters of dominick and eugene are the things that hollywood doesn t make anymore this is one of the most emotional sensitive and heart felt movies that i have ever seen roy liotta tom hulce and supporting actress jamie lee curtis deliver oscar winning caliber performances there are not enough words to express how great this movie is sure people who are not into sentimental movies may not care as much as the rest of us about dominick and eugene but for the rest of us this movie goes right to the heart and sole of compassion and humanity you will never forget this film ever spoilers below the simple yet eloquent story is masterfully told eugene is a med school intern who faces long hours and a demanding work load at the hospital his fraternal twin brother dominick born 12 minutes earlier is a little slow and awkward because of brain damage due to a victim of abuse by their father a heartbreaking moment when this is found out in the film that will leave you in

In [10]:
# Random forests are stochastic (bootstrap samples, feature subsampling);
# a fixed random_state makes the accuracies below reproducible across reruns.
model = SKLearnClassifier(
    RandomForestClassifier(
        n_estimators=150, max_depth=25, n_jobs=-1, random_state=42))
model.fit(train_texts, train_y)

# Accuracy on the training texts (shows how closely the forest fits them).
train_preds = model.predict(train_texts)
print(accuracy_score(train_y, train_preds))

# Accuracy on the held-out test texts.
test_preds = model.predict(test_texts)
print(accuracy_score(test_y, test_preds))

(25000, 74702)
Finished training in 7.45s
0.95496
0.8468


In [12]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when the installed version
# has it, and fall back to the old method otherwise. Either way `fns` ends up
# as a plain list of strings.
if hasattr(model.vectorizer, 'get_feature_names_out'):
    fns = model.vectorizer.get_feature_names_out().tolist()
else:
    fns = model.vectorizer.get_feature_names()
len(fns)

74702