In [None]:
import nltk
import numpy as np

from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus through the NLTK downloader before it is read.
nltk.download('movie_reviews')

NUM_SAMPLES = 1000
fileids = np.asarray(movie_reviews.fileids())

# Select NUM_SAMPLES // 2 positions from the start of the file listing and the
# same number from its end.
# NOTE(review): presumably this balances the two classes, which assumes the
# listing is grouped by category — confirm.
head_idx = np.arange(NUM_SAMPLES // 2)
tail_idx = np.arange(len(fileids) - NUM_SAMPLES // 2, len(fileids))
idx = np.concatenate((head_idx, tail_idx))

print(fileids[idx])

In [None]:
from collections import Counter
from nltk.corpus import movie_reviews

# Raw text of every selected file, and its label taken from the part of the
# file id that precedes the first '/'.
reviews = [movie_reviews.raw(fileid) for fileid in fileids[idx]]
labels = [fileid.split('/')[0] for fileid in fileids[idx]]

# Spot-check one example and show how many samples each label has.
print(reviews[2])
print(labels[2])
print(Counter(labels))

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from typing import List

def tokenize(sentence: str, remove_stopwords: bool = False) -> List[str]:
    """Split raw text into lower-cased tokens wrapped in boundary markers.

    Args:
        sentence: The raw text to tokenize, as a single string (it is passed
            straight to ``word_tokenize``).
        remove_stopwords: When True, drop every token that appears in
            ``stopwords.words('english')``.

    Returns:
        The lower-cased tokens with ``'<s>'`` prepended and ``'<e>'``
        appended. The markers are added before stopword filtering, so they
        are subject to the same filter as ordinary tokens.
    """
    tokens = [token.lower() for token in word_tokenize(sentence)]
    tokens = ['<s>'] + tokens + ['<e>']

    if remove_stopwords:
        # Load the stopword list once per call and keep it in a set: the
        # list is loop-invariant, and set membership avoids a linear scan
        # of the whole list for every token.
        english_stopwords = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in english_stopwords]

    return tokens
    
# Tokenize every review, filtering stopwords out.
tokens = [
    tokenize(review, remove_stopwords=True)
    for review in reviews
]
print(tokens[:10])

# Binary targets: 'neg' becomes 0, any other label becomes 1.
y = np.asarray([int(label != 'neg') for label in labels])
print(y[:10])

In [None]:
from functools import reduce

# Vocabulary: every distinct token across all tokenized reviews, sorted so a
# token's id is its position in this list. A set comprehension collects the
# tokens in one pass (repeated list concatenation is quadratic and fails on
# an empty corpus).
dictionary = sorted({token for sentence in tokens for token in sentence})
print(dictionary[:10])

# Build the token -> id lookup once; calling dictionary.index() per token
# would rescan the whole vocabulary for every token in the corpus.
token_to_id = {token: token_id for token_id, token in enumerate(dictionary)}
encoded = [np.asarray([token_to_id[token] for token in sentence]) for sentence in tokens]
print(encoded[:10])

In [None]:
# Bag-of-words matrix: one row per encoded document, one column per
# vocabulary entry; X[i, j] is how many times token id j occurs in document i.
X = np.zeros((len(encoded), len(dictionary)))
print(X.shape)

# np.bincount counts every id of a document in a single pass, instead of
# rescanning the document once per vocabulary entry. The explicit integer
# cast keeps an empty document (which would otherwise be a float array)
# valid input, yielding an all-zero row.
for i, token_ids in enumerate(encoded):
    X[i] = np.bincount(np.asarray(token_ids, dtype=np.intp), minlength=len(dictionary))

print(X[:10, :10])

In [None]:
# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(42)

# One shared permutation reorders features and targets together, so every
# row of X stays paired with its entry in y.
shuffled_idx = np.random.permutation(np.arange(len(X)))
X, y = X[shuffled_idx], y[shuffled_idx]

# The first 80% of the shuffled rows form the training set; the remainder
# is held out for testing.
train_idx = int(len(X) * 0.8)

train_X, test_X = X[:train_idx], X[train_idx:]
train_y, test_y = y[:train_idx], y[train_idx:]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


# Logistic regression with an L1 penalty, fitted by the liblinear solver.
model = LogisticRegression(solver='liblinear', penalty='l1')
model.fit(train_X, train_y)

# Score both splits with the second column of predict_proba.
train_predictions = model.predict_proba(train_X)[:, 1]
test_predictions = model.predict_proba(test_X)[:, 1]

# ROC AUC on the training data and on the held-out data.
train_auc = roc_auc_score(y_true=train_y, y_score=train_predictions)
test_auc = roc_auc_score(y_true=test_y, y_score=test_predictions)

print(f'Train AUC: {train_auc:.2f}')
print(f'Test AUC: {test_auc:.2f}')