# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import mixture
import nltk
from nltk.tokenize import word_tokenize
from string import punctuation
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split

# Helper functions

In [11]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

stopwords = nltk.corpus.stopwords.words('english')

num_re = re.compile(r'\d+')
words_re = re.compile(r'\w+')
html_re = re.compile('<.*?>')

def prepare_en(text: str) -> List[str]:
    tokens = word_tokenize(text.lower())
    tokens = [re.sub(html_re, '', t) for t in tokens]
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [t for t in tokens if t not in punctuation]
    tokens = [t for t in tokens if t not in stopwords]
    tokens = [t for t in tokens if num_re.match(t) is None]
    tokens = [t for t in tokens if words_re.match(t) is not None]

    return ' '.join(tokens)

# Data import

## Read

In [4]:
df = pd.read_csv('./data/imdb.zip')

df['sentiment'] = df['sentiment'].apply(lambda x: 1. if x == 'positive' else 0.)

df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production. <br /><br />The...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically there's a family where a little boy ...,0.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0
...,...,...
49995,I thought this movie did a down right good job...,1.0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0.0
49997,I am a Catholic taught in parochial elementary...,0.0
49998,I'm going to have to disagree with the previou...,0.0


## Preprocess text

In [12]:
df['text_preprocessed'] = df['review'].apply(prepare_en)

df

Unnamed: 0,review,sentiment,text_preprocessed
0,One of the other reviewers has mentioned that ...,1.0,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,1.0,wonder littl product br br film techniqu veri ...
2,I thought this was a wonderful way to spend ti...,1.0,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,0.0,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0,petter mattei love time money visual stun film...
...,...,...,...
49995,I thought this movie did a down right good job...,1.0,thought movi right good job n't creativ origin...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0.0,bad plot bad dialogu bad act idiot direct anno...
49997,I am a Catholic taught in parochial elementary...,0.0,cathol taught parochi elementari school nun ta...
49998,I'm going to have to disagree with the previou...,0.0,go disagre previous comment side maltin one se...


## Save preprocessed

In [13]:
df.to_csv('./data/preprocessed_en.csv', index=False)

## Load preprocessed

In [14]:
df = pd.read_csv('./data/preprocessed_en.csv')

df

Unnamed: 0,review,sentiment,text_preprocessed
0,One of the other reviewers has mentioned that ...,1.0,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,1.0,wonder littl product br br film techniqu veri ...
2,I thought this was a wonderful way to spend ti...,1.0,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,0.0,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0,petter mattei love time money visual stun film...
...,...,...,...
49995,I thought this movie did a down right good job...,1.0,thought movi right good job n't creativ origin...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0.0,bad plot bad dialogu bad act idiot direct anno...
49997,I am a Catholic taught in parochial elementary...,0.0,cathol taught parochi elementari school nun ta...
49998,I'm going to have to disagree with the previou...,0.0,go disagre previous comment side maltin one se...


## Vectorize

In [18]:
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=.01)

vectorized = vectorizer.fit_transform(df['text_preprocessed']).toarray()

vectorizer.get_feature_names_out().shape

(1562,)

In [20]:
vectorizer.get_feature_names_out()

array(['.', 'abandon', 'abil', ..., 'youth', 'zero', 'zombi'],
      dtype=object)

# Clustering

## GMM

In [21]:
true_fake_model = mixture.GaussianMixture(2)

true_fake_model.fit(vectorized)

### Save model

In [22]:
gmm_name = './models/gmm_en'
np.save(gmm_name + '_weights', true_fake_model.weights_, allow_pickle=False)
np.save(gmm_name + '_means', true_fake_model.means_, allow_pickle=False)
np.save(gmm_name + '_covariances', true_fake_model.covariances_, allow_pickle=False)

### Load the model

In [23]:
means = np.load('./models/gmm_en' + '_means.npy')
covar = np.load('./models/gmm_en' + '_covariances.npy')
true_fake_model = mixture.GaussianMixture(n_components = len(means), covariance_type='full')
true_fake_model.precisions_cholesky_ = np.linalg.cholesky(np.linalg.inv(covar))
true_fake_model.weights_ = np.load('./models/gmm_en' + '_weights.npy')
true_fake_model.means_ = means
true_fake_model.covariances_ = covar

### Predict and add col

In [24]:
gmm_out = true_fake_model.predict(vectorized)

df['gmm_out'] = gmm_out
df['gmm_out'] = df['gmm_out']

df

Unnamed: 0,review,sentiment,text_preprocessed,gmm_out
0,One of the other reviewers has mentioned that ...,1.0,one review mention watch oz episod hook right ...,False
1,A wonderful little production. <br /><br />The...,1.0,wonder littl product br br film techniqu veri ...,False
2,I thought this was a wonderful way to spend ti...,1.0,thought wonder way spend time hot summer weeke...,True
3,Basically there's a family where a little boy ...,0.0,basic famili littl boy jake think zombi closet...,True
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0,petter mattei love time money visual stun film...,True
...,...,...,...,...
49995,I thought this movie did a down right good job...,1.0,thought movi right good job n't creativ origin...,True
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0.0,bad plot bad dialogu bad act idiot direct anno...,True
49997,I am a Catholic taught in parochial elementary...,0.0,cathol taught parochi elementari school nun ta...,False
49998,I'm going to have to disagree with the previou...,0.0,go disagre previous comment side maltin one se...,False


In [25]:
len(df[df['sentiment'] == df['gmm_out']]) / len(df)

0.49946

## K-means

In [26]:
true_false_model_km = sklearn.cluster.KMeans(n_clusters = 2, init = 'k-means++')
kmeans_out = true_false_model_km.fit_predict(vectorized)

In [27]:
kmeans_out

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [28]:
df['km_out'] = kmeans_out
df['km_out'] = df['km_out']

df

Unnamed: 0,review,sentiment,text_preprocessed,gmm_out,km_out
0,One of the other reviewers has mentioned that ...,1.0,one review mention watch oz episod hook right ...,False,True
1,A wonderful little production. <br /><br />The...,1.0,wonder littl product br br film techniqu veri ...,False,True
2,I thought this was a wonderful way to spend ti...,1.0,thought wonder way spend time hot summer weeke...,True,True
3,Basically there's a family where a little boy ...,0.0,basic famili littl boy jake think zombi closet...,True,True
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0,petter mattei love time money visual stun film...,True,True
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,1.0,thought movi right good job n't creativ origin...,True,True
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0.0,bad plot bad dialogu bad act idiot direct anno...,True,True
49997,I am a Catholic taught in parochial elementary...,0.0,cathol taught parochi elementari school nun ta...,False,False
49998,I'm going to have to disagree with the previou...,0.0,go disagre previous comment side maltin one se...,False,False


In [30]:
len(df[df['sentiment'] == df['km_out']]) / len(df)

0.48768

# Classification

## Train-test-split etc

In [31]:
df_train, df_test = train_test_split(df, train_size=.7, shuffle=True, random_state=42)

len(df_train), len(df_test)

(35000, 15000)

In [32]:
vectorizer_classification = TfidfVectorizer(tokenizer=word_tokenize, min_df=.001)

X_train = vectorizer_classification.fit_transform(df_train['text_preprocessed'])
y_train = df_train['sentiment'].to_numpy(dtype=float)

X_test = vectorizer_classification.transform(df_test['text_preprocessed'])
y_test = df_test['sentiment'].to_numpy(dtype=float)



## Logistic regression

In [33]:
lr_model = sklearn.linear_model.LogisticRegression(solver='liblinear')
lr_model.fit(X_train, y_train)
lr_pred =  lr_model.predict(X_test)

In [34]:
sklearn.metrics.accuracy_score(y_test, lr_pred)

0.8886

## SVM

### Linear kernel

In [40]:
svm_model1 = sklearn.svm.SVC(kernel='linear')
svm_model1.fit(X_train[:10000], y_train[:10000])
svm1_pred = svm_model1.predict(X_test)

sklearn.metrics.accuracy_score(y_test, svm1_pred)

0.8732

### Default kernel

In [44]:
svm_model2 = sklearn.svm.SVC(kernel='linear')
svm_model2.fit(X_train[:10000], y_train[:10000])
svm2_pred = svm_model1.predict(X_test)

sklearn.metrics.accuracy_score(y_test, svm2_pred)

0.8732

### Degree

In [45]:
svm_model3 = sklearn.svm.SVC(kernel='poly')
svm_model3.fit(X_train[:10000], y_train[:10000])
svm3_pred = svm_model3.predict(X_test)

sklearn.metrics.accuracy_score(y_test, svm3_pred)

0.8431333333333333