<a href="https://colab.research.google.com/github/chekhovana/courses/blob/main/ml_stepik/6_final_project/week4_online_app/model/imdb_classification_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import libraries

In [None]:
!pip install gensim==4.0.0

In [None]:
import re
import os
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from xgboost import XGBClassifier

# import nltk

from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

# Prepare data

## Load dataset

In [None]:
# Read the preprocessed IMDB CSV from the course repository on GitHub.
# Later cells use the 'review' and 'label' columns of this frame.
url = "https://github.com/chekhovana/courses/raw/main/ml_stepik/6_final_project/week4_online_app/model/data/imdb_preprocessed.csv"
df = pd.read_csv(url)

## Create train dataset for exploration purposes

To speed up training while comparing vectorization and classification algorithms, a limited dataset of the first 5000 records is used.

In [None]:
# Exploration subset: the first `n_train` rows of the full frame.
n_train = 5000
df_train = df.iloc[:n_train]
x_train = df_train['review'].values
y_train = df_train['label'].values

Check train sample for class balance

In [None]:
# Sum of the labels divided by the sample size; this is the positive-class
# share if labels are encoded as 0/1 (presumably - confirm against the CSV).
print('class ratio', np.sum(y_train) / len(y_train))

class ratio [0.503]


The sample is balanced

# Select model

## Compare classifiers

Compare different classifiers by accuracy. To speed up training, the limited dataset of 5000 records is used. Text is vectorized with the bag-of-words algorithm, with the maximum number of features set to 10000.

In [None]:
%%time
max_features = 10000


def compare_models(x, y):
    """Print the mean 3-fold cross-validated score of several classifiers.

    Every classifier is wrapped in a Pipeline with its own CountVectorizer
    (capped at ``max_features`` terms), so the vocabulary is rebuilt from the
    training part of each fold rather than fitted once on all of ``x``.
    Fitting the vectorizer up front would let the held-out documents
    influence which terms make it into the vocabulary.

    Args:
        x: sequence of raw text documents.
        y: labels aligned with ``x``.

    Returns:
        None. One line per classifier is printed: its label and the mean
        score rounded to 4 decimals.
    """
    models = {
        'LogisticRegression': LogisticRegression(max_iter=1000),
        'SVC(kernel="rbf")': SVC(),
        'SVC(kernel="linear")': SVC(kernel='linear'),
        # Fixed seed so the forest's score is repeatable between runs.
        'RandomForestClassifier': RandomForestClassifier(random_state=42),
        'MultinomialNB': MultinomialNB(),
    }
    for mname, model in models.items():
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer(max_features=max_features)),
            ('classifier', model),
        ])
        score = cross_val_score(pipeline, x, y, cv=3).mean()
        print(mname, round(score, 4))

compare_models(x_train, y_train)

LogisticRegression 0.8394
SVC(kernel="rbf") 0.8198
SVC(kernel="linear) 0.8166
RandomForestClassifier 0.8342
MultinomialNB 0.8302
CPU times: user 1min 9s, sys: 1.25 s, total: 1min 11s
Wall time: 1min 9s


### Conclusion

The best classifier is LogisticRegression; it will be used in the rest of this notebook.

In [None]:
# Shared classifier instance: later cells pass this same object to
# cross_val_score and place it in the final pipeline.
model = LogisticRegression(max_iter=1000)

## Compare vectorization approaches

### CountVectorizer and TfidfVectorizer with different ranges of n-gram extraction

In [None]:
def compare_vectorizers(x, y):
    """Print the mean cross-validated score of ``model`` per vectorizer setting.

    CountVectorizer and TfidfVectorizer are each tried with n-gram ranges
    (1, 1), (1, 2) and (1, 3), all capped at ``max_features`` terms. Each
    vectorizer is placed in a Pipeline in front of the module-level ``model``
    so that it is fitted on the training part of every fold only; fitting it
    on all of ``x`` beforehand would let the held-out documents shape the
    vocabulary and the TF-IDF weights.

    Args:
        x: sequence of raw text documents.
        y: labels aligned with ``x``.

    Returns:
        None. One line per configuration is printed: its label and the mean
        score rounded to 4 decimals.
    """
    vectorizers = {
        f'{vectorizer.__name__}(ngram_range=(1, {ngram}))':
            vectorizer(max_features=max_features, ngram_range=(1, ngram))
        for vectorizer in (CountVectorizer, TfidfVectorizer)
        for ngram in range(1, 4)
    }

    for vname, vectorizer in vectorizers.items():
        pipeline = Pipeline([('vectorizer', vectorizer),
                             ('classifier', model)])
        score = round(cross_val_score(pipeline, x, y).mean(), 4)
        print(vname, score)

compare_vectorizers(x_train, y_train)

CountVectorizer(ngram_range=(1, 1)) 0.8412
CountVectorizer(ngram_range=(1, 2)) 0.8434
CountVectorizer(ngram_range=(1, 3)) 0.8418
TfidfVectorizer(ngram_range=(1, 1)) 0.8618
TfidfVectorizer(ngram_range=(1, 2)) 0.8596
TfidfVectorizer(ngram_range=(1, 3)) 0.8598


### Gensim implementation of n-gram extraction

Using gensim, extract bigrams and trigrams and feed the resulting text to the vectorizers (CountVectorizer and TfidfVectorizer).

In [None]:
def extract_trigrams(x):
    """Run two stacked gensim ``Phrases`` passes over whitespace-split texts.

    The first model is trained on the split documents and applied to them;
    the second is trained on that output and applied again. Both use
    ``min_count=1``, ``threshold=1`` and the English connector words.

    Args:
        x: iterable of strings.

    Returns:
        A list of strings, one per input document, with the tokens produced
        by the second pass joined by single spaces.
    """
    tokenized = [doc.split() for doc in x]
    phrase_kwargs = {
        'min_count': 1,
        'threshold': 1,
        'connector_words': ENGLISH_CONNECTOR_WORDS,
    }

    first_pass = Phrases(tokenized, **phrase_kwargs)[tokenized]
    second_pass = Phrases(first_pass, **phrase_kwargs)[first_pass]
    return [' '.join(tokens) for tokens in second_pass]

def evaluate_gensim_trigrams(x_train, y_train):
    """Print cross-validated scores on texts preprocessed by ``extract_trigrams``.

    The texts are first rewritten by ``extract_trigrams``. CountVectorizer
    and TfidfVectorizer (each capped at ``max_features`` terms) are then
    evaluated in a Pipeline with a fresh LogisticRegression, so the
    vectorizer is fitted on the training part of every fold only.

    Args:
        x_train: sequence of raw text documents.
        y_train: labels aligned with ``x_train``.

    Returns:
        None. One line per vectorizer is printed: its class name and the
        mean score rounded to 4 decimals.
    """
    # NOTE: phrase detection itself still runs once over the whole input,
    # before any fold is split off.
    x_train = extract_trigrams(x_train)

    for vectorizer_cls in (CountVectorizer, TfidfVectorizer):
        pipeline = Pipeline([
            ('vectorizer', vectorizer_cls(max_features=max_features)),
            ('classifier', LogisticRegression(max_iter=1000)),
        ])
        score = round(cross_val_score(pipeline, x_train, y_train).mean(), 4)
        print(vectorizer_cls.__name__, score)

evaluate_gensim_trigrams(x_train, y_train)

CountVectorizer 0.7896
TfidfVectorizer 0.8166


### Word2vec embeddings, gensim implementation

Create a Word2Vec model and train it on our corpus.

In [None]:
import gensim
from gensim.models.word2vec import Word2Vec

def create_word2vec_model(x, dim=300, epochs=10):
    """Train a gensim Word2Vec model on whitespace-split documents.

    The corpus is handed to the ``Word2Vec`` constructor, which builds the
    vocabulary and trains for ``epochs`` passes in one go. No separate
    ``train()`` call follows: it would start a second training run on top of
    the one the constructor has already completed.

    Args:
        x: iterable of strings; each is split on whitespace into tokens.
        dim: size of the word vectors.
        epochs: number of training passes over the corpus.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentences = [o.split() for o in x]
    # min_count=1 keeps every token of the corpus in the vocabulary.
    return Word2Vec(sentences, vector_size=dim, window=5, min_count=1,
                    epochs=epochs)

word2vec = create_word2vec_model(x_train)

Naive approach: the vector representation of a sentence is calculated by averaging the vector representations of all its words. A better way is to feed the obtained 3D representation of the sentences to a neural net; that is to be implemented in another notebook.

In [None]:
def vectorize_sentence(sentence, keyed_vectors=None):
    """Average the word vectors of a whitespace-split sentence.

    Words that have no vector are skipped. If nothing is left to average
    (empty sentence, or only unknown words) a zero vector is returned, so
    the result always has the same length and never contains NaN.

    Args:
        sentence: text to embed; split on whitespace.
        keyed_vectors: object that supports ``word in kv``, ``kv[word]`` and
            exposes ``vector_size``. Defaults to the module-level
            ``word2vec.wv``.

    Returns:
        A 1-D numpy array of length ``vector_size``.
    """
    kv = word2vec.wv if keyed_vectors is None else keyed_vectors
    embeddings = [kv[w] for w in sentence.split() if w in kv]
    if not embeddings:
        # np.mean over an empty list would give NaN instead of a vector.
        return np.zeros(kv.vector_size)
    return np.mean(embeddings, axis=0)

# One averaged embedding per review, stacked into a single array and scored
# with the shared classifier.
x_vectorized = np.array([vectorize_sentence(review) for review in x_train])
mean_cv_score = cross_val_score(model, x_vectorized, y_train).mean()
score = round(mean_cv_score, 4)
print(score)

0.8178


### Conclusion

The best vectorization model is TfidfVectorizer(ngram_range=(1, 1))

# Train model

Create pipeline with the best vectorizer and classifier. Evaluate it on the whole dataset using cross-validation

In [None]:
pline = Pipeline([('vectorizer', TfidfVectorizer(max_features=max_features)),
                  ('classifier', model)])
# Score on every row of `df`, not on the 5000-row exploration subset held in
# x_train / y_train.
x_all, y_all = df['review'].values, df['label'].values
print(round(cross_val_score(pline, x_all, y_all).mean(), 4))

0.8871
CPU times: user 33.7 s, sys: 6.28 s, total: 40 s
Wall time: 32.8 s


### Save model

Train pipeline on the whole dataset and save it for future use

In [None]:
import pickle

# Fit on every row of `df` before saving; x_train / y_train hold only the
# 5000-row exploration subset.
pline.fit(df['review'].values, df['label'].values)
with open("sentiment-classifier.pickle", "wb") as f:
    pickle.dump(pline, f)