In [7]:
from datasets import load_dataset
import pandas as pd

# Load the IMDB dataset and materialise each split as a DataFrame
ds = load_dataset("stanfordnlp/imdb")
df_train, df_test = (pd.DataFrame(ds[split]) for split in ('train', 'test'))

# Separate the 'text' column (model input) from the 'label' column (target)
X_train, y_train = df_train['text'], df_train['label']
X_test, y_test = df_test['text'], df_test['label']


In [8]:
# Sanity check: show the first five texts and labels of the train and test splits
X_train.head(), y_train.head(), X_test.head(), y_test.head()

(0    I rented I AM CURIOUS-YELLOW from my video sto...
 1    "I Am Curious: Yellow" is a risible and preten...
 2    If only to avoid making this type of film in t...
 3    This film was probably inspired by Godard's Ma...
 4    Oh, brother...after hearing about this ridicul...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: label, dtype: int64,
 0    I love sci-fi and am willing to put up with a ...
 1    Worth the entertainment value of a rental, esp...
 2    its a totally average film with a few semi-alr...
 3    STAR RATING: ***** Saturday Night **** Friday ...
 4    First off let me say, If you haven't enjoyed a...
 Name: text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: label, dtype: int64)

# TF-IDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Grid of TF-IDF settings: vocabulary cap x n-gram range
max_features = [1000, 5000]
ngram_range = [(1,1), (1,2), (1,3)]

tfidfs = []

# One vectorizer per (max_feature, ngram) pair; it is fitted on the training
# texts only and then applied unchanged to the test texts.
for max_feature, ngram in ((cap, span) for cap in max_features for span in ngram_range):
    tfidf = TfidfVectorizer(ngram_range=ngram, max_features=max_feature)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    tfidfs.append({
        'max_feature': max_feature,
        'ngram': ngram,
        'X_train_tfidf': X_train_tfidf,
        'X_test_tfidf': X_test_tfidf
    })

# Word2Vec (W2V)

In [11]:
from gensim.models import Word2Vec  
import nltk

In [34]:
import numpy as np

def get_average_word2vec(tokens, model, size):
    """Return the mean embedding of ``tokens`` as an array of shape ``(1, size)``.

    Args:
        tokens: iterable of tokens to look up.
        model: object exposing ``wv``, which must support ``token in model.wv``
            and ``model.wv[token]`` returning a vector with ``size`` elements.
        size: embedding dimensionality.

    Returns:
        A ``(1, size)`` float array holding the average of the vectors of all
        tokens found in ``model.wv``; all zeros when no token is found.
    """
    total = np.zeros((1, size))
    known = 0
    for token in tokens:
        # Out-of-vocabulary tokens contribute nothing to the average
        if token not in model.wv:
            continue
        total += model.wv[token].reshape((1, size))
        known += 1
    if known:
        total /= known
    return total

In [32]:
from gensim.models import Word2Vec
from sklearn.preprocessing import scale

# Whitespace tokenization of every document
X_train_tokens = [document.split() for document in X_train]
X_test_tokens = [document.split() for document in X_test]

# Word2Vec hyper-parameter grid
vector_sizes = [50, 100]
windows = [5, 10]
min_counts = [1, 2]

w2v_models = []

# Train one Word2Vec model per grid point (on the training tokens only) and
# represent each document by the average of its token vectors.
for vector_size, window, min_count in (
    (dim, ctx, freq) for dim in vector_sizes for ctx in windows for freq in min_counts
):
    w2v_model = Word2Vec(
        sentences=X_train_tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=4,
    )
    X_train_w2v, X_test_w2v = (
        np.array([get_average_word2vec(doc, w2v_model, vector_size) for doc in corpus])
        for corpus in (X_train_tokens, X_test_tokens)
    )
    w2v_models.append({
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'X_train_w2v': X_train_w2v,
        'X_test_w2v': X_test_w2v
    })

# FastText

In [15]:
from gensim.models import FastText

# FastText model trained on the training tokens
fasttext_model = FastText(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Average the FastText token vectors of each document (train first, then test)
X_train_fasttext, X_test_fasttext = (
    np.array([get_average_word2vec(doc, fasttext_model, 100) for doc in corpus])
    for corpus in (X_train_tokens, X_test_tokens)
)


In [31]:
# Single-entry list of FastText feature matrices (train / test)
fasttexts = [
    dict(X_train_fasttext=X_train_fasttext, X_test_fasttext=X_test_fasttext),
]

# Simple Modelling

In [35]:
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
 
# Hyper-parameter search spaces, one per classifier family
param_grid_lgbm = dict(
    num_leaves=[31, 50],
    learning_rate=[0.05, 0.1],
    n_estimators=[100, 200],
    max_depth=[3, 5],
)

param_grid_xgb = dict(
    max_depth=[3, 5],
    learning_rate=[0.05, 0.1],
    n_estimators=[100, 200],
)

param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
)

param_grid_catboost = dict(
    iterations=[100, 200],
    learning_rate=[0.05, 0.1],
    depth=[3, 5],
)

# Pair each default-constructed estimator with its search space
models = [
    {'model': estimator, 'param_grid': grid}
    for estimator, grid in (
        (LGBMClassifier(), param_grid_lgbm),
        (xgb.XGBClassifier(), param_grid_xgb),
        (RandomForestClassifier(), param_grid_rf),
        (CatBoostClassifier(), param_grid_catboost),
    )
]

In [49]:
# Inspect the stored FastText training features; get_average_word2vec returns
# (1, size) per document, so the stacked array carries a singleton middle axis.
fasttexts[0]['X_train_fasttext'].shape

(25000, 1, 100)

In [50]:
from tqdm import tqdm

SCORES_CSV = 'scores_non_transformer.csv'
SCORE_COLUMNS = ['model', 'accuracy', 'precision', 'recall', 'f1']


def _as_2d(features):
    """Flatten stacked averaged embeddings to ``(n_samples, dim)``.

    The embedding arrays are stored as ``(n_samples, 1, dim)``. The width is
    derived from the array itself rather than hard-coded, so every
    ``vector_size`` is handled correctly (already-2D input is returned with
    the same shape).
    """
    return features.reshape(features.shape[0], -1)


def _grid_search_scores(model, param_grid, X_fit, X_eval, model_name):
    """Tune ``model`` with 5-fold grid search (F1) and score it on the test set.

    Reads the notebook-level ``y_train`` / ``y_test`` labels.

    Args:
        model: estimator instance to tune.
        param_grid: parameter grid passed to ``GridSearchCV``.
        X_fit: training feature matrix aligned with ``y_train``.
        X_eval: test feature matrix aligned with ``y_test``.
        model_name: label stored in the ``'model'`` column.

    Returns:
        dict with the keys listed in ``SCORE_COLUMNS``.
    """
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
    grid_search.fit(X_fit, y_train)
    y_pred = grid_search.predict(X_eval)
    return {
        'model': model_name,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }


def _checkpoint_scores(records):
    """Build the score table from ``records``, write it to CSV and return it."""
    scores = pd.DataFrame(records, columns=SCORE_COLUMNS)
    scores.to_csv(SCORES_CSV, index=False)
    return scores


# Scores are collected as plain records and turned into a DataFrame in one go,
# instead of growing a DataFrame with pd.concat on every iteration.
score_records = []
df_scores = pd.DataFrame(columns=SCORE_COLUMNS)

for model_dict in tqdm(models[:2]):
    model = model_dict['model']
    param_grid = model_dict['param_grid']
    base_name = model.__class__.__name__

    for tfidf_dict in tqdm(tfidfs[:1]):
        model_name = f"{base_name} {tfidf_dict['max_feature']} {tfidf_dict['ngram']}"
        score_records.append(_grid_search_scores(
            model, param_grid,
            tfidf_dict['X_train_tfidf'], tfidf_dict['X_test_tfidf'],
            model_name,
        ))
        # Checkpoint after every run so a crash does not lose finished results
        df_scores = _checkpoint_scores(score_records)

    for w2v_dict in tqdm(w2v_models[:1]):
        model_name = (
            f"{base_name} {w2v_dict['vector_size']} "
            f"{w2v_dict['window']} {w2v_dict['min_count']}"
        )
        score_records.append(_grid_search_scores(
            model, param_grid,
            _as_2d(w2v_dict['X_train_w2v']), _as_2d(w2v_dict['X_test_w2v']),
            model_name,
        ))
        df_scores = _checkpoint_scores(score_records)

    for fasttext_dict in tqdm(fasttexts[:1]):
        model_name = base_name + ' FastText'
        score_records.append(_grid_search_scores(
            model, param_grid,
            _as_2d(fasttext_dict['X_train_fasttext']),
            _as_2d(fasttext_dict['X_test_fasttext']),
            model_name,
        ))
        df_scores = _checkpoint_scores(score_records)

df_scores = _checkpoint_scores(score_records)



  0%|          | 0/2 [00:00<?, ?it/s]

[A[A

[A[A

100%|██████████| 1/1 [08:09<00:00, 489.21s/it]


[A[A

100%|██████████| 1/1 [00:59<00:00, 59.87s/it]


[A[A

100%|██████████| 1/1 [01:50<00:00, 110.65s/it]

[A

[A[A

100%|██████████| 1/1 [24:35<00:00, 1475.71s/it]


[A[A

100%|██████████| 1/1 [07:34<00:00, 454.61s/it]


[A[A

100%|██████████| 1/1 [14:38<00:00, 878.61s/it]

100%|██████████| 2/2 [57:48<00:00, 1734.34s/it]


In [None]:
df_scores