In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import pysrt
import re
from sklearn.utils.class_weight import compute_class_weight

from gensim.models import Word2Vec
from torchtext.data import get_tokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import catboost

from sklearn.model_selection import cross_val_predict

from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

In [2]:
# Regex for angle-bracket markup (e.g. <i>); clean_subs() replaces every match with a space.
HTML = "<.*?>"
# Regex for brace-delimited tags ({...}); clean_subs() replaces every match with a space.
TAG = "{.*?}"
# Negated character class: any character that is NOT an ASCII letter, an
# apostrophe, '.', ',', '!', '?' or a space is replaced with a space by clean_subs().
LETTERS = "[^a-zA-Z\'.,!? ]"
# Default tokenizer used by tokenize(): the one torchtext returns for 'spacy'.
TOKENIZER = get_tokenizer('spacy')
# Word vectors (`.wv`) of a saved gensim Word2Vec model, loaded with mmap='r'.
# vectorize() assumes these vectors are 300-dimensional — TODO confirm against the model file.
EMBEDDINGS = Word2Vec.load("word2vec_literature.model", mmap='r').wv
# Default number of chunks each tokenized subtitle is split into by split_subs().
N_SPLITS = 20

In [3]:
def clean_subs(subs):
    """Normalize raw subtitle strings.

    For every string in ``subs`` the matches of the module-level patterns
    ``HTML``, ``TAG`` and ``LETTERS`` are replaced, in that order, with a
    single space.  The text is then lower-cased and every run of whitespace
    is collapsed to one space (leading/trailing whitespace is dropped).

    Returns a list of cleaned strings in the same order as the input.
    """
    def _normalize(raw):
        stripped = raw
        # The substitution order matters: markup is removed before the
        # character filter would otherwise break the tags apart.
        for pattern in (HTML, TAG, LETTERS):
            stripped = re.sub(pattern, ' ', stripped)
        return ' '.join(stripped.lower().split())

    return [_normalize(raw) for raw in subs]

In [4]:
def tokenize(texts, tokenizer=TOKENIZER):
    """Apply ``tokenizer`` to each element of ``texts``.

    Returns a list holding one tokenizer result per input element, in input
    order.
    """
    return [tokenizer(text) for text in texts]

In [5]:
def vectorize(texts, embeddings=EMBEDDINGS):
    """Embed each token sequence as the sum of its word vectors.

    Parameters
    ----------
    texts : iterable of iterables of tokens
        One inner iterable per text/chunk.
    embeddings : object supporting ``word in embeddings`` and ``embeddings[word]``
        If it exposes a ``vector_size`` attribute (as gensim ``KeyedVectors``
        do), that value is used as the embedding width; otherwise the width
        falls back to 300.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(texts), dim)``.  Row ``i`` is the sum of
        the vectors of the tokens of ``texts[i]`` that are present in
        ``embeddings``; tokens that are absent are skipped, so a text with no
        known tokens yields an all-zero row.  Empty ``texts`` yields an array
        of shape ``(0, dim)``.
    """
    # Materialize once so len() works for any iterable (generators included).
    texts = list(texts)
    dim = getattr(embeddings, 'vector_size', 300)

    # Preallocate the whole result instead of growing it with np.vstack on
    # every iteration, which re-copies all previous rows each time.
    embedded_texts = np.zeros((len(texts), dim))
    for row, text in zip(embedded_texts, texts):
        for word in text:
            if word in embeddings:
                # `row` is a view into `embedded_texts`, so this accumulates in place.
                row += embeddings[word]
    return embedded_texts

In [6]:
def split_subs(matrix, n_splits=N_SPLITS):
    """Turn (label, tokens) rows into per-chunk feature rows with the label appended.

    Parameters
    ----------
    matrix : iterable of rows
        For each row, ``row[0]`` is a numeric label and ``row[1]`` is a
        sequence of tokens.
    n_splits : int
        Number of chunks each token sequence is divided into with
        ``np.array_split``.

    Returns
    -------
    numpy.ndarray
        2-D array with ``n_splits`` rows per input row.  Each row holds the
        ``vectorize`` embedding of one chunk followed by the label of the
        source row as the last column.  An empty ``matrix`` yields an array of
        shape ``(0, 301)`` (300 embedding columns + 1 label column).
    """
    blocks = []
    for row in matrix:
        chunk_vectors = vectorize(np.array_split(row[1], n_splits))
        # Repeat the row's label once per chunk, as a column vector.
        labels = (row[0] * np.ones(n_splits)).reshape(-1, 1)
        blocks.append(np.hstack((chunk_vectors, labels)))

    if not blocks:
        # Nothing to stack: return a correctly shaped empty 2-D array.
        return np.zeros((0, 301))

    # Stack once at the end; stacking inside the loop re-copies every
    # previously accumulated row on each iteration.
    return np.vstack(blocks)

In [7]:
def fit_predict(X_y_train, X_y_test, model):
    """Fit ``model`` on the training matrix and return its flattened test probabilities.

    Both ``X_y_train`` and ``X_y_test`` are 2-D arrays whose last column is the
    target and whose remaining columns are the features.  ``model`` is fitted
    in place on the training part, the accuracy of ``model.predict`` on the
    test part is printed, and the output of ``model.predict_proba`` on the
    test features is returned flattened to 1-D.
    """
    train_features, train_target = X_y_train[:, :-1], X_y_train[:, -1]
    test_features, test_target = X_y_test[:, :-1], X_y_test[:, -1]

    model.fit(train_features, train_target)

    hard_predictions = model.predict(test_features)
    print('Accuracy:', accuracy_score(test_target, hard_predictions))

    probabilities = model.predict_proba(test_features)
    return probabilities.flatten()

In [8]:
# Load the labelled subtitle table.  The preview below shows a `level` column
# (used later as the target) and a `subtitles` column holding raw subtitle text.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,level,subtitles
0,1,Hello.\nMy name's Forrest. Forrest Gump.\nDo y...
1,0,[Music playing]\nAdvertise your product or bra...
2,0,Created and Encoded by -- Bokutox -- of www....
3,0,"Adrian?\nCome on.\n<i>Zeus, I'm sorry.\nI can'..."
4,0,"October is inventory time,\nso right now, Stat..."


In [9]:
%%time
# Replace the raw subtitle text with its cleaned form.
# NOTE: this overwrites data['subtitles'] in place, so the raw text is only
# recoverable by re-running the cell that loads data.csv.
data['subtitles'] = clean_subs(data['subtitles'].values)
data.head()

Wall time: 473 ms


Unnamed: 0,level,subtitles
0,1,hello. my name's forrest. forrest gump. do you...
1,0,music playing advertise your product or brand ...
2,0,created and encoded by bokutox of www.yify tor...
3,0,"adrian? come on. zeus, i'm sorry. i can't take..."
4,0,"october is inventory time, so right now, statl..."


In [10]:
%%time
# Replace each cleaned subtitle string with the tokenizer's output for it.
# NOTE: not idempotent — after this cell the column holds token lists, so
# re-running it without reloading and re-cleaning `data` would feed lists,
# not strings, to the tokenizer.
data['subtitles'] = tokenize(data['subtitles'].values)
data.head()

Wall time: 14 s


Unnamed: 0,level,subtitles
0,1,"[hello, ., my, name, 's, forrest, ., forrest, ..."
1,0,"[music, playing, advertise, your, product, or,..."
2,0,"[created, and, encoded, by, bokutox, of, www.y..."
3,0,"[adrian, ?, come, on, ., zeus, ,, i, 'm, sorry..."
4,0,"[october, is, inventory, time, ,, so, right, n..."


In [11]:
# Map every label in data['level'] to its 'balanced' weight.  The label order
# returned by .unique() is used both as `classes` and as the dict keys, so
# each weight lines up with its own label.
class_weights = dict(
    zip(
        data['level'].unique(),
        compute_class_weight('balanced',
                             classes=data['level'].unique(),
                             y=data['level']),
    )
)
class_weights

{1: 1.2098214285714286,
 0: 2.1171875,
 2: 0.477112676056338,
 3: 1.6524390243902438}

In [12]:
# First-level classifiers.  The two scikit-learn models handle class imbalance
# through class_weight='balanced' and are seeded with random_state=45.
log_reg = LogisticRegression(max_iter=1000,
                             class_weight='balanced',
                             n_jobs=-1,
                             random_state=45)
rf = RandomForestClassifier(n_estimators=500,
                            max_depth=8,
                            min_samples_split=20,
                            class_weight='balanced',
                            n_jobs=-1,
                            random_state=45)
# CatBoost receives the explicit `class_weights` dict computed above.
# NOTE(review): task_type='GPU' makes this model depend on GPU availability,
# and no seed is passed here, unlike the two models above — confirm whether
# that is intentional.
cb = catboost.CatBoostClassifier(iterations=3000,
                                 depth=3,
                                 task_type='GPU',
                                 class_weights=class_weights,
                                 verbose=0)

In [13]:
# ve-e-e-ry slow (4.5 hours in a Kaggle notebook with a P100 GPU)
# LOO (leave-one-out).  The disabled loop below is what produced
# 'probs_data.csv': for every movie it fits the three first-level models on
# all other movies, collects the flattened class probabilities predicted for
# the held-out movie's chunks, and finally appends the true `level` as the
# last column.  It is kept commented out because of its runtime; the cached
# CSV is loaded instead.
# first_lvl_result_matrix = np.empty((1, N_SPLITS*3*4))
# for i in data.index:
#     train = data.drop(i).values
#     train_splitted = split_subs(train)
    
#     test = data.iloc[i, :].values
#     test_splitted = split_subs(test.reshape(1, -1))
    
#     preds = np.empty((1,))
#     preds = np.hstack((preds,
#                       fit_predict(train_splitted,
#                                   test_splitted,
#                                   log_reg)))
#     preds = np.hstack((preds,
#                       fit_predict(train_splitted,
#                                   test_splitted,
#                                   rf)))
#     preds = np.hstack((preds,
#                       fit_predict(train_splitted,
#                                   test_splitted,
#                                   cb)))
#     first_lvl_result_matrix = np.vstack((first_lvl_result_matrix, preds[1:]))
#     print(f'Got {i+1} movies in probs matrix.')

# tmp = np.hstack((first_lvl_result_matrix[1:], data['level'].values.reshape(-1, 1)))
# pd.DataFrame(tmp).to_csv('probs_data.csv', index=False)

# Load the cached first-level probability matrix written by the disabled code
# above (its last column is the true `level`).
data_lvl_2 = pd.read_csv('probs_data.csv')
data_lvl_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,231,232,233,234,235,236,237,238,239,240
0,0.183423,0.397999,0.230217,0.188361,0.472414,0.326689,0.118336,0.082562,0.675336,0.245267,...,0.04236,0.475526,0.280189,0.193366,0.050919,0.44341,0.217275,0.257893,0.081422,1.0
1,0.118254,0.797294,0.081139,0.003312,0.083693,0.838492,0.063007,0.014808,0.02332,0.837934,...,0.126507,0.223469,0.233668,0.419495,0.123368,0.280812,0.279509,0.431991,0.007688,0.0
2,0.248967,0.403072,0.264538,0.083422,0.042433,0.31505,0.417394,0.225124,0.2961,0.608946,...,0.13931,0.196999,0.272871,0.437618,0.092513,0.108728,0.067886,0.71831,0.105076,0.0
3,0.531802,0.33737,0.097458,0.03337,0.036697,0.079475,0.381619,0.502209,0.12565,0.080758,...,0.035498,0.037848,0.030219,0.634646,0.297286,0.049126,0.023834,0.487288,0.439753,0.0
4,0.069862,0.727621,0.118399,0.084119,0.428497,0.229014,0.130995,0.211494,0.070738,0.21288,...,0.01563,0.48425,0.185694,0.242417,0.08764,0.511443,0.176416,0.128165,0.183977,0.0


In [14]:
# Second-level classifier; the following cells cross-validate it on `data_lvl_2`.
# NOTE(review): max_iter is left at the scikit-learn default here, whereas the
# first-level `log_reg` uses max_iter=1000, and all warnings are silenced at
# the top of the notebook — confirm that this model actually converges.
final_log_reg = LogisticRegression(class_weight='balanced',
                                   n_jobs=-1,
                                   random_state=45)

In [15]:
# Cross-validated evaluation of the second-level model: predictions come from
# cross_val_predict with cv=20 and are compared with the true level, which is
# the last column of data_lvl_2.
meta_features = data_lvl_2.iloc[:, :-1]
meta_target = data_lvl_2.iloc[:, -1]
oof_predictions = cross_val_predict(final_log_reg, meta_features, meta_target, cv=20)
print(classification_report(meta_target, oof_predictions))

              precision    recall  f1-score   support

         0.0       0.42      0.47      0.44        32
         1.0       0.45      0.54      0.49        56
         2.0       0.78      0.65      0.71       142
         3.0       0.58      0.68      0.63        41

    accuracy                           0.61       271
   macro avg       0.56      0.59      0.57       271
weighted avg       0.64      0.61      0.62       271



In [16]:
# Root-mean-squared error between the true levels and the cross-validated
# predictions of the second-level model, with the class labels treated as numbers.
print('RMSE:')
meta_features = data_lvl_2.iloc[:, :-1]
meta_target = data_lvl_2.iloc[:, -1]
oof_predictions = cross_val_predict(final_log_reg, meta_features, meta_target, cv=20)
rmse = mean_squared_error(meta_target, oof_predictions) ** 0.5
print(rmse)

RMSE:
0.8460893325598584
