In [13]:
import gensim
import numpy as np
import pandas as pd
import warnings
import wget
import zipfile

# custom scripts
from data_preparation import get_X_y_type, get_X_for_rules
from model_fitting import *

from sklearn.externals import joblib
from sklearn.metrics import f1_score
from sklearn.svm import SVC

warnings.simplefilter("ignore")

Preparing semantic model:

In [2]:
with zipfile.ZipFile("./data/models/180.zip", "r") as archive:
    stream = archive.open("model.bin")
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

most_similar_entrance = set([similar[0] for similar in model.most_similar("входить_VERB")])
most_similar_exit = set([similar[0] for similar in model.most_similar("уходить_VERB")])

def vec_similarity(v1, v2):
    v1_norm = gensim.matutils.unitvec(np.array(v1).astype(float))
    v2_norm = gensim.matutils.unitvec(np.array(v2).astype(float))
    return np.dot(v1_norm, v2_norm)

def get_w2v_vectors(text, model):
    total_counter = 0
    total_vector = np.zeros(300)
    for word in text:
        try:
            vector = np.array(model.wv[word])
            total_vector += vector
            total_counter += 1
        except:
            continue
    res_vector = total_vector / total_counter
    return res_vector

Decision function:

In [3]:
def assign_tei_type(direction):
    tei_type = ""
    verbs_list = [item for item in direction if item.endswith("VERB")]
    for verb in verbs_list:
        # case 1 -- verb in most_common for a type
        # in this case, we return the type immediately
        if verb in most_similar_entrance:
            tei_type = "entrance"
            return tei_type
        elif verb in most_similar_exit:
            tei_type = "exit"
            return tei_type
        # case 2 -- verb is unknown
        else:
            direction_vector = get_w2v_vectors(verbs_list, model)
            similarity_entrance = vec_similarity(direction_vector, model.wv["войти_VERB"])
            similarity_exit = vec_similarity(direction_vector, model.wv["уходить_VERB"])
            if similarity_entrance > similarity_exit:
                tei_type = "entrance"
            else:
                tei_type = "exit"
    return tei_type

Classificator:

In [4]:
def rule_based_classificator(X_list, goal_label):
    y_list = []
    for X in X_list:
        label = assign_tei_type(X)
        if label == goal_label:
            y_list.append(1)
        else:
            y_list.append(0)
    return np.array(y_list)

Running classificator on all subsets:

In [5]:
X_train, y_train = get_X_y_type("entrance", "train")
X_valid, y_valid = get_X_y_type("entrance", "val")
X_test, y_test = get_X_y_type("entrance", "test")

In [8]:
X_train = get_X_for_rules("entrance", "train")
y_pred = rule_based_classificator(X_train, "entrance")
f1_train = f1_score(y_pred, y_train)
print("score for train is {:.6f}".format(f1_train))

score for train is 0.320973


In [9]:
X_valid = get_X_for_rules("entrance", "val")
y_pred = rule_based_classificator(X_valid, "entrance")
f1_valid = f1_score(y_pred, y_valid)
print("score for validation is {:.6f}".format(f1_valid))

score for validation is 0.380682


In [10]:
X_train, y_train = get_X_y_type("exit", "train")
X_valid, y_valid = get_X_y_type("exit", "val")
X_test, y_test = get_X_y_type("exit", "test")

In [11]:
X_train = get_X_for_rules("exit", "train")
y_pred = rule_based_classificator(X_train, "exit")
f1_train = f1_score(y_pred, y_train)
print("score for exit train is {:.6f}".format(f1_train))

score for exit train is 0.309677


In [12]:
X_valid = get_X_for_rules("exit", "val")
y_pred = rule_based_classificator(X_valid, "exit")
f1_valid = f1_score(y_pred, y_valid)
print("score for exit  validation is {:.6f}".format(f1_valid))

score for exit  validation is 0.311512


# 1 &emsp; Entrance

In [15]:
X_train, y_train = get_X_y_type("entrance", "train")
X_valid, y_valid = get_X_y_type("entrance", "val")
X_test, y_test = get_X_y_type("entrance", "test")

Fitted ML models:

In [8]:
entrance_dict, fitted_models = models_for_type(X_train, y_train, 
                                              X_valid, y_valid, 
                                              X_test, y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.5min finished


Model LogReg scored 0.675830 on cross-validation with params:
{'C': 1.1090909090909091}
Model LogReg scored 0.834008 on validation set
Fitting 5 folds for each of 198 candidates, totalling 990 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 990 out of 990 | elapsed:  6.6min finished


Model Decision Tree scored 0.635451 on cross-validation with params:
{'criterion': 'gini', 'max_depth': 12}
Model Decision Tree scored 0.888000 on validation set
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.3min finished


Model Random Forest scored 0.648480 on cross-validation with params:
{'n_estimators': 41}
Model Random Forest scored 0.926070 on validation set


In [9]:
X_test = get_X_for_rules("entrance")
y_rules = rule_based_classificator(X_test, "entrance")
rules_score = f1_score(y_rules, y_test)

In [17]:
entrance_dict["model"].append("rule")

entrance_dict["cross-val"].append(0)

y_valid_rules = rule_based_classificator(X_valid, "entrance")
valid_score = f1_score(y_valid_rules, y_valid)
entrance_dict["validation"].append(0)

entrance_dict["test"].append(rules_score)

NameError: name 'entrance_dict' is not defined

In [None]:
entrance_df = pd.DataFrame.from_dict(entrance_dict)

In [21]:
X_train, y_train = get_X_y_type("entrance", "train")
X_valid, y_valid = get_X_y_type("entrance", "val")
X_test, y_test = get_X_y_type("entrance", "test")

In [8]:
X_train = get_X_for_rules("entrance", "train")
y_pred = rule_based_classificator(X_train, "entrance")
f1_train = f1_score(y_pred, y_train)
print("score for train is {:.6f}".format(f1_train))

score for train is 0.320973


In [9]:
X_valid = get_X_for_rules("entrance", "val")
y_pred = rule_based_classificator(X_valid, "entrance")
f1_valid = f1_score(y_pred, y_valid)
print("score for validation is {:.6f}".format(f1_valid))

score for validation is 0.380682


In [None]:
best_model = fitted_models[best_model_name]
joblib.dump(best_model, "./data/models/entrance_final.pkl");

## 2 &emsp; Exit

In [13]:
X_train, y_train = get_X_y_type("exit", "train")
X_valid, y_valid = get_X_y_type("exit", "val")
X_test, y_test = get_X_y_type("exit", "test")

In [14]:
exit_dict, fitted_models = models_for_type(X_train, y_train, 
                                              X_valid, y_valid, 
                                              X_test, y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.5min finished


Model LogReg scored 0.707451 on cross-validation with params:
{'C': 0.1}
Model LogReg scored 0.753846 on validation set
Fitting 5 folds for each of 198 candidates, totalling 990 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 990 out of 990 | elapsed:  4.8min finished


Model Decision Tree scored 0.695571 on cross-validation with params:
{'criterion': 'gini', 'max_depth': 6}
Model Decision Tree scored 0.822695 on validation set
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.1min finished


Model Random Forest scored 0.717889 on cross-validation with params:
{'n_estimators': 73}
Model Random Forest scored 0.966887 on validation set


In [15]:
X_test = get_X_for_rules("exit", "test")
y_test = rule_based_classificator(X_test, "exit")
test_score = f1_score(y_rules, y_test)

X_test = get_X_for_rules("exit")
y_test = rule_based_classificator(X_test, "exit")
test_score = f1_score(y_rules, y_test)

In [16]:
exit_dict["model"].append("rule")
exit_dict["cross-val"].append(test_score)
exit_dict["validation"].append(0)
exit_dict["test"].append(rules_score)

In [17]:
exit_df = pd.DataFrame.from_dict(exit_dict)
exit_df.head()

Unnamed: 0,model,cross-val,validation,test
0,LogReg,0.707451,0.753846,0.725552
1,Decision Tree,0.695571,0.822695,0.713846
2,Random Forest,0.717889,0.966887,0.710526
3,rule,0.0,0.0,0.284533


In [19]:
best_model_name = exit_df.iloc[exit_df["test"].argmax()]["model"]
print(best_model_name)

LogReg


In [None]:
best_model = fitted_models[best_model_name]
joblib.dump(best_model, "./data/models/entrance_final.pkl");