### 5 февраля, случайный лес, обучение и валидация на новых файлах

## Обучение

In [12]:
import pandas as pd
from utils.feature_generation import get_data_with_feature

In [13]:
path_to_data = "./data/redir"

In [14]:
train_data = pd.read_csv(path_to_data + '/better_train_new.csv')

In [15]:
train_data = train_data.drop(columns=["Unnamed: 4", "lev"])
train_data = train_data.query("is_redirect in ['0', '1']")
# костыль с именем категории
train_data['category_id'] = 0
train_data['is_redirect'] = train_data['is_redirect'].map({'1': 1, '0': 0})

In [16]:
train_data.head(3)

Unnamed: 0,query,category_name,is_redirect,category_id
0,консилер clarins instant,MJ Care,0,0
1,палетка теней dior,Тени для век,1,0
2,Armand Basi,Armand Basi,1,0


In [17]:
train_data = get_data_with_feature(train_data, path_to_data)

TypeError: get_relative_depth() missing 1 required positional argument: 'tree'

In [None]:
train_data.head(3)

In [None]:
train_data.columns

In [8]:
X = train_data.drop(columns='is_redirect')
y = train_data['is_redirect']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
    shuffle=True,
    test_size=0.2
)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


def make_features_pipeline(n_jobs):
    pipeline = Pipeline([
        ('clf', RandomForestClassifier(n_jobs=n_jobs, random_state=1))
    ])
    return pipeline

In [10]:
from pipeline.training import SCORING, STRONG_CV
from pipeline.training import fit_pipeline, cross_validate_pipeline


N_JOBS = 1
N_JOBS_CV = 4

pipeline = make_features_pipeline(N_JOBS)

result = cross_validate_pipeline(
    pipeline,
    X_train,
    y_train,
    cv=STRONG_CV,
    scoring=SCORING,
    n_jobs=N_JOBS_CV,
    verbose=1,
)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.1s finished


In [11]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', RandomForestClassifier(n_jobs=1, random_state=1))])

In [21]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix


print("RF from pipeline:")

print("\nMetrics for train:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_train, pipeline.predict(X_train)), 4))

print("\nMetrics for test:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_test, pipeline.predict(X_test)), 4))

RF from pipeline:

Metrics for train:
roc_auc_score 0.9917
f1_score 0.9902
accuracy_score 0.9919

Metrics for test:
roc_auc_score 0.9054
f1_score 0.8901
accuracy_score 0.9104


In [26]:
print('Confusion matrix for train:')
print(confusion_matrix(y_train, pipeline.predict(X_train)))

Confusion matrix for train:
[[1582   11]
 [  11 1117]]


In [27]:
print('Confusion matrix for test:')
print(confusion_matrix(y_test, pipeline.predict(X_test)))

Confusion matrix for test:
[[373  26]
 [ 35 247]]


## Валидация

In [13]:
validate_data = pd.read_csv(path_to_data + '/better_validation.csv')

In [14]:
validate_data

Unnamed: 0,query,category_name,category_id,is_redirect
0,колготки,колготки и носки,3245756,1
1,бады,БАДы,3390215,1
2,бад,БАДы,3390215,1
3,для машины,для посудомоечной машины,3245766,1
4,нишевая,нишевая парфюмерия,3115485,1
...,...,...,...,...
231,колготки 20 den,колготки и носки,3245756,0
232,маска для лица mixit,маски,873249,0
233,маска для лица aha,маски,873249,0
234,тушь для ресниц кабаре,тушь для ресниц,873823,0


In [15]:
def preprocess_validate_data(data):
    data = data.query("is_redirect in ['0', '1']")
    data = data[~data['category_id'].isna()]
    data['category_id'] = data['category_id'].apply(int)
    data['query'] = data['query'].apply(str)
    data['category_name'] = data['category_name'].apply(str)

    return data

In [16]:
validate_data = preprocess_validate_data(validate_data)

In [17]:
validate_data = get_data_with_feature(validate_data, path_to_data)

In [18]:
X_validate = validate_data.drop(columns=['is_redirect'])
y_validate = validate_data['is_redirect']

In [19]:
print("\nMetrics for validate:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_validate, pipeline.predict(X_validate)), 4))


Metrics for validate:
roc_auc_score 0.4449
f1_score 0.1471
accuracy_score 0.5085


In [28]:
print('Confusion matrix for validate:')
print(confusion_matrix(y_validate, pipeline.predict(X_validate)))

Confusion matrix for validate:
[[110  30]
 [ 86  10]]
