### 6 февраля, подбор параметров для случайного леса

### Загрузка и обработка данных

In [30]:
import pandas as pd
path_to_data = './data/data_with_features'

In [72]:
from utils.make_vectoizer import make_vectorizer
from utils.io_custom import dump_pickle_object


train = pd.read_csv('./data/clear_data/train_data.csv')
new_vectorizer = make_vectorizer(train_data=train,
                                 ngram_range=(3,4),
                                 max_features=4000,
                                 analyzer='char_wb')
dump_pickle_object('./data/redir/vectorizer.obj', new_vectorizer)

In [73]:
from utils.feature_generation import get_data_with_feature


train = get_data_with_feature(train, './data/redir')
train['cosine_dist'] = train['cosine_dist'].fillna(train['cosine_dist'].mean())

In [81]:
validate = pd.read_csv('./data/clear_data/validate_data.csv')
validate = get_data_with_feature(validate, './data/redir')
validate['cosine_dist'] = validate['cosine_dist'].fillna(validate['cosine_dist'].mean())

X_validate = validate.drop(columns=['is_redirect'])
y_validate = validate['is_redirect']

In [112]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['is_redirect']),
    train['is_redirect'],
    stratify=train['is_redirect'],
    random_state=0,
    shuffle=True,
    test_size=0.2
)

### Подбор параметров для случайного леса

In [113]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [120]:
base_rf = RandomForestClassifier(random_state=1)

parameters = {
    'n_estimators': range(10, 100, 10),
    'max_depth': range(1, 12),
    'min_samples_leaf': range(1, 7),
    'min_samples_split': range(2, 9),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', 'balanced_subsample']
}

grid_searh_cv_clf = GridSearchCV(base_rf,
                                 parametrs,
                                 cv=5)

grid_searh_cv_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1),
             param_grid={'max_depth': [2, 5, 7, 10],
                         'n_estimators': [10, 20, 30]})

In [121]:
grid_searh_cv_clf.best_estimator_

RandomForestClassifier(max_depth=10, n_estimators=30, random_state=1)

In [122]:
best_rf = grid_searh_cv_clf.best_estimator_

### Топ 5 используемых признаков

In [123]:
# важность фич (как часто данная фича использовалась и давала максимальное снижеие неопределенности)
feature_importances_df = pd.DataFrame({
        'features': list(X_train),
        'feature_importances': best_rf.feature_importances_
}).sort_values('feature_importances', ascending=False)

feature_importances_df[:5]

Unnamed: 0,features,feature_importances
14,cosine_dist,0.281431
13,lev_dist,0.223942
4,how_match_brands_name_in_query,0.119665
0,len_of_query,0.059715
2,len_of_category,0.043659


### Метрики для обучающей выборки

In [134]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix


predicts_for_train = best_rf.predict(X_train)
predicts_for_test = best_rf.predict(X_test)

print("RF from grid searcher cv:")
print("\nMetrics for train:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_train, predicts_for_train), 4))

print("\nMetrics for test:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_test, predicts_for_test), 4))

RF from grid searcher cv:

Metrics for train:
roc_auc_score 0.9728
f1_score 0.9714
accuracy_score 0.9769

Metrics for test:
roc_auc_score 0.9637
f1_score 0.9603
accuracy_score 0.9678


### Метрики для валидационной выборки

In [136]:
predicts_for_validate = best_rf.predict(X_validate)

print("\nMetrics for validate with default cutoff:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_validate, predicts_for_validate), 4))
    
print('Confusion matrix for validate:')
print(confusion_matrix(y_validate, predicts_for_validate))


Metrics for validate with default cutoff:
roc_auc_score 0.5743
f1_score 0.6054
accuracy_score 0.5085
Confusion matrix for validate:
[[ 31 109]
 [  7  89]]


In [211]:
class_probabilities = best_rf.predict_proba(X_validate)
custom_cutoff = 0.5
predicts_with_custom_cutoff = []

for zero_prob, one_prob in class_probabilities:
        predicts_with_custom_cutoff.append(1 if one_prob > custom_cutoff else 0)

In [212]:
print("\nMetrics for validate with custom cutoff:")
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_validate, predicts_with_custom_cutoff), 4))
    
print('Confusion matrix for validate:')
print(confusion_matrix(y_validate, predicts_with_custom_cutoff))


Metrics for validate with custom cutoff: 0.5
roc_auc_score 0.5743
f1_score 0.6054
accuracy_score 0.5085
Confusion matrix for validate:
[[ 31 109]
 [  7  89]]


### Подбор cutoff

In [226]:
import numpy as np


class_probabilities = best_rf.predict_proba(X_validate)
all_f1_scores = []
all_cutoffs = list(np.arange(0, 1, 0.0001))

for current_cutoff in all_cutoffs:
    predicts_with_current_cutoff = []

    for zero_prob, one_prob in class_probabilities:
        predicts_with_current_cutoff.append(1 if one_prob > current_cutoff else 0)

    all_f1_scores.append(f1_score(y_validate, np.array(predicts_with_current_cutoff)))
    
max_f1_score = max(all_f1_scores)
cutoff_for_max_f1_score = all_cutoffs[np.argmax(all_f1_scores)]

print("Max f1_score is:", max_f1_score)
print("Cutoff is:", cutoff_for_max_f1_score)

Max f1_score is: 0.6164874551971327
Cutoff is: 0.6


### Метрики для валидационной выборки

In [229]:
predicts_with_custom_cutoff = []

for zero_prob, one_prob in class_probabilities:
        predicts_with_custom_cutoff.append(1 if one_prob > cutoff_for_max_f1_score else 0)

print('\nMetrics for validate with custom cutoff:', cutoff_for_max_f1_score)
for metric in [roc_auc_score, f1_score, accuracy_score]:
    print(metric.__name__, round(metric(y_validate, predicts_with_custom_cutoff), 4))
    
print('Confusion matrix for validate with custom cutoff:', cutoff_for_max_f1_score)
print(confusion_matrix(y_validate, predicts_with_custom_cutoff))


Metrics for validate with custom cutoff: 0.6
roc_auc_score 0.6015
f1_score 0.6165
accuracy_score 0.5466
Confusion matrix for validate with custom cutoff: 0.6
[[43 97]
 [10 86]]
