In [1]:
import numpy as np
import pandas as pd

import datetime as dt
from datetime import date

import string
import re

import matplotlib.pyplot as plt
import seaborn as sns

import sys
import warnings

import lightgbm as lgb
import shap

from sklearn.metrics import mean_squared_error, mean_squared_log_error
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
# Read the Password column strictly as text. With pandas' default NA handling,
# literal passwords such as "null", "NA" or "nan" are turned into missing values,
# and dtype inference may parse digit-only passwords as numbers.
# Default NA parsing is switched off; an empty 'Times' field is still read as NaN.
df_train_full = pd.read_csv(
    'train.csv',
    dtype={'Password': str},
    keep_default_na=False,
    na_values={'Times': ['']},
)
df_test = pd.read_csv(
    'Xtest.csv',
    dtype={'Password': str},
    keep_default_na=False,
)

In [3]:
# Keep a 70% sample of the training data (the machine cannot handle more).
# The two groups (Times == 1 and Times > 1) are sampled separately so that
# their balance is preserved, then the result is shuffled.
# NOTE: this cell overwrites df_train_full, so re-running it samples again.

df_train_full = pd.concat(
    [
        df_train_full[group_mask].sample(frac=0.7, random_state=7)
        for group_mask in (df_train_full['Times'] == 1, df_train_full['Times'] > 1)
    ]
).sample(frac=1, random_state=7)

## Feature Engineering

In [4]:
# Password length in characters.

for frame in (df_train_full, df_test):
    frame['len'] = frame['Password'].str.len()

In [5]:
# 0/1 indicator flags: lower-case only, upper-case only, letters only, etc.
# 'lower' / 'upper' are 1 when changing the case leaves the password unchanged.
# The remaining flags come from the matching pandas .str predicate; a missing
# result compares unequal to True and therefore becomes 0.

for frame in (df_train_full, df_test):
    pwd = frame['Password']
    frame['lower'] = (pwd.str.lower() == pwd).astype(int)
    frame['upper'] = (pwd.str.upper() == pwd).astype(int)
    for column, predicate in [
        ('alpha', 'isalpha'),
        ('isnumeric', 'isnumeric'),
        ('isalnum', 'isalnum'),
        ('title', 'istitle'),
    ]:
        frame[column] = (getattr(pwd.str, predicate)() == True).astype(int)

In [6]:
# Count how many times each tracked character occurs in a password.

def string_vectorizer(strng, characters=(string.ascii_lowercase + string.digits + string.punctuation)):
    """Return case-insensitive occurrence counts of `characters` in `strng`.

    Parameters
    ----------
    strng : str
        Text to analyse; it is lower-cased before counting.
    characters : iterable of str
        Items to count, in output order. Defaults to ASCII lower-case letters,
        digits and punctuation.

    Returns
    -------
    list of int
        One count per item of `characters`, in the same order.
    """
    # Lower-case once instead of once per counted character.
    lowered = strng.lower()
    return [lowered.count(char) for char in characters]

# Add one count column per tracked character (letters, digits, punctuation).
_count_columns = list(string.ascii_lowercase + string.digits + string.punctuation)

for df_ in [df_train_full, df_test]:
    # str() guards against non-string entries (e.g. NaN) that have no .lower().
    # Iterating the column directly avoids one .iloc lookup per row, and no
    # NaN placeholder columns are needed (np.NaN no longer exists in NumPy 2).
    counts = np.array(
        [string_vectorizer(str(password)) for password in df_['Password']],
        dtype=np.int64,
    ).reshape(len(df_), len(_count_columns))  # keeps a 2-D shape for an empty frame
    # Values are assigned by position, so row i receives the counts of row i.
    for position, ch in enumerate(_count_columns):
        df_[ch] = counts[:, position]

In [7]:
# Number of distinct characters and total characters per class (letters,
# digits, punctuation), plus a few natural ratios between them.
# All of these are derived from the per-character count columns.

_letters = list(string.ascii_lowercase)
_digits = list(string.digits)
_puncts = list(string.punctuation)
_char_groups = [('ch', _letters), ('dig', _digits), ('punc', _puncts)]

for frame in (df_train_full, df_test):
    # Distinct tracked characters overall, then per class.
    frame['len_set'] = (frame[_letters + _digits + _puncts] > 0).astype(int).sum(axis=1)
    for suffix, group_cols in _char_groups:
        frame['len_set_' + suffix] = (frame[group_cols] > 0).astype(int).sum(axis=1)

    # Total occurrences per class.
    for suffix, group_cols in _char_groups:
        frame['len_' + suffix] = frame[group_cols].sum(axis=1)

    # Share of distinct characters in the whole password.
    frame['ratio_unique'] = frame['len_set'] / frame['len']

    # Share of each class among the distinct characters.
    for suffix, _ in _char_groups:
        frame['ratio_unique_' + suffix] = frame['len_set_' + suffix] / frame['len_set']

    # Share of each class in the password length.
    for suffix, _ in _char_groups:
        frame['ratio_' + suffix] = frame['len_' + suffix] / frame['len']

In [9]:
# Flag the presence of a few special character sequences.
# Digit sequences are searched in the raw password, keyboard/word sequences
# in its lower-cased form. A missing result becomes 0.

for frame in (df_train_full, df_test):
    raw = frame['Password']
    for pattern in ['123', '111', '234', '987', '432']:
        frame[pattern] = (raw.str.contains(pattern) == True).astype(int)

    lowered = raw.str.lower()
    for pattern in ['qwe', 'qaz', 'pass', 'abc', 'asd']:
        frame[pattern] = (lowered.str.contains(pattern) == True).astype(int)

In [10]:
# Build the list of model features.
# Columns are excluded by name rather than by position, so that re-running this
# cell after the 'target' column has been added cannot leak it into the features.

_non_feature_columns = {'Password', 'Times', 'target'}
list_feat = [col for col in df_train_full.columns if col not in _non_feature_columns]
print(list_feat, len(list_feat))

['len', 'lower', 'upper', 'alpha', 'isnumeric', 'isalnum', 'title', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'len_set', 'len_set_ch', 'len_set_dig', 'len_set_punc', 'len_ch', 'len_dig', 'len_punc', 'ratio_unique', 'ratio_unique_ch', 'ratio_unique_dig', 'ratio_unique_punc', 'ratio_ch', 'ratio_dig', 'ratio_punc', '123', '111', '234', '987', '432', 'qwe', 'qaz', 'pass', 'abc', 'asd'] 99


## Cross-Validation

In [12]:
# Transform the target: the model is trained on log(Times + 1).

df_train_full['target'] = np.log(1 + df_train_full['Times'])

In [13]:
# Split into train and validation: the first 80% of rows go to train,
# the remaining rows to validation.

n_train_rows = int(0.8 * len(df_train_full))
df_train = df_train_full[:n_train_rows]
df_valid = df_train_full[n_train_rows:]

print('Train: ', len(df_train), '\nValid: ', len(df_valid), '\nTest: ', len(df_test))

Train:  2324837 
Valid:  581210 
Test:  1037875


In [14]:
# Metrics used for evaluation.

def rmsle(y_test, predictions):
    """Return the root of sklearn's mean squared logarithmic error."""
    msle = mean_squared_log_error(y_test, predictions)
    return np.sqrt(msle)


def rmse(y_test, predictions):
    """Return the root of sklearn's mean squared error."""
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)

In [15]:
# Hyperparameter search for LightGBM with hyperopt (TPE).

def objective(params):
    """Score one sampled hyperparameter set with 3-fold out-of-fold predictions.

    Parameters
    ----------
    params : dict
        One sample drawn from `space`. The integer-valued LightGBM settings
        are cast with int() before use.

    Returns
    -------
    float
        RMSE between df_train['target'] and the out-of-fold predictions
        (printed under the label "RMSLE").
    """
    integer_keys = ('num_leaves', 'min_child_samples', 'n_estimators')
    # Key order is kept fixed because the dict is printed below.
    ordered_keys = (
        'num_leaves',
        'learning_rate',
        'min_child_samples',
        'feature_fraction',
        'bagging_fraction',
        'lambda_l2',
        'lambda_l1',
        'n_estimators',
    )
    model_params = {
        key: int(params[key]) if key in integer_keys else params[key]
        for key in ordered_keys
    }

    gbm = lgb.LGBMRegressor(
        min_data_per_group=2,
        reg_sqrt=True,
        num_threads=8,
        objective='mse',
        bagging_freq=5,
        boost_from_average=True,
        seed=42,
        **model_params
    )

    folds = KFold(n_splits=3, random_state=18, shuffle=True)
    oof_predictions = cross_val_predict(
        gbm,
        df_train[list_feat].values,
        df_train['target'],
        cv=folds,
    )
    score = rmse(df_train['target'], oof_predictions)

    print("RMSLE {:.3f} params {}".format(score, model_params))
    return score


# Search space: every dimension is a quantised uniform (name, low, high, step).
space = {
    name: hp.quniform(name, low, high, step)
    for name, low, high, step in [
        ('n_estimators', 10, 500, 1),
        ('learning_rate', 0.01, 0.8, 0.001),
        ('min_child_samples', 1, 20, 1),
        ('num_leaves', 8, 128, 1),
        ('feature_fraction', 0.05, 1.0, 0.05),
        ('bagging_fraction', 0.1, 1.0, 0.05),
        ('lambda_l2', 0.001, 1.5, 0.001),
        ('lambda_l1', 0.001, 1.5, 0.001),
    ]
}

best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
)

RMSLE 0.377 params {'num_leaves': 32, 'learning_rate': 0.438, 'min_child_samples': 12, 'feature_fraction': 0.65, 'bagging_fraction': 0.8500000000000001, 'lambda_l2': 1.167, 'lambda_l1': 0.97, 'n_estimators': 44}
RMSLE 0.375 params {'num_leaves': 106, 'learning_rate': 0.766, 'min_child_samples': 19, 'feature_fraction': 0.75, 'bagging_fraction': 0.65, 'lambda_l2': 0.907, 'lambda_l1': 0.752, 'n_estimators': 182}
RMSLE 0.377 params {'num_leaves': 74, 'learning_rate': 0.023, 'min_child_samples': 12, 'feature_fraction': 0.35000000000000003, 'bagging_fraction': 0.6000000000000001, 'lambda_l2': 1.046, 'lambda_l1': 1.318, 'n_estimators': 444}
RMSLE 0.373 params {'num_leaves': 50, 'learning_rate': 0.456, 'min_child_samples': 17, 'feature_fraction': 0.75, 'bagging_fraction': 1.0, 'lambda_l2': 1.143, 'lambda_l1': 0.9430000000000001, 'n_estimators': 273}
RMSLE 0.374 params {'num_leaves': 89, 'learning_rate': 0.47900000000000004, 'min_child_samples': 17, 'feature_fraction': 0.5, 'bagging_fraction': 

In [16]:
print("Hyperopt estimated optimum {}".format(best))

# fmin reports the integer-valued settings as floats (e.g. 443.0), so they are
# cast back to int before being handed to LightGBM.
_integer_keys = ('num_leaves', 'min_child_samples', 'n_estimators')
best = {
    key: int(best[key]) if key in _integer_keys else best[key]
    for key in (
        'num_leaves',
        'learning_rate',
        'min_child_samples',
        'feature_fraction',
        'bagging_fraction',
        'lambda_l2',
        'lambda_l1',
        'n_estimators',
    )
}

# Same fixed settings as the model scored inside the search objective.
_fixed_settings = dict(
    min_data_per_group=2,
    reg_sqrt=True,
    num_threads=8,
    objective='mse',
    bagging_freq=5,
    boost_from_average=True,
    seed=42,
)
gbm2 = lgb.LGBMRegressor(**_fixed_settings, **best)

Hyperopt estimated optimum {'bagging_fraction': 0.9, 'feature_fraction': 0.65, 'lambda_l1': 1.053, 'lambda_l2': 0.627, 'learning_rate': 0.362, 'min_child_samples': 14.0, 'n_estimators': 443.0, 'num_leaves': 123.0}


In [17]:
# Check accuracy on the validation split.

fitted = gbm2.fit(df_train[list_feat].values, df_train['target'])

# Undo the log(Times + 1) transform; predictions are floored at 1.
valid_predictions = np.clip(np.exp(fitted.predict(df_valid[list_feat].values)) - 1, 1, np.inf)

# df_valid is a slice of df_train_full: assign() returns a new frame instead of
# writing a column into the slice (chained assignment).
df_valid = df_valid.assign(predict=valid_predictions)

# rmsle_valid is a single rounded scalar; it is reused in the submission file name.
rmsle_valid = np.round(rmsle(df_valid['Times'], df_valid['predict']), 4)
print(rmsle_valid)

0.3687


## Prediction

In [18]:
# Refit on the whole sampled training set, predict the test set and write the
# submission file.

gbm2.fit(df_train_full[list_feat].values, df_train_full['target'])

# Pass a plain array to predict(), consistent with how the model is fitted
# (fit receives .values, i.e. no column names).
test_predictions = gbm2.predict(df_test[list_feat].values)

# Undo the log(Times + 1) transform; predictions are floored at 1.
df_test['Times'] = np.clip(np.exp(test_predictions) - 1, 1, np.inf)

# File name carries today's date and the validation score.
submission_path = 'sumbit_sport_6_' + str(date.today()) + '_' + str(rmsle_valid) + '.csv'
df_test[['Id', 'Times']].to_csv(submission_path, index=False)

## Best Models Blend

In [None]:
#df1 = pd.read_csv('sumbit_sport_6_2019-10-06_0.3748951815393266.csv')
#df2 = pd.read_csv('sumbit_sport_5_2019-10-05_0.37305721354473675.csv')
#df3 = pd.read_csv('sumbit_sport_4_2019-10-05_0.3733657824992114.csv')
#df_concat = pd.concat((df1, df2, df3))
#by_row_index = df_concat.groupby(df_concat.index)
#df_means = by_row_index.mean()
#df_means[['Id', 'Times']].to_csv('submit_sport_mean_5.csv', index=False)