In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegressionCV
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from time import time
import lightgbm as lgb
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

seed = 10

#### Загрузка датасета

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/Xtest.csv')
sample = pd.read_csv('data/sample_submission.csv')

In [3]:
train['Password'] = train['Password'].astype(str)
test['Password'] = test['Password'].astype(str)

#### Генерация признаков в трейне

In [4]:
train['len'] = train['Password'].map(lambda x: len(x))
train['cnt_let'] = train['Password'].map(lambda x: sum(c.isalpha() for c in x))
train['cnt_dig'] = train['Password'].map(lambda x: sum(c.isdigit() for c in x))
train['cnt_other'] = train['len'] - train['cnt_let'] - train['cnt_dig']
train['cnt_up'] = train['Password'].map(lambda x: sum(c.isupper() for c in x))
train['cnt_low'] = train['Password'].map(lambda x: sum(c.islower() for c in x))

train['123456'] = train['Password'].map(lambda x: 1 if x.find('123456') != -1 else 0)
train['654321'] = train['Password'].map(lambda x: 1 if x.find('654321') != -1 else 0)
train['qwerty'] = train['Password'].map(lambda x: 1 if x.find('qwerty') != -1 else 0)
train['ytrewq'] = train['Password'].map(lambda x: 1 if x.find('ytrewq') != -1 else 0)
train['word'] = train['Password'].map(lambda x: 1 if wordnet.synsets(x) else 0)
train['1symb'] = train['Password'].map(lambda x: 1 if x.count(x[0]) == len(x) else 0)
train['asdf'] = train['Password'].map(lambda x: 1 if x.find('asdf') != -1 else 0)
train['fdsa'] = train['Password'].map(lambda x: 1 if x.find('fdsa') != -1 else 0)
train['zxcv'] = train['Password'].map(lambda x: 1 if x.find('zxcv') != -1 else 0)
train['vcxz'] = train['Password'].map(lambda x: 1 if x.find('vcxz') != -1 else 0)
train['qaz'] = train['Password'].map(lambda x: 1 if x.find('qaz') != -1 else 0)
train['zaq'] = train['Password'].map(lambda x: 1 if x.find('zaq') != -1 else 0)
train['wsx'] = train['Password'].map(lambda x: 1 if x.find('wsx') != -1 else 0)
train['xsw'] = train['Password'].map(lambda x: 1 if x.find('xsw') != -1 else 0)
train['edc'] = train['Password'].map(lambda x: 1 if x.find('edc') != -1 else 0)
train['cde'] = train['Password'].map(lambda x: 1 if x.find('cde') != -1 else 0)

#### Генерация признаков в тесте

In [5]:
test['len'] = test['Password'].map(lambda x: len(x))
test['cnt_let'] = test['Password'].map(lambda x: sum(c.isalpha() for c in x))
test['cnt_dig'] = test['Password'].map(lambda x: sum(c.isdigit() for c in x))
test['cnt_other'] = test['len'] - test['cnt_let'] - test['cnt_dig']
test['cnt_up'] = test['Password'].map(lambda x: sum(c.isupper() for c in x))
test['cnt_low'] = test['Password'].map(lambda x: sum(c.islower() for c in x))

test['123456'] = test['Password'].map(lambda x: 1 if x.find('123456') != -1 else 0)
test['654321'] = test['Password'].map(lambda x: 1 if x.find('654321') != -1 else 0)
test['qwerty'] = test['Password'].map(lambda x: 1 if x.find('qwerty') != -1 else 0)
test['ytrewq'] = test['Password'].map(lambda x: 1 if x.find('ytrewq') != -1 else 0)
test['word'] = test['Password'].map(lambda x: 1 if wordnet.synsets(x) else 0)
test['1symb'] = test['Password'].map(lambda x: 1 if x.count(x[0]) == len(x) else 0)
test['asdf'] = test['Password'].map(lambda x: 1 if x.find('asdf') != -1 else 0)
test['fdsa'] = test['Password'].map(lambda x: 1 if x.find('fdsa') != -1 else 0)
test['zxcv'] = test['Password'].map(lambda x: 1 if x.find('zxcv') != -1 else 0)
test['vcxz'] = test['Password'].map(lambda x: 1 if x.find('vcxz') != -1 else 0)
test['qaz'] = test['Password'].map(lambda x: 1 if x.find('qaz') != -1 else 0)
test['zaq'] = test['Password'].map(lambda x: 1 if x.find('zaq') != -1 else 0)
test['wsx'] = test['Password'].map(lambda x: 1 if x.find('wsx') != -1 else 0)
test['xsw'] = test['Password'].map(lambda x: 1 if x.find('xsw') != -1 else 0)
test['edc'] = test['Password'].map(lambda x: 1 if x.find('edc') != -1 else 0)
test['cde'] = test['Password'].map(lambda x: 1 if x.find('cde') != -1 else 0)

#### Разделим выборку на трейн, валидацию и тест

In [6]:
X_test = test.drop(['Id', 'Password'], axis=1)
X = train.drop(['Times', 'Password'], axis=1)
y = train['Times']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True)
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

### Случайный лес

In [7]:
%%time 
tree = RandomForestRegressor(random_state=seed, n_jobs=-1)
#tree_params = {'n_estimators': list(range(100,200,100))}
tree_params = {'n_estimators': [10]}

grid = GridSearchCV(tree, tree_params, scoring='neg_mean_squared_error', n_jobs=-1, cv=3)
grid.fit(X_train, y_train_log)

print('лучшие параметры:', grid.best_params_)
print('mae на трейне:', np.sqrt(mean_squared_error(y_train_log, grid.best_estimator_.predict(X_train))))
print('mae на внутренней проверке:', grid.best_score_*(-1))
print('mae на валидации:', np.sqrt(mean_squared_error(y_val_log, grid.best_estimator_.predict(X_val))))

лучшие параметры: {'n_estimators': 10}
mae на трейне: 0.3752669802696074
mae на внутренней проверке: 0.1412571907211412
mae на валидации: 0.3762676395826177
Wall time: 1min 38s


#### Предикт

In [8]:
y_test_log = grid.best_estimator_.predict(X_test)
y_test = np.expm1(y_test_log)

sample['Times'] = y_test
sample.to_csv('rf_log_submit.csv', index=False) 