In [56]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from math import sqrt
import math
import collections
import warnings
warnings.filterwarnings("ignore")

###Обьединив train и test мы сможем точнее понять взаимосвязь с исходной выборкой и для удобства обработки (не надо повторять действия отдельно для train, отдельно для test)

In [18]:
#Считываем тренировочную и тестовую выборку
train = pd.read_csv('train.csv', sep = ',')
test = pd.read_csv('Xtest.csv', sep = ',')
top_1_billion = pd.read_csv('10-million-password-list-top-1000000.txt', header = None, names = ['Password'])

train['Password'] = train['Password'].astype(str)
test['Password'] = test['Password'].astype(str)

In [19]:
train.head()

Unnamed: 0,Password,Times
0,631XniVx2lS5I,2
1,LEGIT747,1
2,742364es,1
3,3846696477,1
4,laurahop,2


In [20]:
test.head()

Unnamed: 0,Id,Password
0,0,ThaisCunha
1,1,697775113
2,2,922a16922a
3,3,andy74
4,4,joemack


In [21]:
#обьединяем данные
data = pd.concat([train, test], axis = 0, ignore_index = True)
del train, test
data.head()

Unnamed: 0,Id,Password,Times
0,,631XniVx2lS5I,2.0
1,,LEGIT747,1.0
2,,742364es,1.0
3,,3846696477,1.0
4,,laurahop,2.0


In [8]:
top_1_billion.head()

Unnamed: 0,Password
0,123456
1,password
2,12345678
3,qwerty
4,123456789


In [22]:
top_1_billion['N'] = top_1_billion.index
result = pd.merge(data, top_1_billion, on ='Password', how = 'left')
result = result.sort_values(by = 'Times', ascending=False).reset_index(drop = True)
result.head()

Unnamed: 0,Id,Password,Times,N
0,,123456,55893.0,0.0
1,,qwerty,13137.0,3.0
2,,123456789,11696.0,4.0
3,,12345,10938.0,5.0
4,,1234,6432.0,6.0


In [23]:
#Посмотрим на 15 первых значений, для которых не подтянулись значения из top_1_billion
result[result['N'].isnull()][0:15]

Unnamed: 0,Id,Password,Times,N
23142,,,27.0,
238747,,sou,3.0,
274270,,,3.0,
355485,,tenho,2.0,
377915,,HPP187,2.0,
482954,,heins,2.0,
514818,,de,2.0,
580596,,15306012,2.0,
604341,,relgiez1,1.0,
604342,,794156,1.0,


### Замечаем, что:
   1. Только 8 значений имеют частоту больше 1, которых нет в топ_1_миллион, поэтому будем считать их выбросами и удалим их
   2. Для значений с times == 1, присвоим N любое число, главное, чтобы оно было больше N* = max(N) for times == 2 

In [30]:
#удаляем выбросы
list_index_del = result[result['N'].isnull()][0:8].index.tolist()
result.drop(list_index_del, axis=0, inplace = True)

In [29]:
#Проверим пароли на дубли
df = result['Password'].duplicated()
df[df == True]

Series([], Name: Password, dtype: bool)

In [36]:
#2. find max(N) where times == 2
max(result['N'][result.Times == 2])

755987.0

In [37]:
result['N'].fillna(1000000, inplace = True)

### Прогнозная модель

In [180]:
#Разделим обратно на train и test
train = result[result['Id'].isnull()]
train.drop('Id', axis = 1, inplace = True)
test = result[result['Id'].notnull()].reset_index(drop = True)

In [55]:
train_X = train[train.columns.difference(['Password','Times'])]
train_Y = train['Times']

X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, train_size = 0.8, random_state = 10)

In [227]:
params = {
    'random_sate':1,
    
    'metric':'rmse',
    'objective':'regression'
}

In [209]:
train_lgb = lgb.Dataset(X_train, np.log1p(y_train))
test_lgb = lgb.Dataset(X_test, np.log1p(y_test))

In [210]:
res =  {}
res = lgb.train(
    params=params,
    train_set=train_lgb,
    valid_sets=[train_lgb,test_lgb],
    valid_names=['train','test'],
    num_boost_round=100,
    evals_result=res,
    verbose_eval=10,
)

[10]	train's rmse: 0.0661174	test's rmse: 0.0652335
[20]	train's rmse: 0.0493309	test's rmse: 0.0490401
[30]	train's rmse: 0.0490969	test's rmse: 0.0488978
[40]	train's rmse: 0.0490941	test's rmse: 0.0489049
[50]	train's rmse: 0.049094	test's rmse: 0.048906
[60]	train's rmse: 0.049094	test's rmse: 0.0489061
[70]	train's rmse: 0.049094	test's rmse: 0.0489061
[80]	train's rmse: 0.049094	test's rmse: 0.0489061
[90]	train's rmse: 0.049094	test's rmse: 0.0489061
[100]	train's rmse: 0.049094	test's rmse: 0.0489061


In [228]:
cv_results = lgb.cv(
        params,
        train_lgb,
        num_boost_round=100,
        nfold = 5,
        metrics='rmse',
        #early_stopping_rounds=10,
        verbose_eval= 10,
        # This is what I added
        stratified = False
        )

[10]	cv_agg's rmse: 0.150015 + 0.000800643
[20]	cv_agg's rmse: 0.0696589 + 0.00115508
[30]	cv_agg's rmse: 0.0520205 + 0.00126105
[40]	cv_agg's rmse: 0.0494504 + 0.00122133
[50]	cv_agg's rmse: 0.0491294 + 0.00119269
[60]	cv_agg's rmse: 0.0490905 + 0.00118093
[70]	cv_agg's rmse: 0.0490858 + 0.00117662
[80]	cv_agg's rmse: 0.0490853 + 0.00117509
[90]	cv_agg's rmse: 0.0490852 + 0.00117455
[100]	cv_agg's rmse: 0.0490852 + 0.00117437


In [254]:
def rmsle(y_test, y_pred): 
    assert len(y_test) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

In [221]:
rmsle_train = rmsle(y_train, np.expm1(model.predict(X_train)))
rmsle_test = rmsle(y_test, np.expm1(model.predict(X_test)))

rmsle_train, rmsle_test

(0.021170928034277281, 0.021159615664498956)

In [168]:
#test.drop(['Password', 'Times'], axis = 1, inplace = True)

In [181]:
test.head()

Unnamed: 0,Id,Password,Times,N
0,0.0,ThaisCunha,,1000000.0
1,1.0,697775113,,1000000.0
2,2.0,922a16922a,,1000000.0
3,3.0,andy74,,1000000.0
4,4.0,joemack,,298373.0


In [171]:
predict = np.expm1(model.predict(test[X_train.columns]))
results = pd.concat([test['Id'], pd.DataFrame(predict)], names = ['Id', 'N'],  axis = 1, ignore_index= True)
rename_columns = ['Id', 'Times']
results.columns = rename_columns
results.to_csv("password2_sub_1.csv", sep=',', index = False )

In [258]:
test.head()

Unnamed: 0,Id,Password,Times,N
0,0.0,ThaisCunha,,1000000.0
1,1.0,697775113,,1000000.0
2,2.0,922a16922a,,1000000.0
3,3.0,andy74,,1000000.0
4,4.0,joemack,,298373.0


In [188]:
predict = np.expm1(model.predict(test[X_train.columns]))
results = pd.concat([test, pd.DataFrame(predict)], axis = 1, ignore_index= True)

In [190]:
col = ['Id', 'Password', 'Times', 'N', 'predict']

In [250]:
results.columns = col
results.sort_values(by = 'N').head()

Unnamed: 0,Id,Password,Times,N,predict
259430,259430.0,password,,1.0,235.060166
1025249,1025249.0,12345678,,2.0,235.060166
266444,266444.0,monkey,,14.0,235.060166
244023,244023.0,696969,,16.0,235.060166
142507,142507.0,123321,,21.0,235.060166


In [282]:
#введем cv как более точную оценку прогноза модели
from sklearn.metrics import make_scorer
rmsle_scorer = make_scorer(rmsle, greater_is_better = False)

def rmsle_cross_valid (model):
    scores = cross_val_score(model, train_X, np.log1p(train_Y), cv = 5, scoring = rmsle_scorer)
    return abs(scores)

In [None]:
#Random_Forest
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs=4, n_estimators=50, min_samples_leaf = 1, random_state = 1)
model.fit(X_train, np.log1p(y_train)) # обучение

rmse_train = rmsle(y_train, np.around(np.expm1(model.predict(X_train))))
rmse_test = rmsle(y_test, np.around(np.expm1(model.predict(X_test))))

print ('RMSLE при обычном split на train: {:.5f}'.format(rmse_train))
print ('RMSLE при обычном split на test: {:.5f}'.format(rmse_test))
#print ('Значения RMSLE на cross-validation: {}'.format(rmsle_cross_valid(model)))
print ('Среднее значение (RMSLE) на cross-validation: {:.5f}'.format(rmsle_cross_valid(model).mean()))

RMSLE при обычном split на train: 0.00054
RMSLE при обычном split на test: 0.00073


In [285]:
predict = np.expm1(model.predict(test[X_train.columns]))
results = pd.concat([test['Id'].astype(int), np.around(pd.DataFrame(predict))], names = ['Id', 'N'],  axis = 1, ignore_index= True)
rename_columns = ['Id', 'Times']
results.columns = rename_columns
results.to_csv("password2_sub_2_RF.csv", sep=',', index = False )

In [281]:
train[0:15]

Unnamed: 0,Password,Times,N
0,123456,55893.0,0.0
1,qwerty,13137.0,3.0
2,123456789,11696.0,4.0
3,12345,10938.0,5.0
4,1234,6432.0,6.0
5,111111,5682.0,7.0
6,1234567,4796.0,8.0
7,dragon,3927.0,9.0
8,123123,3845.0,10.0
9,baseball,3565.0,11.0


In [278]:
predict = np.expm1(model.predict(test[X_train.columns]))
results = pd.concat([test, pd.DataFrame(predict)], axis = 1, ignore_index= True)
col = ['Id', 'Password', 'Times', 'N', 'predict']
results.columns = col
results.sort_values(by = 'N')[0:50]

Unnamed: 0,Id,Password,Times,N,predict
259430,259430.0,password,,1.0,22686.552771
1025249,1025249.0,12345678,,2.0,22686.552771
266444,266444.0,monkey,,14.0,3376.673282
244023,244023.0,696969,,16.0,3084.905089
142507,142507.0,123321,,21.0,2848.711266
88263,88263.0,1234567890,,23.0,2761.555739
222568,222568.0,michael,,24.0,2678.584254
1036336,1036336.0,superman,,27.0,2568.768838
410501,410501.0,000000,,32.0,2324.967802
874816,874816.0,trustno1,,36.0,2225.15682
