In [2]:
from rmsle import rmsle

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import make_scorer, mean_squared_error
import xgboost as xgb
import lightgbm

In [3]:
train_df = pd.read_csv(r'data\train.csv')
test_df = pd.read_csv(r'data\Xtest.csv')

train_df.fillna('', inplace=True)
test_df.fillna('', inplace=True)

In [4]:
train_dict = train_df.set_index('Password')['Times'].to_dict()
test_list = list(test_df['Password'])

In [5]:
print(len(train_df), len(test_df))

4151496 1037875


In [6]:
def get_type(c):
    if c.isdigit():
        return 0
    elif c >= 'a' and c <= 'z':
        return 1
    elif c >= 'A' and c <= 'Z':
        return 2
    return 3

def get_year_position(s):
    for year in range(1900, 2019):
        index = s.find(str(year))
        if index != -1:
            return index
    return -1

def get_features(passwords):
    d = {'Len':[], 'IsDigit':[], 'DigitAmount':[], 'LowCharAmount':[], 'UpCharAmount':[], 'CoolCharAmount':[], 
         'ContDigitAmount':[], 'ContLowCharAmount':[], 'ContUpCharAmount':[], 'ContCoolCharAmount':[], 'ContainsYear':[],
        'IsFullDate':[], 'IsYearInTheEnd':[]}

    for i, elem in enumerate(passwords):
        ps = str(elem)
        d['Len'].append(len(ps))
        try:
            d['IsDigit'].append(int(str.isdigit(ps)))
        except:
            d['IsDigit'].append(0)
        
        all_amounts = [0] * 4
        cont_ams = [0] * 4
        cont_cur_ams = [0] * 4
        
        last_type = -1
        cur_type = -1
        
        for c in ps:
            cur_type = get_type(c)
            all_amounts[cur_type] += 1
            if last_type == -1:
                last_type = cur_type
                cont_cur_ams[cur_type] += 1
                continue
            if last_type == cur_type:
                cont_cur_ams[cur_type] += 1
                continue
            cont_ams[last_type] = max(cont_ams[last_type], cont_cur_ams[last_type])
            cont_cur_ams[last_type] = 0
            cont_cur_ams[cur_type] += 1
            last_type = cur_type
            
        for i in range(len(cont_ams)):
            cont_ams[i] = max(cont_ams[i], cont_cur_ams[i])
        
        amounts_titles = ['DigitAmount', 'LowCharAmount', 'UpCharAmount', 'CoolCharAmount']
        cont_am_titles = ['ContDigitAmount', 'ContLowCharAmount', 'ContUpCharAmount', 'ContCoolCharAmount']
        
        for i in range(len(all_amounts)):
            d[amounts_titles[i]].append(all_amounts[i])
            d[cont_am_titles[i]].append(cont_ams[i])
            
        year_position = get_year_position(ps)
        d['ContainsYear'].append(year_position != -1)
        d['IsYearInTheEnd'].append(year_position + 4 == len(ps))
        
        d['IsFullDate'].append(len(ps) == 8 and d['IsDigit'][-1] != 0 and d['ContainsYear'][-1] != 0)

    return pd.DataFrame.from_dict(d)

In [7]:
train_y = np.log(np.array(train_df['Times']) + 1)

In [8]:
train_list = train_df['Password']

train_x = get_features(train_list)

In [36]:
print(train_x[:10])

   Len  IsDigit  DigitAmount  LowCharAmount  UpCharAmount  CoolCharAmount  \
0   13        0            5              4             4               0   
1    8        0            3              0             5               0   
2    8        0            6              2             0               0   
3   10        1           10              0             0               0   
4    8        0            0              8             0               0   
5    7        0            6              1             0               0   
6    8        0            1              3             4               0   
7    7        0            4              3             0               0   
8    8        0            0              8             0               0   
9    9        0            8              1             0               0   

   ContDigitAmount  ContLowCharAmount  ContUpCharAmount  ContCoolCharAmount  \
0                3                  2                 1                  

In [10]:
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

elements_amount = 10000
r_state = 42

In [11]:
rfr_clf = RandomForestRegressor(n_estimators=100, random_state=r_state)

scores_rfr = cross_val_score(rfr_clf, train_x[:elements_amount], train_y[:elements_amount], 
                             cv=5, scoring=rmse_scorer, verbose=0)
print("rmse: %0.3f (+/- %0.3f)" % (scores_rfr.mean(), scores_rfr.std()))

rmse: -0.142 (+/- 0.014)


In [12]:
from sklearn import svm

svr_clf = svm.SVR(gamma='auto')

scores_svr = cross_val_score(svr_clf, train_x[:elements_amount], train_y[:elements_amount], 
                             cv=5, scoring=rmse_scorer, verbose=0)
print("rmse: %0.3f (+/- %0.3f)" % (scores_svr.mean(), scores_svr.std()))

rmse: -0.154 (+/- 0.017)


In [13]:
from sklearn import svm
from sklearn.linear_model import SGDRegressor

sgd_clf = SGDRegressor(max_iter=1000, tol=1e-3)

scores_sgd = cross_val_score(sgd_clf, train_x[:elements_amount], train_y[:elements_amount], cv=5, 
                             scoring=rmse_scorer, verbose=0)
print("rmse: %0.3f (+/- %0.3f)" % (scores_sgd.mean(), scores_sgd.std()))

rmse: -0.158 (+/- 0.017)


In [14]:
xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=r_state)

scores_xgb = cross_val_score(xgb_clf, train_x[:elements_amount], train_y[:elements_amount], cv=5, 
                             scoring=rmse_scorer, verbose=0)
print("rmse: %0.3f (+/- %0.3f)" % (scores_xgb.mean(), scores_xgb.std()))

rmse: -0.186 (+/- 0.021)


In [15]:
lgbm_clf = lightgbm.LGBMRegressor(n_estimators=100, random_state=r_state)

scores_lgbm = cross_val_score(lgbm_clf, train_x[:elements_amount], train_y[:elements_amount], cv=5,
                              scoring=rmse_scorer, verbose=0)
print("rmse: %0.3f (+/- %0.3f)" % (scores_lgbm.mean(), scores_lgbm.std()))

rmse: -0.141 (+/- 0.015)


In [16]:
test_list = test_df['Password']
    
test_x = get_features(test_list)

In [17]:
from sklearn import svm

#rf_clf = RandomForestClassifier(n_estimators=1000, n_jobs=6)
#rf_clf = RandomForestRegressor(n_estimators=1000)
#rf_clf = DecisionTreeClassifier()
#rf_clf = svm.LinearSVR(tol=1e-5)
#svr_clf = svm.SVR(gamma='auto')
#xgb_clf = xgb.XGBClassifier(n_estimators=100)
lgbm_clf = lightgbm.LGBMRegressor(n_estimators=100, random_state=r_state)

lgbm_clf.fit(train_x, train_y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [18]:
answer = np.exp(np.array(lgbm_clf.predict(test_x))) - 1

In [20]:
def improve_answer(arr, ps_list, tr_dict):
    am = 0
    diff_arr = [1, -1, 2, -2, 3, -3]
    for idx, ps in enumerate(ps_list):
        index = get_year_position(ps)
        if index == -1:
            continue
        
        nearest_values = []
        
        year = int(ps[index:index + 4])
        for diff in diff_arr:
            new_year = year + diff
            new_pass = '{0}{1}{2}'.format(ps[:index], str(new_year), ps[index + 4:])
            times = tr_dict.get(new_pass)
            if times != None:
                nearest_values.append(times)
        
        if len(nearest_values) != 0:
            arr[idx] = np.exp(np.sum(np.log(np.array(nearest_values) + 1)) / len(nearest_values)) - 1
            am += 1
    print(am)

In [21]:
improve_answer(answer, test_list, train_dict)

22517


In [22]:
answer_df = pd.DataFrame([range(len(answer)), answer]).T

In [24]:
answer_df.columns=['ID', 'Times']
answer_df = answer_df.astype({'ID': 'int32'})
answer_df.to_csv(r'predictions\lgbm_year_improved.csv', index=False)

In [25]:
from collections import Counter

c = Counter(train_y)

In [26]:
from pprint import pprint
pprint(c.most_common(10))

[(0.6931471805599453, 3547155),
 (1.0986122886681098, 309205),
 (1.3862943611198906, 96462),
 (1.6094379124341003, 47704),
 (1.791759469228055, 27903),
 (1.9459101490553132, 18627),
 (2.0794415416798357, 13635),
 (2.1972245773362196, 10567),
 (2.302585092994046, 8235),
 (2.3978952727983707, 6775)]


In [27]:
train_stat = []

for item in train_dict.items():
    train_stat.append(item)

In [28]:
train_stat.sort(key=lambda x:x[1], reverse=True)

In [29]:
from pprint import pprint

tmp_stat = list(filter(lambda x: len(x[0]) == 4 and str.isdigit(x[0]) and x[0] >= '1900' and x[0] <= '2020', train_stat))

pprint(tmp_stat)

[('2000', 1861),
 ('1990', 733),
 ('1991', 720),
 ('1992', 709),
 ('1989', 613),
 ('1988', 564),
 ('1993', 564),
 ('1987', 464),
 ('1986', 444),
 ('1984', 442),
 ('1969', 428),
 ('1980', 415),
 ('1979', 407),
 ('1994', 388),
 ('1977', 382),
 ('1981', 351),
 ('1974', 343),
 ('1975', 342),
 ('1966', 340),
 ('1982', 335),
 ('1978', 332),
 ('1976', 328),
 ('1968', 327),
 ('1973', 324),
 ('1972', 320),
 ('1970', 306),
 ('1967', 302),
 ('2001', 294),
 ('1971', 290),
 ('1963', 262),
 ('1995', 257),
 ('1965', 247),
 ('1961', 220),
 ('2002', 210),
 ('1999', 206),
 ('1960', 197),
 ('1996', 189),
 ('2020', 180),
 ('1962', 175),
 ('1959', 160),
 ('1957', 151),
 ('1955', 140),
 ('1998', 129),
 ('1956', 129),
 ('1954', 125),
 ('2003', 121),
 ('2004', 110),
 ('1953', 108),
 ('1997', 103),
 ('2010', 101),
 ('2005', 100),
 ('1945', 93),
 ('1947', 86),
 ('1948', 84),
 ('2008', 83),
 ('1949', 81),
 ('1951', 78),
 ('1941', 77),
 ('1946', 75),
 ('2009', 74),
 ('1942', 72),
 ('2006', 71),
 ('1950', 70),
 ('

In [30]:
test_stat = []

for item in test_list:
    test_stat.append(item)
    
tmp_test_stat = list(filter(lambda x: len(x) == 4 and str.isdigit(x) and x >= '1800' and x <= '2020', test_stat))

pprint(sorted(tmp_test_stat))

['1823',
 '1835',
 '1839',
 '1842',
 '1843',
 '1844',
 '1854',
 '1859',
 '1860',
 '1861',
 '1865',
 '1874',
 '1890',
 '1903',
 '1909',
 '1910',
 '1914',
 '1918',
 '1919',
 '1924',
 '1926',
 '1930',
 '1939',
 '1940',
 '1952',
 '1958',
 '1964',
 '1983',
 '1985',
 '2018']
