In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import os
import tqdm
import string

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier

Data extraction

In [2]:
# Every later cell opens its input files by bare name, so the working directory
# has to be ./data. The guard keeps the cell idempotent: re-running it without a
# kernel restart no longer tries to descend into ./data/data.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('./data')

In [39]:
# Sorted listing of the current working directory, held as a NumPy string array
# (later cells slice it by position).
files = np.array(os.listdir())
files.sort()
files

array(['10-million-password-list-top-100.txt',
       '10-million-password-list-top-1000.txt',
       '10-million-password-list-top-10000.txt',
       '10-million-password-list-top-100000.txt',
       '10-million-password-list-top-1000000.txt',
       '10-million-password-list-top-500.txt', '10k-most-common.txt',
       '500-passwords-worst.txt', 'Submissions', 'Xtest.csv.zip',
       'common-passwords-win.txt', 'rockyou-withcount.txt', 'rockyou.csv',
       'sample_submission.csv.zip', 'train.csv.zip', 'words.txt'],
      dtype='<U40')

In [4]:
# Read the training archive and the test archive into DataFrames.
train, test = (
    pd.read_csv(archive) for archive in ('train.csv.zip', 'Xtest.csv.zip')
)

Functions for feature creation

In [5]:
def add_top_features(data, names):
    """Add one 0/1 membership column per password-list file.

    For every file in ``names`` a column ``top_<suffix>`` is added, where
    ``<suffix>`` is whatever follows the last ``-`` in the file name once its
    last four characters (the extension) are removed. The column is 1 when the
    row's ``Password`` occurs in that file and 0 otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column. It is copied, not modified.
    names : iterable of str
        Paths of the list files, read with ``pd.read_table``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the new columns appended. Remaining NaN values in
        the merged frame are replaced by 0.
    """
    output = data.copy()
    for name in tqdm.tqdm_notebook(names):
        f_name = 'top_' + name[:-4].split('-')[-1]
        # The second name has no counterpart in a one-column file; it only
        # reserves the flag column, which is then set to 1 for every entry.
        datka = pd.read_table(name, header=None, names=['Password', f_name])
        datka[f_name] = 1
        # A password listed more than once would match the same input row
        # several times and the left join would emit that row repeatedly.
        datka = datka.drop_duplicates(subset='Password')
        output = pd.merge(output, datka, on='Password', how='left').fillna(0)
    return output

def all_features(data):
    """Append per-password length and character-class counts.

    Calls ``features_create`` on every value of ``data.Password`` and stores the
    four returned numbers in the columns ``N_letter``, ``UpLetter``,
    ``DownLetter`` and ``Num`` (in that order).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column. It is copied, not modified, which
        matches the other feature helpers in this notebook.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the four columns added.
    """
    output = data.copy()
    # Missing passwords become the string '0' (the same placeholder the callers
    # use); an integer 0 cannot be measured or iterated by features_create.
    passwords = output.Password.fillna('0').values
    counts = np.array([features_create(word) for word in tqdm.tqdm_notebook(passwords)])
    # Keeps the array two-dimensional even when there are no rows at all.
    counts = counts.reshape(-1, 4)
    output['N_letter'] = counts[:, 0]
    output['UpLetter'] = counts[:, 1]
    output['DownLetter'] = counts[:, 2]
    output['Num'] = counts[:, 3]
    return output

def features_create(word):
    """Return ``[length, uppercase, other, numeric]`` counts for ``word``.

    Each character is classified once: numeric characters first, then
    uppercase ones; everything else (lowercase letters, punctuation, ...)
    lands in the third counter.
    """
    length = len(word)
    n_upper = n_other = n_numeric = 0
    for ch in word:
        if ch.isnumeric():
            n_numeric += 1
            continue
        if ch.isupper():
            n_upper += 1
            continue
        n_other += 1
    return [length, n_upper, n_other, n_numeric]

# Characters that get their own count column: string.printable without its
# last six characters.
symbols = string.printable[:-6]
def num_of_symbols(data):
    """Add one column per character in ``symbols`` holding its count per password.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column of strings. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``len(symbols)`` extra float columns, each named
        after the character it counts. Characters that are not in ``symbols``
        are ignored.
    """
    output = data.copy()
    features = np.zeros((len(symbols), output.shape[0]))
    # Explicit lookup table. str.find returns -1 for an unknown character, and
    # using that as an index silently incremented the last row instead of
    # skipping the character.
    row_of = {symb: row for row, symb in enumerate(symbols)}
    words = output.Password.values
    for i in tqdm.tqdm_notebook(range(len(words))):
        for symb in words[i]:
            row = row_of.get(symb)
            if row is not None:
                features[row][i] += 1
    for symb, row in row_of.items():
        output[symb] = features[row]
    return output

def add_words_features(data):
    """Flag passwords that occur in ``words.txt`` / ``common-passwords-win.txt``.

    Adds two columns, ``words`` and ``win``: 1 when the row's ``Password`` is
    found in the corresponding file (read from the current working directory),
    0 otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with both columns appended. Remaining NaN values in
        the merged frame are replaced by 0.
    """
    output = data.copy()
    sources = (('words.txt', 'words'), ('common-passwords-win.txt', 'win'))
    for file_name, f_name in sources:
        datka = pd.read_table(file_name, header=None, names=['Password', f_name])
        datka[f_name] = 1
        # An entry repeated in the file would otherwise make the left join
        # return the matching input row more than once.
        datka = datka.drop_duplicates(subset='Password')
        output = pd.merge(output, datka, on='Password', how='left').fillna(0)
    return output

def add_top_position(data):
    """Add ``top_position``: the password's line index in the top-1000000 list.

    The index is zero-based and refers to
    ``10-million-password-list-top-1000000.txt`` in the current working
    directory. Passwords missing from the file get 0, which is also the index
    of the file's first line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``top_position`` appended. Remaining NaN values
        in the merged frame are replaced by 0.
    """
    output = data.copy()
    f_name = 'top_position'
    datka = pd.read_table('10-million-password-list-top-1000000.txt', header=None, names=['Password', f_name])
    # Number the lines before de-duplicating so every password keeps the index
    # of its first occurrence in the file.
    datka[f_name] = np.arange(datka.shape[0])
    # A repeated password would otherwise duplicate the matching input row in
    # the left join.
    datka = datka.drop_duplicates(subset='Password')
    output = pd.merge(output, datka, on='Password', how='left').fillna(0)
    return output

def add_rock_position(data):
    """Add ``rock_position``: the password's row index in ``rockyou.csv``.

    The index is zero-based and follows the row order of ``rockyou.csv`` in the
    current working directory. Passwords missing from the file get 0, which is
    also the index of the file's first row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``rock_position`` appended. Remaining NaN values
        in the merged frame are replaced by 0.
    """
    output = data.copy()
    f_name = 'rock_position'
    datka = pd.read_csv('rockyou.csv', header=None, names=['Password', f_name])
    # The file's second column is read under the feature name and immediately
    # overwritten with the row number.
    datka[f_name] = np.arange(datka.shape[0])
    # Keep one row per password (its first occurrence) so the left join cannot
    # duplicate input rows.
    datka = datka.drop_duplicates(subset='Password')
    output = pd.merge(output, datka, on='Password', how='left').fillna(0)
    return output

def add_rock_times(data):
    """Add ``rock_times``: the second column of ``rockyou.csv`` for the password.

    ``rockyou.csv`` is read from the current working directory with its first
    column as ``Password`` and its second as ``rock_times`` (presumably an
    occurrence count; confirm against the file). Passwords missing from the
    file get 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Password`` column. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``rock_times`` appended. Remaining NaN values in
        the merged frame are replaced by 0.
    """
    output = data.copy()
    f_name = 'rock_times'
    datka = pd.read_csv('rockyou.csv', header=None, names=['Password', f_name])
    # Keep one row per password (its first occurrence) so the left join cannot
    # duplicate input rows.
    datka = datka.drop_duplicates(subset='Password')
    output = pd.merge(output, datka, on='Password', how='left').fillna(0)
    return output

Feature creation

In [11]:
%%time
X = add_top_features(train.fillna('0'), files[:8])
X = all_features(X)
X = num_of_symbols(X)
X = add_words_features(X)
X = add_top_position(X)
X = add_rock_position(X)
X = add_rock_times(X)

y = np.log(X.Times.values)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4151496), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4151496), HTML(value='')))


CPU times: user 3min 18s, sys: 1min 12s, total: 4min 30s
Wall time: 4min 30s


Model validation

In [15]:
# Binary target for the classifier: 1.0 where Times equals 1, 0.0 elsewhere.
y_class = (X.Times.values == 1).astype(float)

In [16]:
%%time
classifier = LGBMClassifier(random_state=47)
classifier.fit(X.drop(['Password', 'Times'], axis=1), y_class)

CPU times: user 4min 19s, sys: 15.3 s, total: 4min 34s
Wall time: 48.2 s


In [17]:
# Store the classifier's outputs as additional columns of X. These are
# predictions for the same rows the classifier was fitted on.
X['just_one'] = classifier.predict(X.drop(columns=['Password', 'Times']))
# 'just_one' was not part of the training features, so drop it again here.
proba = classifier.predict_proba(X.drop(columns=['Password', 'Times', 'just_one']))
X['one'], X['not_one'] = proba[:, 1], proba[:, 0]

In [None]:
# Rank-based feature: scales linearly from 55893 at top_position 0 down
# towards 0 at position 1000000.
# NOTE(review): 55893 is presumably the count of the most frequent password;
# confirm and move it to a named constant.
X['value'] = 55893 * (1 - X['top_position'].values / 1000000.)
# top_position is 0 for passwords absent from the list as well as for the
# list's first entry, so reset every such row and then restore '123456'.
# .loc writes into X itself; the former chained form X['value'][mask] = ...
# assigns to a temporary object and is not guaranteed to update X.
X.loc[X['value'] == 55893, 'value'] = 0
X.loc[X.Password == '123456', 'value'] = 55893

In [None]:
# Hold-out split with the default proportions; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns=['Password', 'Times']),
    y,
    random_state=47,
)

In [None]:
# Regressor for log(Times), fitted on the training part and evaluated with RMSE
# on the held-out part (passed as plain arrays).
model = LGBMRegressor(random_state=47, metric='rmse')
model.fit(
    X_train,
    y_train,
    eval_set=[(np.array(X_test), np.array(y_test))],
    eval_metric='rmse',
)

In [None]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between ``y`` and ``y_pred``.

    Both arguments are indexed by position ``0 .. len(y_pred) - 1`` and must
    have the same length. Each term is ``(log(pred + 1) - log(true + 1)) ** 2``.
    """
    assert len(y) == len(y_pred)
    squared_log_errors = []
    for i in range(len(y_pred)):
        gap = math.log(y_pred[i] + 1) - math.log(y[i] + 1)
        squared_log_errors.append(gap ** 2.0)
    return (sum(squared_log_errors) * (1.0 / len(y))) ** 0.5

In [None]:
# Back-transform the log-scale predictions; values below 4 are rounded to the
# nearest integer, larger ones are kept as they are.
pred = np.exp(model.predict(X_test))
pred = np.where(pred < 4, np.round(pred), pred)

In [None]:
# Hold-out RMSLE: y_test holds log(Times), so it is exponentiated back to the
# original scale before being compared with the back-transformed predictions.
rmsle(np.exp(y_test), pred)

Prediction

In [18]:
%%time
y = np.log(X.Times.values)
model = LGBMRegressor(random_state=47, metric='rmse')
model.fit(X.drop(['Password', 'Times'], axis=1), y, eval_metric='rmse')

CPU times: user 3min 35s, sys: 19.7 s, total: 3min 55s
Wall time: 49.2 s


In [21]:
%%time
X_control = add_top_features(test.fillna('0'), files[:8])
X_control = all_features(X_control)
X_control = num_of_symbols(X_control)
X_control = add_words_features(X_control)
X_control = add_top_position(X_control)
X_control = add_rock_position(X_control)
X_control = add_rock_times(X_control)
X_control['just_one'] = classifier.predict(X_control.drop(['Id', 'Password'], axis=1))
proba_control = classifier.predict_proba(X_control.drop(['Id', 'Password', 'just_one'], axis=1))
X_control['one'] = proba_control[:, 1]
X_control['not_one'] = proba_control[:, 0]
X_control['value'] = 55893 * (1 - X_control['top_position'].values  / 1000000.)
X_control['value'][X_control['value'] == 55893] = 0
X_control['value'][X_control.Password == '123456'] = 55893

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1037875), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1037875), HTML(value='')))


CPU times: user 1min 39s, sys: 27.1 s, total: 2min 6s
Wall time: 1min 51s


In [37]:
# The feature joins can repeat a test row, so collapse back to one row per test
# Id before predicting. De-duplicating on Password would also drop distinct
# test rows that share a password (for example every missing password, which
# was filled with '0'), and the prediction would then no longer line up with
# range(test.shape[0]).
# NOTE(review): assumes Id is unique per test row; confirm.
control_rows = X_control.drop_duplicates(subset='Id')
prediction = np.exp(model.predict(control_rows.drop(['Password', 'Id'], axis=1)))
# Values below 4 are rounded to the nearest integer; larger ones are kept.
prediction[prediction < 4] = np.round(prediction[prediction < 4])
pd.DataFrame(prediction, index=range(test.shape[0]), columns=['Times']).to_csv('lgbm_submit13.csv', index_label='Id')