## Import Standard Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import os
import pickle
import matplotlib

## Import Kaggle APIs

In [None]:
from __future__ import absolute_import
from kaggle.api.kaggle_api_extended import KaggleApi
from kaggle.api_client import ApiClient
# Build the Kaggle API wrapper around a fresh ApiClient and authenticate it;
# the `api` object is used by the submission cell at the end of the notebook.
api = KaggleApi(ApiClient())
api.authenticate()

## Notebook constants

In [3]:
# Competition slug; it is also the name of the data folder under ../input.
competition = 'dmia-sport-2019-fall-intro'

# Both CSV files live in the same competition folder.
_input_dir = os.path.join('..', 'input', competition)
train_file = os.path.join(_input_dir, 'train.csv')
test_file = os.path.join(_input_dir, 'Xtest.csv')

# Seed and validation fraction for the hold-out split, and the model seed.
split_seed, split_part, model_seed = 7, 0.3, 8


## Load files

In [4]:
# Training data: passwords are forced to strings and 'Times' to 32-bit ints.
train = pd.read_csv(train_file, dtype={'Password': str, 'Times': np.int32})
# Test data: the first CSV column becomes the index (it is written back out
# as 'Id' by the submission cell); passwords are forced to strings as well.
x_test = pd.read_csv(test_file, index_col=0, dtype={'Password': str, 'Id': np.int32})

  mask |= (ar1 == a)


In [5]:
# Single-column frame holding the raw passwords; features are added later.
x_train = pd.DataFrame(train['Password'])
# Regression target: log(Times + 1). Predictions are mapped back with
# exp(pred) - 1 before the submission is written.
y = np.log(train['Times']+1)

In [6]:
# Histogram of the log-transformed target (100 bins); the trailing ';'
# suppresses the textual repr of the returned object in the notebook.
y.hist(bins=100);

## Feature generation

Keyboard Distance

In [7]:
# Extra cost added when two consecutive characters sit on different layers
# (unshifted vs. shifted) of the keyboard.
SHIFT_COST = 3.0
# Flat cost used when at least one character is not on either layer.
NONE_COST = 10

# Unshifted QWERTY layer. A character's (row, column) position in this grid
# is used as its coordinate for the Euclidean key distance.
qwertyKeyboardArray = [
    ['`','1','2','3','4','5','6','7','8','9','0','-','='],
    ['q','w','e','r','t','y','u','i','o','p','[',']','\\'],
    ['a','s','d','f','g','h','j','k','l',';','\''],
    ['z','x','c','v','b','n','m',',','.','/'],
    ['', '', ' ', ' ', ' ', ' ', ' ', '', '']
    ]

# Shifted QWERTY layer. Every row must line up column-for-column with the
# unshifted layer so that a key and its shifted symbol share coordinates.
# The top row previously lacked '_' (shift + '-'), which made '_' an unknown
# character and placed '+' one column to the left of '='.
# NOTE: this changes 'w_distance' for passwords containing '_' or '+', so any
# cached feature pickles built with the old layout should be regenerated.
qwertyShiftedKeyboardArray = [
    ['~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+'],
    ['Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', '{', '}', '|'],
    ['A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', ':', '"'],
    ['Z', 'X', 'C', 'V', 'B', 'N', 'M', '<', '>', '?'],
    ['', '', ' ', ' ', ' ', ' ', ' ', '', '']
    ]

# Layout actually used by the distance helpers.
keyboardArray = qwertyKeyboardArray
shiftedKeyboardArray = qwertyShiftedKeyboardArray

def arrayForChar(c):
    """Return the keyboard layer that contains character ``c``.

    The unshifted layer is checked first, then the shifted one.
    Returns ``None`` when neither layer has a row containing ``c``.
    """
    for layer in (keyboardArray, shiftedKeyboardArray):
        if any(c in keys for keys in layer):
            return layer
    return None


def getCharacterCoord(c, array=qwertyKeyboardArray):
    """Locate character ``c`` in a keyboard layer.

    Returns a ``(row, column)`` tuple for the first row of ``array`` that
    contains ``c``, or ``None`` if no row contains it.
    """
    for row_idx, keys in enumerate(array):
        if c in keys:
            return (row_idx, keys.index(c))
    return None
    
def euclideanKeyboardDistance(c1, c2):
    """Return the keyboard distance between characters ``c1`` and ``c2``.

    The distance is the Euclidean distance between the two keys' (row, column)
    positions, plus ``SHIFT_COST`` when the characters live on different
    layers. If either character is on no layer, ``NONE_COST`` is returned.
    """
    layer1 = arrayForChar(c1)
    layer2 = arrayForChar(c2)
    if layer1 is None or layer2 is None:
        return NONE_COST

    row1, col1 = getCharacterCoord(c1, layer1)
    row2, col2 = getCharacterCoord(c2, layer2)
    planar = ((row1 - row2)**2 + (col1 - col2)**2)**(0.5)
    layer_penalty = 0 if layer1 == layer2 else SHIFT_COST
    return planar + layer_penalty

def wordTotalDistance(word):
    """Return the summed keyboard distance over consecutive characters of ``word``.

    Words with fewer than two characters have no character pairs and give 0.
    """
    return sum(
        euclideanKeyboardDistance(left, right)
        for left, right in zip(word, word[1:])
    )

Markov Chain

In [8]:
class Dictogram(dict):
    """Frequency table (item -> count) that also tracks summary totals.

    Note that ``update`` replaces ``dict.update`` with counting semantics:
    it takes an iterable of items, not a mapping.
    """

    def __init__(self, iterable=None):
        # Start as an empty distribution, then count any items supplied.
        super().__init__()
        self.types = 0   # number of distinct keys in the distribution
        self.tokens = 0  # sum of all stored counts
        if iterable:
            self.update(iterable)

    def update(self, iterable):
        """Count every item of ``iterable`` into the distribution."""
        for item in iterable:
            if item not in self:
                # First sighting starts at 2 rather than 1 (smoothing offset);
                # the running total grows by the same amount.
                self[item] = 2
                self.types += 1
                self.tokens += 2
                continue
            self[item] += 1
            self.tokens += 1

class MarkovChain(dict):
    """Character n-gram transition table: gram -> Dictogram of following grams.

    ``power`` is the gram length requested by the caller; internally
    ``self.power`` stores ``power - 1`` and grams are ``self.power + 1``
    characters long.
    """

    def __init__(self, power=1):
        super().__init__()
        self.power = power - 1

    def _shifted_pairs(self, word):
        """Yield, for each shift, the list of (gram, following gram) pairs.

        The word is lower-cased and padded with ``self.power`` spaces on both
        sides. The window start range deliberately mirrors the original
        expression: it runs past the last full-length window, so the trailing
        grams can be shorter than ``self.power + 1`` or empty.
        """
        order = self.power
        padding = ' ' * order
        text = padding + str(word).lower() + padding
        for shift in range(order + 2):
            starts = range(shift, len(text) - order + 2 - shift)
            grams = [text[i:i + order + 1] for i in starts]
            # Pair each gram with the gram that begins right after it ends.
            yield list(zip(grams, grams[1 + order:]))

    def fit(self, words):
        """Accumulate transition counts from every word in ``words``."""
        for word in words:
            for pairs in self._shifted_pairs(word):
                for state, follower in pairs:
                    if state in self:
                        # Extend the distribution that already exists.
                        self[state].update([follower])
                    else:
                        self[state] = Dictogram([follower])

    def get_pair_proba(self, ch1, ch2):
        """Return the estimated probability that ``ch2`` follows ``ch1``.

        An unseen ``ch1`` gives 1.0; an unseen ``ch2`` after a known ``ch1``
        gives ``1 / (tokens + 1)``.
        """
        if ch1 not in self:
            return 1.0
        followers = self[ch1]
        if ch2 in followers:
            return followers[ch2]/followers.tokens
        return 1/(followers.tokens + 1)

    def proba(self, word):
        """Return the largest per-shift product of pair probabilities for ``word``."""
        scores = [
            np.prod([self.get_pair_proba(state, follower) for state, follower in pairs])
            for pairs in self._shifted_pairs(word)
        ]
        return max(scores)

In [9]:
# Every password from both the train and test frames; the Markov chains
# below are fitted on this combined list.
all_paswords = x_train['Password'].to_list() + x_test['Password'].to_list()

In [None]:
# markov = MarkovChain()
# markov.fit(all_paswords)
# pickle.dump(markov, open('markov.pkl', 'wb'))
# markov_2 = MarkovChain(2)
# markov_2.fit(all_paswords)
# pickle.dump(markov_2, open('markov_2.pkl', 'wb'))
# markov_3 = MarkovChain(3)
# markov_3.fit(all_paswords)
# pickle.dump(markov_3, open('markov_3.pkl', 'wb'))

In [10]:
# Reuse the cached chain when present; otherwise fit it on all passwords and
# cache it. Files are opened with `with` so the handles are closed (and the
# written pickle flushed) deterministically.
# NOTE: pickle.load executes arbitrary code; only load caches you created.
if os.path.isfile('markov.pkl'):
    with open('markov.pkl', 'rb') as fh:
        markov = pickle.load(fh)
else:
    markov = MarkovChain()
    markov.fit(all_paswords)
    with open('markov.pkl', 'wb') as fh:
        pickle.dump(markov, fh)

In [11]:
# Reuse the cached MarkovChain(2) when present; otherwise fit it on all
# passwords and cache it. Files are opened with `with` so the handles are
# closed (and the written pickle flushed) deterministically.
# NOTE: pickle.load executes arbitrary code; only load caches you created.
if os.path.isfile('markov_2.pkl'):
    with open('markov_2.pkl', 'rb') as fh:
        markov_2 = pickle.load(fh)
else:
    markov_2 = MarkovChain(2)
    markov_2.fit(all_paswords)
    with open('markov_2.pkl', 'wb') as fh:
        pickle.dump(markov_2, fh)

In [12]:
# Reuse the cached MarkovChain(3) when present; otherwise fit it on all
# passwords and cache it. Files are opened with `with` so the handles are
# closed (and the written pickle flushed) deterministically.
# NOTE: pickle.load executes arbitrary code; only load caches you created.
if os.path.isfile('markov_3.pkl'):
    with open('markov_3.pkl', 'rb') as fh:
        markov_3 = pickle.load(fh)
else:
    markov_3 = MarkovChain(3)
    markov_3.fit(all_paswords)
    with open('markov_3.pkl', 'wb') as fh:
        pickle.dump(markov_3, fh)

In [13]:
# Reuse the cached MarkovChain(4) when present; otherwise fit it on all
# passwords and cache it.
# The previous version of this cell wrote `markov_3` into 'markov_4.pkl', so
# an existing cache may hold the wrong model. MarkovChain stores `power - 1`,
# so a genuine MarkovChain(4) has `.power == 3`; anything else is refitted.
# NOTE: pickle.load executes arbitrary code; only load caches you created.
markov_4 = None
if os.path.isfile('markov_4.pkl'):
    with open('markov_4.pkl', 'rb') as fh:
        markov_4 = pickle.load(fh)
    if getattr(markov_4, 'power', None) != 3:
        markov_4 = None  # stale cache produced by the old bug; rebuild it
if markov_4 is None:
    markov_4 = MarkovChain(4)
    markov_4.fit(all_paswords)
    with open('markov_4.pkl', 'wb') as fh:
        pickle.dump(markov_4, fh)

In [14]:
def gen_features(in_data):
    """Return a copy of ``in_data`` with password features added as columns.

    ``in_data`` must have a 'Password' column. A feature group is only
    computed when its leading column was not already present in the input,
    so partially-featurised frames can be topped up. Columns are appended in
    a fixed order: length, per-class counts/flags/shares, unique characters,
    class score, keyboard distance, then the four Markov probabilities.
    """
    data = in_data.copy()
    data['Password'] = data['Password'].apply(str)  # make sure every value is a string
    cols = data.columns  # snapshot of the columns present before any feature is added

    if 'len' not in cols:
        data['len'] = data['Password'].apply(len)

    # (column suffix, regex matching one character of that class)
    char_classes = (
        ('low', r"[a-z]"),
        ('caps', r"[A-Z]"),
        ('numb', r"[0-9]"),
        ('spec', r"[^a-zA-Z0-9]"),
    )
    for suffix, pattern in char_classes:
        count_col = 'len_' + suffix
        if count_col in cols:
            continue
        data[count_col] = data['Password'].apply(
            lambda pw, pat=pattern: len(re.findall(pat, pw)))
        data['is_' + suffix] = (data[count_col] > 0).astype(int)
        data['p_' + suffix] = data[count_col]/data['len']

    if 'len_uniq' not in cols:
        data['len_uniq'] = data['Password'].apply(lambda pw: len(set(pw)))
        data['p_uniq'] = data['len_uniq']/data['len']

    if 'score' not in cols:
        # Number of character classes (lower, upper, digit, special) in use.
        data['score'] = data['is_low'] + data['is_caps'] + data['is_numb'] + data['is_spec']

    if 'w_distance' not in cols:
        data['w_distance'] = data['Password'].apply(wordTotalDistance)

    # Each chain is looked up only when its column is actually needed.
    if 'markov_proba' not in cols:
        data['markov_proba'] = data['Password'].apply(markov.proba)
    if 'markov_proba_2' not in cols:
        data['markov_proba_2'] = data['Password'].apply(markov_2.proba)
    if 'markov_proba_3' not in cols:
        data['markov_proba_3'] = data['Password'].apply(markov_3.proba)
    if 'markov_proba_4' not in cols:
        data['markov_proba_4'] = data['Password'].apply(markov_4.proba)

    #data.drop(columns=['Password'], inplace=True)

    return data

In [16]:
# x = gen_features(x)
# pickle.dump(x, open('x.pkl', 'wb'))

In [15]:
# Reuse the cached training feature frame when present; otherwise generate it
# from the raw training passwords and cache it. Files are opened with `with`
# so the handles are closed (and the written pickle flushed) deterministically.
# NOTE: pickle.load executes arbitrary code; only load caches you created.
if os.path.isfile('x.pkl'):
    with open('x.pkl', 'rb') as fh:
        x = pickle.load(fh)
else:
    x = gen_features(x_train)
    with open('x.pkl', 'wb') as fh:
        pickle.dump(x, fh)

In [None]:
# Reuse the cached target when present; otherwise rebuild it from the training
# CSV as log(Times + 1) and cache it. Files are opened with `with` so the
# handles are closed (and the written pickle flushed) deterministically.
# NOTE: pickle.load executes arbitrary code; only load caches you created.
if os.path.isfile('y.pkl'):
    with open('y.pkl', 'rb') as fh:
        y = pickle.load(fh)
else:
    train = pd.read_csv(train_file, dtype={'Password': str, 'Times': np.int32})
    y = np.log(train['Times']+1)
    with open('y.pkl', 'wb') as fh:
        pickle.dump(y, fh)

## Train test split

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Hold out `split_part` of the rows for validation (seeded by `split_seed`);
# the raw 'Password' text is dropped so only the generated features remain.
X_train, X_val, y_train, y_val = train_test_split(x.drop(columns=['Password']), y, test_size=split_part, random_state = split_seed)

In [None]:
# del x

In [None]:
import gc
# Run a garbage-collection pass before model training.
gc.collect()

## Train model

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.svm import LinearSVR

In [None]:
# Baseline: ordinary linear regression on the generated features.
reg1 = LinearRegression(n_jobs=4)
reg1.fit(X_train, y_train)
y_train_pred1 = reg1.predict(X_train)
y_val_pred1 = reg1.predict(X_val)
# The target is already log(Times + 1), so the RMSE computed here on the
# transformed values is what the labels call "rmlse".
print('rmlse_train: ', np.sqrt(mean_squared_error(y_train, y_train_pred1)))
print('rmlse_val: ', np.sqrt(mean_squared_error(y_val, y_val_pred1)))

In [None]:
# Stack the linear model's predictions as an extra feature for the next model.
# Note the training-side values are in-sample predictions of reg1.
X_train['lr_pred'] = y_train_pred1
X_val['lr_pred'] = y_val_pred1

In [None]:
#reg = RandomForestRegressor(random_state=model_seed, verbose=2, n_estimators=200, n_jobs=4, max_depth =20)
# Second-stage model: LightGBM regressor trained on the features plus 'lr_pred'.
reg = LGBMRegressor(num_leaves=63, max_depth=-1, learning_rate=0.1, n_estimators = 500, n_jobs=4, random_state=model_seed)
# NOTE(review): some LightGBM releases do not accept `verbose` as a fit()
# argument — confirm against the installed version.
reg.fit(X_train, y_train, verbose=2)

In [None]:
# Predictions of the second-stage model, still on the log(Times + 1) scale.
y_train_pred_orig = reg.predict(X_train)
y_val_pred_orig = reg.predict(X_val)

In [None]:
# RMSE on the log-transformed target for the training and validation splits.
rmlse_train = np.sqrt(mean_squared_error(y_train, y_train_pred_orig))
rmlse_val = np.sqrt(mean_squared_error(y_val, y_val_pred_orig))
print('rmlse_train: ', rmlse_train)
print('rmlse_val: ', rmlse_val)

rmlse_train:  0.33885744320234273
rmlse_val:  0.3455145745624136

## Prepare answers and submit

In [None]:
# Refit the linear model on the full training set (no hold-out) ...
reg1 = LinearRegression(n_jobs=4)
reg1.fit(x.drop(columns=['Password']), y)
# ... and store its in-sample predictions on `x` as the stacked 'lr_pred'
# feature. reg1 itself was fitted before this column existed.
x['lr_pred'] = reg1.predict(x.drop(columns=['Password']))
print('LR rmlse_train: ', np.sqrt(mean_squared_error(y, x['lr_pred'])))

In [None]:
# Refit the second-stage LightGBM model on the full training set, which now
# includes the 'lr_pred' column, and report its in-sample RMSE on the log scale.
reg = LGBMRegressor(num_leaves=63, max_depth=-1, learning_rate=0.1, n_estimators = 500, n_jobs=4, random_state=model_seed)
reg.fit(x.drop(columns=['Password']), y)
y_pred_x = reg.predict(x.drop(columns=['Password']))
print('LGB rmlse_train: ', np.sqrt(mean_squared_error(y, y_pred_x)))

In [None]:
#pickle.dump(x_test_fin, open('x_test.pkl', 'wb'))

In [None]:
# x_test_fin = pickle.load(open('x_test.pkl', 'rb'))

In [None]:
# Build the test feature frame straight from the raw test data. The previous
# version read `x_test_fin` before anything assigned it (its only loader is
# commented out), so a fresh run failed with NameError. gen_features() on the
# raw frame appends the columns in the same order it uses for training.
# NOTE(review): this assumes `x` was also produced by the current
# gen_features(); regenerate a cached 'x.pkl' if its columns differ.
x_test_fin = gen_features(x_test)
# Add the stacked linear-model prediction expected by the second-stage model.
x_test_fin['lr_pred'] = reg1.predict(x_test_fin.drop(columns=['Password']))

In [None]:
# Test-set predictions from the second-stage model, on the log(Times + 1) scale.
y_pred = reg.predict(x_test_fin.drop(columns=['Password']))

In [None]:
# Undo the log(Times + 1) transform applied to the target.
y_pred = np.exp(y_pred) - 1

In [None]:
# Floor the predictions at 1.
y_pred[y_pred < 1] = 1

In [None]:
# Submission file: the test frame's index goes out as 'Id', predictions as 'Times'.
output = pd.DataFrame({'Id': x_test.index,
                       'Times': y_pred})
output.to_csv('submission.csv', index=False)

In [None]:
# submit result
message = 'LR => LGBM default; error mean correction; updated loss approach; markov 1st, 2nd, 3rd chain'
result = api.competition_submit('submission.csv', message, competition)

# get last submissioon data
import time
last_result = api.competition_submissions(competition)[0]
while getattr(last_result, 'status') != 'complete':
    time.sleep(5)
    last_result = api.competition_submissions(competition)[0]
fields = ['date', 'description', 'status', 'publicScore', 'ref', 'submittedBy']

#dict to store detials
res_details = {}
for f in fields:
    res_details[f] = getattr(last_result, f)
print(res_details)