In [1]:
import pandas as pd
import numpy as np
import csv
from password_strength import PasswordStats
from nltk.corpus import words
from catboost import CatBoostRegressor, Pool, cv
import re

In [2]:
# Load the training set. Later cells read its 'Password' column (genFeatures)
# and its 'Times' column (the regression target).
# NOTE(review): with pandas' default NA parsing, literal passwords such as
# "null", "NA" or "nan" are read as NaN, and genFeatures then featurises them as
# the text 'nan' via str(). Consider keep_default_na / na_values if such rows
# matter -- confirm against the data.
dfTrain = pd.read_csv('./train.csv')

In [3]:
def genFeatures(df):
    """Add hand-crafted password features to ``df`` in place.

    Every cell of ``df['Password']`` is converted with ``str()`` first, so
    non-string cells (numbers, NaN) are featurised through their text form.

    Columns written, in this order:
      * ``123Seq`` ... ``youSeq``: 1/0 flags for case-sensitive substrings.
      * ``countZero`` / ``countOne`` / ``countTwo``: occurrences of those digits.
      * ``countA`` ... ``countS``: case-insensitive letter occurrences.
      * ``sequencesLength``, ``repeatedPatternsLength``, ``weaknessFactor``,
        ``entropy``: attributes read from ``PasswordStats``.
      * ``charCount``, ``numerics``, ``alpha``, ``vowels``, ``consonants``,
        ``specialSymbols``, ``upperCaseNumber``: character-class counts.
      * ``isRealWord``: 1 if the lower-cased password with digits removed is
        contained in the NLTK ``words`` corpus, else 0.
      * ``unicChars``: number of distinct characters.
      * ``*ToLenRatio``: the counts above divided by ``charCount``.

    Args:
        df: DataFrame with a ``Password`` column; it is modified in place.

    Returns:
        None. Nothing is returned on purpose, so calling this as the last
        statement of a notebook cell does not echo the whole frame.
    """
    # Normalise once instead of calling str()/lower() inside every lambda.
    passwords = df['Password'].apply(str)
    lowered = passwords.apply(str.lower)

    # Case-sensitive substring flags (1 if present, else 0).
    substringFlags = [
        ('123Seq', '123'), ('321Seq', '321'), ('qweSeq', 'qwe'),
        ('qaSeq', 'qa'), ('wsSeq', 'ws'), ('pasSeq', 'pas'),
        ('lovSeq', 'lov'), ('abcSeq', 'abc'), ('youSeq', 'you'),
    ]
    for column, needle in substringFlags:
        # needle is bound as a default argument to avoid late binding in the loop.
        df[column] = passwords.apply(lambda p, needle=needle: 1 if needle in p else 0)

    # Digit occurrences.
    for column, digit in [('countZero', '0'), ('countOne', '1'), ('countTwo', '2')]:
        df[column] = passwords.apply(lambda p, digit=digit: p.count(digit))

    # Letter occurrences, counted on the lower-cased password.
    letterCounts = [
        ('countA', 'a'), ('countE', 'e'), ('countI', 'i'), ('countO', 'o'),
        ('countN', 'n'), ('countR', 'r'), ('countS', 's'),
    ]
    for column, letter in letterCounts:
        df[column] = lowered.apply(lambda p, letter=letter: p.count(letter))

    # Build PasswordStats once per password (previously once per attribute) and
    # keep only the four numbers needed, so the stats objects can be freed.
    statRows = [
        (s.sequences_length, s.repeated_patterns_length, s.weakness_factor, s.entropy_bits)
        for s in map(PasswordStats, passwords)
    ]
    statColumns = ('sequencesLength', 'repeatedPatternsLength', 'weaknessFactor', 'entropy')
    for position, column in enumerate(statColumns):
        df[column] = [row[position] for row in statRows]

    df['charCount'] = passwords.apply(len)

    df['numerics'] = passwords.apply(lambda p: sum(1 for ch in p if ch.isdigit()))

    df['alpha'] = passwords.apply(lambda p: sum(1 for ch in p if ch.isalpha()))

    vowels = set('aeiou')
    df['vowels'] = lowered.apply(lambda p: sum(1 for ch in p if ch in vowels))

    df['consonants'] = lowered.apply(
        lambda p: sum(1 for ch in p if ch.isalpha() and ch not in vowels))

    # The original list held the two-character string '//', which can never
    # equal a single character, so '/' was silently never counted.
    specialSymbols = set('`~!@#$%^&*()-_+=:;,.?/\\')
    df['specialSymbols'] = passwords.apply(
        lambda p: sum(1 for ch in p if ch in specialSymbols))

    # ASCII capitals only, as before (non-ASCII upper-case letters are not counted).
    upperCase = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    df['upperCaseNumber'] = passwords.apply(
        lambda p: sum(1 for ch in p if ch in upperCase))

    setOfWords = set(words.words())
    df['isRealWord'] = lowered.apply(
        lambda p: 1 if re.sub(r'[0-9]+', '', p) in setOfWords else 0)

    df['unicChars'] = passwords.apply(lambda p: len(set(p)))

    # Each count expressed as a share of the password length.
    lengthRatios = [
        ('numToLenRatio', 'numerics'),
        ('alphaToLenRatio', 'alpha'),
        ('vowelsToLenRatio', 'vowels'),
        ('consonantsToLenRatio', 'consonants'),
        ('specialSymbolsToLenRatio', 'specialSymbols'),
        ('upperCaseNumberToLenRatio', 'upperCaseNumber'),
        ('unicCharsToLenRatio', 'unicChars'),
    ]
    for ratioColumn, countColumn in lengthRatios:
        df[ratioColumn] = df[countColumn] / df['charCount']

In [4]:
# Columns fed to the model: the subset of genFeatures outputs that was kept
# after ranking by feature importance.
features = [
    'countZero',
    'countOne',
    'countTwo',
    'weaknessFactor',
    'entropy',
    'charCount',
    'numerics',
    'isRealWord',
    'numToLenRatio',
    'alphaToLenRatio',
    'vowelsToLenRatio',
    'upperCaseNumberToLenRatio',
]

In [5]:
# Adds the engineered feature columns to dfTrain in place (genFeatures returns nothing).
genFeatures(dfTrain)

In [6]:
# Regression target is log(1 + Times). np.log1p is the vectorised form of
# log(x + 1) and stays accurate for small counts; predictions must be mapped
# back with the inverse transform (exp(y) - 1) before being reported.
y = np.log1p(dfTrain['Times'])

In [7]:
# Training matrix: only the columns named in `features`, in that order.
X = dfTrain[features]

In [17]:
# Validation of the model (currently disabled).
# The cross-validation code below is wrapped in a triple-quoted string, so it is
# NOT executed; because the string is the cell's last expression, its text is
# simply echoed as the cell output. Remove the surrounding triple quotes to run
# the CatBoost cross-validation again.
"""cv_dataset = Pool(X, y)
params = {'iterations': 200, 
          'depth': 7, 
          'loss_function': 'RMSE', 
          'l2_leaf_reg' : 20,
          'learning_rate' : 0.7,
          'verbose': False}
scores = cv(cv_dataset, params, plot=True)"""

"cv_dataset = Pool(X, y)\nparams = {'iterations': 200, \n          'depth': 7, \n          'loss_function': 'RMSE', \n          'l2_leaf_reg' : 20,\n          'learning_rate' : 0.7,\n          'verbose': False}\nscores = cv(cv_dataset, params, plot=True)"

In [9]:
# Hyper-parameters gathered in one mapping and unpacked into the regressor.
# NOTE(review): no random seed is passed here; add one if run-to-run
# reproducibility needs to be explicit.
modelParams = {
    'iterations': 200,
    'depth': 7,
    'l2_leaf_reg': 20,
    'learning_rate': 0.7,
    'loss_function': 'RMSE',
    'logging_level': 'Silent',
}
model = CatBoostRegressor(**modelParams)

In [10]:
# Fit on all rows of X / y; no hold-out split is made in the cells visible here.
# fit() is the cell's last expression, so the fitted model's repr is echoed.
model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x1c8cdfd0>

In [11]:
# Pair every importance score with the feature name at the same position.
importance = model.feature_importances_
impDict = {features[position]: score for position, score in enumerate(importance)}

# Most important feature first; displayed as the cell output.
sorted(impDict.items(), key=lambda pair: pair[1], reverse=True)

[('alphaToLenRatio', 17.35144079108872),
 ('isRealWord', 15.264402375976449),
 ('numToLenRatio', 12.800019910262774),
 ('numerics', 10.497912547621727),
 ('charCount', 9.440575605251317),
 ('entropy', 7.675769762442507),
 ('countOne', 6.963002830929904),
 ('countZero', 6.916005201448985),
 ('weaknessFactor', 4.859685821889643),
 ('upperCaseNumberToLenRatio', 4.5582177696342105),
 ('countTwo', 1.9789757090489775),
 ('vowelsToLenRatio', 1.6939916744047774)]

In [12]:
# Load the rows to predict on. Later cells read its 'Password' column
# (genFeatures) and its 'Id' column (when the submission is built).
dfTest = pd.read_csv('./Xtest.csv')

In [13]:
# Same in-place feature engineering as applied to dfTrain, so the columns line up.
genFeatures(dfTest)

In [14]:
# Same column selection and order as the training matrix X.
XTest = dfTest[features]

In [15]:
# Predictions are on the transformed scale the model was trained on
# (y = log(Times + 1)), not raw 'Times' values.
yPred = model.predict(XTest)

In [16]:
# Map predictions back from the log(1 + Times) scale with expm1 (the precise
# form of exp(y) - 1) and pair each one with its Id by position. The previous
# version assigned a freshly indexed DataFrame to a column, which only lines up
# when dfTest carries the default 0..n-1 index.
submission = pd.DataFrame({'Id': dfTest['Id'].values, 'Times': np.expm1(yPred)})
submission[['Id', 'Times']].to_csv('test_boost_v9.txt', index=False)