# Feature generation

In [0]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression , SGDRegressor, LogisticRegression, Lasso
from sklearn import tree
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor

## Выделение морфологических фичей

In [0]:
data = pd.read_csv("train.csv")
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

data_test = pd.read_csv("Xtest.csv")
data_test.fillna("*****", inplace=True)

In [0]:
!pip install polyglot
!pip install morfessor

In [0]:
%%bash
polyglot download morph2.en morph2.ar

In [0]:
from polyglot.text import Text, Word

### Генерация частотного словаря

In [0]:
# Сгенерируем частотный словарь морфем
freq_dict = {}

In [0]:
for w in list(data.Password):
    w = Word(w, language="en")
    for morphem in w:
        if morphem in freq_dict:
            freq_dict[morphem] += 1
        else:
            freq_dict[morphem] = 1

### Создадим список 'осмысленных' морфем(те которые есть в словаре nltk)

In [0]:
morphem_dict = {}
for morphem in freq_dict:
    if morphem in words.words():
        morphem_dict[morphem] = 1
smart_morphemes = [k for k, v in freq_dict.items()]

### Cгенерируем для train и test признак - число морфем

In [0]:
test_morph_numbers = np.zeros(len(test_data))
i = 0
for w in list(data_test.Password):
    w = Word(w, language="en")
    test_morph_numbers[i] = len(w.morphemes)
    i += 1

In [0]:
morph_numbers = np.zeros(len(data))
i = 0
for w in list(data.Password):
    w = Word(w, language="en")
    morph_numbers[i] = len(w.morphemes)
    i += 1

# Создание датафрейма со всеми фичами(новые + морфологические)

In [0]:
'''
read_file = open('indicators.json')
indicators = np.array(json.load(read_file))
'''

In [0]:
read_file = open('morph_numbers.json')
morph_numbers = np.array(json.load(read_file))

In [0]:
# Загружаем частотный словарь
file = open("freq_dict.json")
freq_dict = json.load(file)
# Генерим список наиболее частых морфем длины 3 и по частоте более 1500 и не числа 
top_morphemes = [k for k, v in freq_dict.items() if v > 30500 and not k.isdigit() and len(k) >= 3]

In [0]:
def is_onesymb(word):
    c = word[0]
    for i in word:
        if i != c:
           return 0
    return 1

In [0]:
def is_sequence(digit):
  
    if (''.join(sorted(digit))[:5].find("12345") != -1):
        return 1
    if (''.join(sorted(digit))[:5].find("1234") != -1):
        return 1
    if (digit.find("qwer") != -1):
        return 1
    if (digit.find("zxcv") != -1):
        return 1
    return 0



In [0]:
def is_data(word):
    #"ddmmyyyy"
    if len(word) == 8:
        if ("01" <= word[:2] <= "31") and ("01" <= word[2:4] <= "12") and ("1900" <= word[4:] <= "2300"):
            return 1
    #"ddmmyy" or "dmyyyy"
    elif len(word) == 6:
        if ("01" <= word[:2] <= "31") and ("01" <= word[2:4] <= "12") and ("00" <= word[4:] <= "99"):
            return 1
        if ("1" <= word[:2] <= "9") and ("1" <= word[2:4] <= "9") and ("1900" <= word[4:] <= "2300"):
            return 1
    
    #"ddmyyyy"
    elif len(word) == 7:
        if ("01" <= word[:2] <= "31") and ("1" <= word[2:3] <= "9") and ("1900" <= word[4:] <= "2300"):
            return 1
    
    return 0

In [0]:
def Create_DF(data,array_morphemes, numbers):
    array_passwords = list(data.Password)
    length_array_passwords = len(array_passwords)
    array_lengths = np.full(length_array_passwords,-777)
    ####_letter
    number_letters = 0
    array_number_letters = np.full(length_array_passwords,-777) 
    ####_CAPS_letter
    number_CAPS_letters = 0
    array_number_CAPS_letters = np.full(length_array_passwords,-777)
    ####_digit
    number_digits = 0
    array_number_digits = np.full(length_array_passwords,-777)
    ####_symbol
    number_symbols = 0
    array_number_symbols = np.full(length_array_passwords,-777)
    ####______________
    array_isdigit = np.full(length_array_passwords, 0)
    array_ispolyndrom = np.full(length_array_passwords, 0)
    array_isdata = np.full(length_array_passwords, 0)
    array_issequence = np.full(length_array_passwords, 0)
    array_isonesymb = np.full(length_array_passwords, 0)

    matrix_morphemes = np.full((length_array_passwords,len(array_morphemes)),-777).astype(int)

    i=0
    k = 0
    while i != length_array_passwords:
        for char in array_passwords[i]:
            ord_char = ord(char)
            if (ord_char >= ord('0')) & (ord_char <= ord('9')):#digits
                number_digits += 1
            elif (ord_char >= ord('A')) & (ord_char <= ord('Z')):#CAPS_letter
                number_CAPS_letters += 1
            elif (ord_char >= ord('a')) & (ord_char <= ord('z')):#letter
                number_letters += 1
            else : #symbol
                number_symbols += 1
        ####_find_in_str
        
        ### Morphemes search
        for str_morphemes in array_morphemes:
            indicator = array_passwords[i].find(str_morphemes)
            if indicator == -1:
                matrix_morphemes[i][k] = 0
            else:
                matrix_morphemes[i][k] = 1
            k += 1 
        k = 0

        #ispolyndrom
        array_ispolyndrom[i] = (array_passwords[i] == array_passwords[i][::-1])
        #isdigit
        array_isdigit[i] = array_passwords[i].isdigit()
        if (array_isdigit[i]):
            array_isdata[i] = is_data(array_passwords[i])
        array_issequence[i] = is_sequence(array_passwords[i])
        array_isonesymb[i] = is_onesymb(array_passwords[i])

        ####_lenght
        array_lengths[i] = len(array_passwords[i]) 
        ####_digit
        array_number_digits[i] = number_digits
        number_digits = 0 
        ####_letter
        array_number_letters[i] = number_letters
        number_letters = 0
        ####_CAPS_letter
        array_number_CAPS_letters[i] = number_CAPS_letters
        number_CAPS_letters = 0
        ####_symbol
        array_number_symbols[i] = number_symbols
        number_symbols  = 0 
        i+=1

    data1 = pd.DataFrame({'Password'     : list(data.Password),
                         'lenght'        : array_lengths.astype(int),
                         'is_word'       : array_number_letters.astype(int) == array_lengths.astype(int),
                         'CAPS_letter'   : array_number_CAPS_letters.astype(int),
                         'is_digit'      : array_isdigit.astype(int),
                         'is_polyndrom'  : array_ispolyndrom.astype(int),
                         'is_data'       : array_isdata.astype(int),
                         'is_sequence'   : array_issequence.astype(int),
                          'onesymbol'    : array_isonesymb.astype(int),
                          'morph_numbers'     : numbers.astype(int),
                          })
    col1 = ['Password','lenght', 'is_digit', 'is_word', 'is_polyndrom', 'is_data',
            'is_sequence', 'onesymbol',  'morph_numbers' ]
    data1 = data1[col1]
    data2 = pd.DataFrame(matrix_morphemes , columns = array_morphemes)
    if "Times" in data:
      data = pd.concat([data1, data.Times],axis = 1)
      return data
    else:
      return data1


In [11]:
%%time
data_1 = Create_DF(data, top_morphemes, morph_numbers)

CPU times: user 38.5 s, sys: 214 ms, total: 38.7 s
Wall time: 38.7 s


In [12]:
data_1.head()

Unnamed: 0,Password,lenght,is_digit,is_word,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers,Times
0,631XniVx2lS5I,13,0,False,0,0,0,0,11,2
1,LEGIT747,8,0,False,0,0,0,0,6,1
2,742364es,8,0,False,0,0,0,0,5,1
3,3846696477,10,1,False,0,0,0,0,6,1
4,laurahop,8,0,True,0,0,0,0,3,2


# Tree

In [13]:
X = data_1.iloc[:, 1:-1]
y = data_1.iloc[:,-1:]
y = np.array(y)
y = np.log(1 + y)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=1)
data_1.head()

Unnamed: 0,Password,lenght,is_digit,is_word,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers,Times
0,631XniVx2lS5I,13,0,False,0,0,0,0,11,2
1,LEGIT747,8,0,False,0,0,0,0,6,1
2,742364es,8,0,False,0,0,0,0,5,1
3,3846696477,10,1,False,0,0,0,0,6,1
4,laurahop,8,0,True,0,0,0,0,3,2


In [0]:
clf = tree.DecisionTreeRegressor(max_depth=20, min_samples_leaf=3)

In [15]:
clf.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [16]:
prediction = clf.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.35885380668870676

# Random Forest

In [0]:
from sklearn.ensemble import RandomForestRegressor

In [0]:
param_grid = {'n_estimators' : [5, 10, 15],
              'min_samples_leaf': range(3, 5),
              'max_depth': range(7, 12)}
grid = RandomForestRegressor(n_estimators=3, max_depth=13, criterion='mse')

In [0]:
%%time
grid.fit(X_train,np.array(y_train).ravel())

CPU times: user 10.8 s, sys: 4.98 ms, total: 10.8 s
Wall time: 10.8 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=13,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [0]:
prediction = grid.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.35888781043403806

# XGB regression(The best one)

In [23]:
X = data_1.iloc[:, 1:-1]
y = data_1.iloc[:,-1:]
y = np.array(y)
y = np.log(1 + y)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=1)
data_1.head()

Unnamed: 0,Password,lenght,is_digit,is_word,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers,Times
0,631XniVx2lS5I,13,0,False,0,0,0,0,11,2
1,LEGIT747,8,0,False,0,0,0,0,6,1
2,742364es,8,0,False,0,0,0,0,5,1
3,3846696477,10,1,False,0,0,0,0,6,1
4,laurahop,8,0,True,0,0,0,0,3,2


In [0]:
gbregr = XGBRegressor(learning_rate=0.18, random_state=1)

In [0]:
%%time
gbregr.fit(X_train, y_train)

CPU times: user 2min 15s, sys: 265 ms, total: 2min 15s
Wall time: 2min 15s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.18, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=1,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [0]:
prediction = gbregr.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.36404312959155977

In [0]:
%%time
from sklearn.model_selection import RandomizedSearchCV

import scipy.stats as st

one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {  
    "n_estimators": st.randint(3, 40),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

xgbreg = XGBRegressor(nthread=-1)
gs = RandomizedSearchCV(xgbreg, params, n_jobs=1)
gs.fit(X_train, y_train)  
gs.best_params_  

In [26]:
gs.best_params_  

{'colsample_bytree': 0.9517160585200629,
 'gamma': 0.9786331427441008,
 'learning_rate': 0.4065549865794857,
 'max_depth': 31,
 'min_child_weight': 9.579856260788139,
 'n_estimators': 26,
 'reg_alpha': 14.589646699111253,
 'subsample': 0.9535587561843587}

In [27]:
gs.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9517160585200629,
             gamma=0.9786331427441008, importance_type='gain',
             learning_rate=0.4065549865794857, max_delta_step=0, max_depth=31,
             min_child_weight=9.579856260788139, missing=None, n_estimators=26,
             n_jobs=1, nthread=-1, objective='reg:linear', random_state=0,
             reg_alpha=14.589646699111253, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.9535587561843587, verbosity=1)

In [28]:
prediction = gs.best_estimator_.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.3589068912967524

# SGD_Regressor

In [0]:
param_grid = {'penalty' : ['none', 'l2', 'l1']
              }
grid = SGDRegressor(penalty='l2')

In [0]:
grid.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [0]:
prediction = grid.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.3783266903970637

In [0]:
grid.coef_

array([-0.0070942 ,  0.08044519,  0.04059449,  0.20484782,  1.17560447,
        0.07390095,  0.50806416, -0.022597  ])

# Classification task


In [0]:
X = data_1.iloc[:, 1:-1]
y = data_1.iloc[:,-1:]
data_1.head()

Unnamed: 0,Password,lenght,number_letter,is_digit,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers,Times
0,631XniVx2lS5I,13,0,0,0,0,0,0,11.0,2
1,LEGIT747,8,0,0,0,0,0,0,6.0,1
2,742364es,8,0,0,0,0,0,0,5.0,1
3,3846696477,10,0,1,0,0,0,0,6.0,1
4,laurahop,8,1,0,0,0,0,0,3.0,2


In [0]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=1)
y_train1 = y_train.copy()
y_train = np.array(y_train).ravel()
y_train[y_train == 1] = 1
y_train[y_train == 2] = 2
y_train[y_train == 3] = 3
y_train[y_train == 4] = 4
y_train[y_train == 5] = 5
y_train[y_train == 6] = 6
y_train[y_train == 7] = 7
y_train[y_train == 8] = 8
y_train[y_train == 9] = 9
y_train[y_train == 10] = 10
y_train[(y_train > 10) * (y_train <= 100)] = 8
y_train[(y_train > 100) * (y_train <= 1000)] = 100
y_train[(y_train > 1000) * (y_train <= 50000)] = 1000
#y_train[(y_train > 10000)] = 10000

In [0]:
%%time
clf = tree.DecisionTreeClassifier(random_state=1, max_depth=17, min_samples_leaf=1, criterion='entropy')
clf.fit(X_train, y_train)

CPU times: user 3.92 s, sys: 2 ms, total: 3.92 s
Wall time: 3.92 s


In [0]:
prediction = np.array(clf.predict(X_test))
np.sqrt(mean_squared_log_error( y_test, prediction))

0.3898891250053455

In [0]:
%%time
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=7, max_depth=20, min_samples_leaf=2)
forest.fit(X_train, y_train)

CPU times: user 12.6 s, sys: 6.95 ms, total: 12.6 s
Wall time: 12.6 s


In [0]:

prediction = np.array(forest.predict(X_test))
np.sqrt(mean_squared_log_error( y_test, prediction ))

0.38983788378277096

In [0]:
clf = LogisticRegression(solver = 'saga', multi_class='multinomial', C=1)
clf.fit(X_train,y_train)

In [0]:
np.sqrt(mean_squared_log_error( y_test, clf.predict(X_test) ))

0.4284085522326836

### GradientBoosting

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
gbc = GradientBoostingClassifier(n_estimators=5)

In [0]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=5,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [0]:
prediction = np.array(gbc.predict(X_test))
np.sqrt(mean_squared_log_error( y_test, prediction ))

0.432870340767327

In [0]:
from xgboost import XGBClassifier

In [0]:
xgb = XGBClassifier()

In [0]:
%%time
xgb.fit(X_train, y_train)

CPU times: user 23min 8s, sys: 4.73 s, total: 23min 13s
Wall time: 23min 12s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
prediction = np.array(xgb.predict(X_test))
np.sqrt(mean_squared_log_error( y_test, prediction ))

0.4007398694050032

# Linear Regression

In [0]:
from sklearn.linear_model import LinearRegression

In [0]:
X = data_1.iloc[:, 1:-1]
y = data_1.iloc[:,-1:]
y = np.array(y)
y = np.log(1 + y)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=1)
data_1.head()

Unnamed: 0,Password,lenght,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers,Times
0,631XniVx2lS5I,13,0,0,0,0,11,2
1,LEGIT747,8,0,0,0,0,6,1
2,742364es,8,0,0,0,0,5,1
3,3846696477,10,0,0,0,0,6,1
4,laurahop,8,0,0,0,0,3,2


In [0]:
lrg = LinearRegression(normalize=True)
lrg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [0]:
prediction = lrg.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.3775823387397288

# Stacking

In [29]:
X = data_1.iloc[:, 1:-1]
y = data_1.iloc[:,-1:]
y = np.array(y)
y = np.log(1 + y)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=1)
data_1.head()

Unnamed: 0,Password,lenght,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers,Times
0,631XniVx2lS5I,13,0,0,0,0,11,2
1,LEGIT747,8,0,0,0,0,6,1
2,742364es,8,0,0,0,0,5,1
3,3846696477,10,0,0,0,0,6,1
4,laurahop,8,0,0,0,0,3,2


In [0]:
from mlxtend.regressor import StackingRegressor

In [0]:
lr = LinearRegression()
extratree = ExtraTreesRegressor(n_estimators=7, max_depth=25)
tree_regr = tree.DecisionTreeRegressor(max_depth=20, min_samples_leaf=3)
lasso = Lasso(alpha=0.9, max_iter=1000, random_state=1)
forest = RandomForestRegressor(n_estimators=7, random_state=1)
sgd = SGDRegressor()
gbregr = XGBRegressor(learning_rate=0.18, random_state=1)

stregr = StackingRegressor(regressors=[tree_regr, lr, forest], meta_regressor=tree_regr)

In [35]:
%%time
stregr.fit(X_train, y_train)

  regr.fit(X, y)


CPU times: user 24.7 s, sys: 966 ms, total: 25.6 s
Wall time: 24.5 s


StackingRegressor(meta_regressor=DecisionTreeRegressor(criterion='mse',
                                                       max_depth=20,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=3,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort=False,
                                                       random_state=None,
                                                       splitter='best'),
                  refit=True,
                  regressors=[DecisionTreeRegressor(criterion='m

In [36]:
prediction = stregr.predict(X_test)
np.sqrt(mean_squared_error( y_test, prediction))

0.36391031183580097

# Analysis

In [0]:
df = pd.DataFrame({"predict": prediction, "pass":list(data.Password[y_test.index])})
df.join(y_test.reset_index(drop=True)).sort_values(by=["Times"])[-5000:]

Unnamed: 0,predict,pass,Times
642510,1,protein,26
216494,1,mom4u4mm,26
714561,1,74123,26
639587,10,101181,26
639475,10,12061975,26
594471,1,outlook,26
406881,1,lkjhgfd,26
89921,1,eagleeye,26
65310,1,24692469,26
290274,1,JAGUAR,26


In [0]:
data_1.sort_values(by=["Times"])[-8000:]

In [0]:
import nltk
from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [0]:
"Letmein" in words.words()


False

In [0]:
data_1[data_1.number_letter == 1][-200:]

Unnamed: 0,Password,lenght,number_letter,is_digit,is_polyndrom,is_data,is_sequence,onesymbol,indicators,morph_numbers,Times
4150725,treide,6,1,0,0,0,0,0,0,3.0,1
4150730,croclook,8,1,0,0,0,0,0,1,4.0,1
4150742,traplik,7,1,0,0,0,0,0,1,2.0,1
4150757,dillsboy,8,1,0,0,0,0,0,1,4.0,2
4150761,sidereg,7,1,0,0,0,0,0,1,2.0,1
4150765,ggthekin,8,1,0,0,0,0,0,1,4.0,1
4150766,andrzejeski,11,1,0,0,0,0,0,1,4.0,1
4150767,smpdfgsmpdfg,12,1,0,0,0,0,0,0,8.0,1
4150768,beloit,6,1,0,0,0,0,0,1,2.0,16
4150776,renodoug,8,1,0,0,0,0,0,0,4.0,1


# Test

In [0]:
read_file = open('test_morph_numbers.json')
test_morph_numbers = np.array(json.load(read_file))

In [0]:
'''
read_file = open('test_indicators.json')
test_indicators = np.array(json.load(read_file))
'''

In [19]:
%%time
data_test = pd.read_csv("Xtest.csv")
data_test.fillna("*****", inplace=True)

data_test.reset_index(drop=True, inplace=True)
# There is problem
data_2 = Create_DF(data_test, top_morphemes, test_morph_numbers)


CPU times: user 10.5 s, sys: 53.9 ms, total: 10.5 s
Wall time: 10.5 s


In [22]:
data_2

Unnamed: 0,Password,lenght,is_digit,is_word,is_polyndrom,is_data,is_sequence,onesymbol,morph_numbers
0,ThaisCunha,10,0,False,0,0,0,0,5
1,697775113,9,1,False,0,0,0,0,5
2,922a16922a,10,0,False,0,0,0,0,8
3,andy74,6,0,False,0,0,0,0,4
4,joemack,7,0,True,0,0,0,0,4


In [0]:
X_send = data_2.iloc[:,1:]
prediction = np.array(gs.best_estimator_.predict(X_send)).ravel()
prediction = np.exp(prediction) - 1
answer = pd.DataFrame({"Id": np.arange(data_test.shape[0]), "Times":prediction})

In [0]:
answer.to_csv("answer_xgb_grid.csv", index=False)

In [45]:
answer.head()

Unnamed: 0,Id,Times
0,0,1.056527
1,1,1.080191
2,2,1.04997
3,3,1.274256
4,4,1.208216
