In [1]:
import numpy as np
import pandas as pd
import enchant
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('train.csv').values
english_words = enchant.Dict("en_US")

In [4]:
def get_year(y):
    if 0 <= y <= 18:
        return 18 - y
    return 118 - y

def check_first_format(password):
    d, m, y = map(int, [password[0:2], password[2:4], password[4:]])
    if d <= 31 and m <= 12 and (60 <= y <= 99 or 0 <= y <= 18):
        return get_year(y)
    return 0

def check_second_format(password):
    d, m, y = map(int, [password[0:2], password[2:4], password[4:]])
    if d <= 31 and m <= 12 and 1960 <= y <= 2018:
        return get_year(y)
    return 0

In [5]:
def is_birthday(password):
    if len(password) != 6 and len(password) != 8:
        return 0
    if not password.isdigit():
        return 0
    if len(password) == 6:
        return check_first_format(password)
    else:
        return check_second_format(password)

In [6]:
def get_features(password):
    if type(password) is not str:
        return np.array([0, 0, 0, 0])
    bd = is_birthday(password)
    ln = len(password)
    dc = sum(c.isdigit() for c in password)
    cleaned = ''.join([i for i in password if not i.isdigit()])
    cc = 1 if cleaned and english_words.check(cleaned) else 0
    return np.array([bd, ln, dc, cc], dtype=int)

In [7]:
def build_features(X):
    return np.array([get_features(x) for x in X])

In [8]:
X, y = data[:, 0], data[:, 1]

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(build_features(X), y, test_size=0.2, random_state=42)

In [26]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [25]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

model = DecisionTreeRegressor(random_state=42, criterion='mse', min_samples_split=2)

In [11]:
model.fit(X_train, np.log(y_train + 1))

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [108]:
from sklearn.metrics import mean_squared_log_error
score = np.sqrt(mean_squared_log_error(y_test, np.exp(y_pred) - 1))

In [109]:
score

0.34271955271234095

In [13]:
model.feature_importances_

array([0.63349916, 0.1104646 , 0.11641733, 0.13961891])

In [14]:
test_data = pd.read_csv('XTest.csv')

In [15]:
test_data.head()

Unnamed: 0,Id,Password
0,0,ThaisCunha
1,1,697775113
2,2,922a16922a
3,3,andy74
4,4,joemack


In [17]:
XX = build_features(test_data['Password'].values)

In [36]:
pd.DataFrame(XX).to_csv("XX.csv")

In [19]:
yy = np.exp(model.predict(XX)) - 1

In [20]:
d = {'Times': yy}

In [21]:
ans = pd.DataFrame(data=d)

In [22]:
ans.head()

Unnamed: 0,Times
0,1.143648
1,1.101263
2,1.03644
3,1.194335
4,1.416848


In [23]:
ans.to_csv('ans.csv', index=True, index_label='Id')

In [35]:
tricky = data[data[:,1] > 100][:,0]

In [36]:
tricky[:13]

array(['maxx', 'small', 'Superman', 'dang', 'parrot', 'freeman',
       'jackson1', 'hardcore', 'bugger', 'killer1', 'amelia', '12121212',
       'phillies'], dtype=object)

In [116]:
tricky_y = np.exp(model.predict(build_features(tricky)))

In [38]:
len(tricky)

3712

In [117]:
np.mean(tricky_y)

4.163007964681628