In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from xgboost import XGBRegressor

import nltk
from nltk.corpus import words

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean((np.log(y_true + 1) - np.log(y_pred + 1)) ** 2))

In [3]:
def save_submission(y_pred):
    y_pred_df = pd.DataFrame(y_pred, columns=["Times"])
    y_pred_df.index.name = "Id"
    y_pred_df.to_csv("data/y_test.csv")

In [4]:
words_set = set(words.words())

## Data

In [5]:
data = pd.read_csv("data/train.csv")
data.dropna(subset=["Password"], inplace=True)

In [6]:
X = data[["Password"]]
y = data["Times"]

In [7]:
MODE = 'test'
if MODE == 'validation':
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
elif MODE == 'test':
    X_train = X.copy()
    y_train = y.copy()
    X_test = pd.read_csv('data/Xtest.csv', index_col=0).fillna("")

  mask |= (ar1 == a)


In [8]:
def preprocess_X(X_):
    X_ = X_.copy()
    X_["pass_len"] = X_["Password"].str.len()
    X_["numeric"] = X_["Password"].str.isdigit()
    X_["alphabetic"] = X_["Password"].str.isalpha()

    X_["lower_char"] = X_["Password"].str.islower()
    X_["upper_char"] = X_["Password"].str.isupper()

    X_["has_spec_char"] = X_["Password"].str.contains("^((?![a-zA-Z0-9]).)*$")

    X_["hard_pass"] = ~X_["numeric"] & ~X_["alphabetic"] & ~X_["lower_char"] & ~X_["upper_char"]

    X_["has_pass"] = X_["Password"].str.lower().str.contains("pass")
    X_["has_password"] = X_["Password"].str.lower().str.contains("password")
    X_["has_123"] = X_["Password"].str.contains("123")
    X_["has_12345"] = X_["Password"].str.contains("12345")
    X_["has_321"] = X_["Password"].str.contains("321")
    X_["has_54321"] = X_["Password"].str.contains("54321")
    X_["has_qazwsx"] = X_["Password"].str.lower().str.contains("qazwsx")
    X_["has_qwerty"] = X_["Password"].str.lower().str.contains("qwerty")
    X_["has_asdfgh"] = X_["Password"].str.lower().str.contains("asdfgh")
    X_["has_zxcvbn"] = X_["Password"].str.lower().str.contains("zxcvbn")

    X_["unique_symbols"] = X_["Password"].apply(lambda x: len(set(x)))
    X_["unique_symbols_share"] = (X_["unique_symbols"] / X_["pass_len"]).fillna(0)

    X_["complexity"] = (~X_["alphabetic"] * 10
                         + (~X_["numeric"] & ~X_["lower_char"]) * 26
                         + (~X_["numeric"] & ~X_["upper_char"]) * 26
                         + X_["has_spec_char"] * 32)
    X_["entropy"] = np.log(X_["complexity"]) / np.log(2) * X_["pass_len"]
    
    X_["is_word"] = X_["Password"].str.lower().apply(lambda x: x in words_set)
    X_["is_word_and_1sym"] = X_["Password"].str.lower().apply(lambda x: x[:-1] in words_set)
    X_["is_word_and_2sym"] = X_["Password"].str.lower().apply(lambda x: x[:-2] in words_set)
    X_["is_word_and_3sym"] = X_["Password"].str.lower().apply(lambda x: x[:-3] in words_set)
    
    X_["is_palindrome"] = X_["Password"] == X_["Password"].str[::-1]
#     X_["has_phone_number"] = X_["Password"].str.contains("89[0-8][0-9]{8}")
    X_["is_date"] = (X_["Password"].str.contains("^19[0-9]{2}[0-1][0-9][0-3][0-9]$")
                     & ~X_["Password"].str.contains("^19[0-9]{2}1[3-9][0-3][0-9]$")
                     | X_["Password"].str.contains("^20[0-9]{2}[0-1][0-9][0-3][0-9]$")
                     & ~X_["Password"].str.contains("^20[0-9]{2}1[3-9][0-3][0-9]$"))
    X_["is_date_with_decade"] = X_["Password"].loc[X_["is_date"]].copy().str[:3].astype(int)
    X_["is_date_with_decade"].fillna(0, inplace=True)

    X_.drop(["Password"], axis=1, inplace=True)
    
    return X_

In [9]:
%%time
X_train = preprocess_X(X_train)

  # Remove the CWD from sys.path while we load stuff.


Wall time: 2min 2s


In [10]:
%%time 
X_test = preprocess_X(X_test)

  # Remove the CWD from sys.path while we load stuff.


Wall time: 29.3 s


## Baseline

In [11]:
opt_const = np.exp(np.mean(np.log(y_train + 1))) - 1
print(f"Optimal score is {rmsle(y_train, opt_const)} with constant prediction of {opt_const}.")

Optimal score is 0.4092197443727306 with constant prediction of 1.265433528519321.


In [12]:
y_test_pred = pd.DataFrame(opt_const, index=X_test.index, columns=["Times"])
y_test_pred.index.name = "Id"
y_test_pred.to_csv("data/y_test.csv")

## Random Forest

In [13]:
X_train_ = X_train.astype(float)
X_test_ = X_test.astype(float)

In [87]:
%%time
np.sqrt(-cross_val_score(RandomForestRegressor(n_estimators=10, n_jobs=4, random_state=42),
                         X_train_, np.log(y_train + 1), scoring="neg_mean_squared_error", cv=3))

Wall time: 6min 6s


array([0.37622217, 0.37592861, 0.37641954])

In [14]:
%%time
model_rf = RandomForestRegressor(n_estimators=20, n_jobs=4, max_features=0.5, random_state=42)
model_rf.fit(X_train_, np.log(y_train + 1))

Wall time: 3min 58s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=0.5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=4,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [16]:
print(f"Train score: {rmsle(y_train, np.exp(model_rf.predict(X_train_)) - 1)}")
if MODE == "validation":
    print(f"Test score: {rmsle(y_test, np.exp(model_rf.predict(X_test_)) - 1)}")

Train score: 0.3750783255866485


In [17]:
y_test_pred = np.exp(model_rf.predict(X_test_)) - 1
save_submission(y_test_pred)

## XGBoost

In [118]:
X_train_ = X_train.astype(float)

In [123]:
%%time
np.sqrt(-cross_val_score(XGBRegressor(n_estimators=100, max_depth=5, n_jobs=4, random_state=42),
                         X_train_, np.log(y_train + 1), scoring="neg_mean_squared_error", cv=3))

Wall time: 21min 39s


array([0.37616835, 0.37588593, 0.37646179])

In [128]:
%%time
model_gb = XGBRegressor(n_estimators=100, max_depth=7, n_jobs=4, random_state=42, learning_rate=0.5)
model_gb.fit(X_train_, np.log(y_train + 1))
print(f"Train score: {rmsle(y_train, np.exp(model_gb.predict(X_train_)) - 1)}")

Train score: 0.3750912985946533
Wall time: 13min 27s


In [129]:
print(f"Train score: {rmsle(y_train, np.exp((model_rf.predict(X_train_) + model_gb.predict(X_train_)) / 2) - 1)}")

Train score: 0.37506122756145993
