In [58]:
import time
import math
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy.sparse import hstack
from sklearn.preprocessing import MaxAbsScaler
from sklearn.ensemble import ExtraTreesRegressor
from scipy import stats
from IPython.display import Image
from sklearn.datasets import load_iris, load_boston
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

# Чтение данных

In [59]:
data1 = pd.read_csv("data/train.csv", delimiter=',')

In [60]:
data1.head()

Unnamed: 0,Password,Times
0,631XniVx2lS5I,2
1,LEGIT747,1
2,742364es,1
3,3846696477,1
4,laurahop,2


In [61]:
data = data1[data1['Password'].notnull()]

In [62]:
data[data['Password'].isnull()]

Unnamed: 0,Password,Times


In [63]:
%%time
data['pl'] = data.apply(lambda x: len(x['Password']), axis=1)

Wall time: 53.2 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [64]:
%%time
data['timeslg'] = data.apply(lambda x: math.log(float(x['Times']+1)), axis=1)

Wall time: 52.2 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [65]:
data_test = pd.read_csv("data/test.csv", delimiter=',')

In [66]:
data_test[data_test['Password'].isnull()]

Unnamed: 0,Id,Password
581317,581317,


In [67]:
data_test.fillna('', inplace=True)

In [68]:
%%time
data_test['pl'] = data_test.apply(lambda x: len(x['Password']), axis=1)

Wall time: 13.1 s


In [69]:
size_train1 = data1.shape[0]
size_train1

4151496

In [70]:
size_train = data.shape[0]
size_train

4151494

In [71]:
size_test = data_test.shape[0]
size_test

1037875

In [72]:
data_all = pd.concat([data[['Password', 'pl']], data_test[['Password', 'pl']]])

In [73]:
assert data_all.shape[0] == size_train + size_test

# Простая модель

In [103]:
def prep_data(mode, dx, ngram_range_param=(1,2)):
    if mode == 'TFIDF':
        vct = TfidfVectorizer(ngram_range=ngram_range_param, lowercase=False, analyzer='char')
    else:
        vct = CountVectorizer(ngram_range=ngram_range_param, lowercase=False, analyzer='char')
    mtr = vct.fit_transform(dx)
    return vct, mtr

In [101]:
def make_mtr_with_len(mtr, df_for_len):
    pl = df_for_len['pl'].values
    pl_xlen = pl.shape[0]
    pl = pl.reshape(pl_xlen,1)
    mtr_len = hstack([mtr, pl])
    return mtr_len

In [102]:
def crossval(regr, mtr, dy):
    scores = cross_val_score(regr, mtr, dy, scoring='neg_mean_squared_error', cv=3)
    scores2 = [math.sqrt(abs(x)) for x in scores]
    res = np.mean(scores2)
    return res

In [99]:
# mode := CNT | CNT_NORM | TFIDF
def test(mode, regr, dx, dy, ngram_range_param):
    t1 = time.time()
    vct, mtr = prep_data(mode, dx, ngram_range_param=ngram_range_param)
    
    mtr_len = mtr
    if mode != 'TFIDF':
        mtr_len = make_mtr_with_len(mtr, data)
    
    if mode == 'CNT_NORM':
        scaler = MaxAbsScaler()
        mtr_len_scaled = scaler.fit_transform(mtr_len)
    else:
        mtr_len_scaled = mtr_len
        
    t2 = time.time()
    metric = crossval(regr, mtr_len_scaled, dy)
    t3 = time.time()
    print(ngram_range_param, round(t2-t1,5), len(vct.get_feature_names()), round(t3-t2,5), abs(metric))

In [112]:
ranges = [(3,3), (3,4), (3,5), (3,6), (3,7), (3,8)]
dx = data['Password'].values
dy = data['timeslg'].values
for r in ranges:
    regr = SGDRegressor(max_iter=10000)
    test('CNT', regr, dx, dy, r)

(3, 3) 42.01268 278783 33.13641 0.39373599445693325
(3, 4) 100.24803 3447735 49.18448 0.39088849193260095
(3, 5) 165.50454 9724000 62.7682 0.3896813284989278
(3, 6) 233.18062 18003064 76.1953 0.39000208352918414


KeyboardInterrupt: 

## Кросс-валидация 

### Варианты векторизации

#### CountVectorizer без масштабирования

- (1, 1) 25.7362 97 26.95394 0.4024887735223093 (public score = 0.40558)
- (1, 2) 49.05489 6362 37.55855 0.39842204072813486
- (1, 3) 81.0733 285145 50.15662 0.3922787120891369
- (1, 4) 133.97283 3454097 67.8283 0.38913771220441956
- (1, 5) 202.01593 9730362 80.52773 0.38812025825466984 (public score = 0.39059)

**Ошибка между public score и рассчитанным значением около 0.003**

#### CountVectorizer с масштабированием (хуже чем без масшабирования)

- (1, 1) 29.5749 97 25.16173 0.4016805599798925
- (1, 2) 56.91584 6362 32.24679 0.3979992396997387
- (1, 3) 96.59076 285145 40.37506 0.3943876998371207
- (1, 4) 164.04843 3454097 57.23698 0.39200579288335996
- (1, 5) 237.59481 9730362 68.91077 0.3911842821339894

#### TfidfVectorizer

- (1, 1) 26.15211 97 24.28105 0.40712845598165326
- (1, 2) 50.12702 6362 28.72816 0.40429379279551075
- (1, 3) 81.55999 285145 32.50408 0.4023725819158713
- (1, 4) 153.65122 3454097 43.53063 0.4007192939804855
- (1, 5) 232.02674 9730362 52.2762 0.39881435223439055

### Подбор нграмм для CountVectorizer без масштабирования

- (1, 1) 25.7362 97 26.95394 0.4024887735223093 (public score = 0.40558)
- (1, 2) 49.05489 6362 37.55855 0.39842204072813486
- (1, 3) 81.0733 285145 50.15662 0.3922787120891369
- (1, 4) 133.97283 3454097 67.8283 0.38913771220441956
- (1, 5) 202.01593 9730362 80.52773 0.38812025825466984 (public score = 0.39059)
- (1, 6) 283.77105 18009426 93.33947 0.3886473827695837
- (1, 7) 325.74812 25714667 95.99437 0.38822769985118494
- (1, 8) 360.68475 31478469 99.73337 0.38817812990270945


- (2, 2) 34.29432 6265 28.66437 0.3969665119491872
- (2, 3) 63.8154 285048 41.56887 0.39215269435101024
- (2, 4) 125.76008 3454000 58.45369 0.3913767707174314
- (2, 5) 186.3927 9730265 76.36086 0.38943391829694524
- (2, 6) 262.76254 18009329 83.77103 0.3910493968722066


- (3, 3) 42.01268 278783 33.13641 0.39373599445693325
- (3, 4) 100.24803 3447735 49.18448 0.39088849193260095
- (3, 5) 165.50454 9724000 62.7682 0.3896813284989278
- (3, 6) 233.18062 18003064 76.1953 0.39000208352918414

#### DecisionTreeRegressor(min_samples_split=2, max_depth=30)

- (1, 2) public score = 0.39668


# Отправка решения

In [107]:
def save_res(y_pred, fn):
    df_res = data_test[['Id']]
    df_res['Times'] = y_pred
    df_res.to_csv('data/res/' + fn + '.csv', sep=',', index=None)

In [108]:
def fit_predict(regr, ngram_range_param=(1,1)):
    dx = data_all['Password'].values
    dy = data['timeslg'].values
    vct, mtr = prep_data('CNT', dx, ngram_range_param=ngram_range_param)
    mtr_len = make_mtr_with_len(mtr, data_all)
    mtr_res = mtr_len.tocsr()
    X_train = mtr_res[0:size_train,:]
    X_test = mtr_res[size_train:,:]
    print('preprocessed')
    regr.fit(X_train, dy)
    print('fit')
    y_pred = regr.predict(X_test)
    exp_y_pred = np.expm1(y_pred)
    return exp_y_pred

In [113]:
%%time
regr = SGDRegressor(max_iter=10000)
y_pred_sgd = fit_predict(regr, ngram_range_param=(1,5))
save_res(y_pred_sgd, 'mse_sgd_15')

preprocessed
fit


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Wall time: 10min 4s


In [None]:
#tree = RandomForestRegressor()
#tree = DecisionTreeRegressor(min_samples_split=2, max_depth=30)
#tree = ExtraTreesRegressor(min_samples_split=2, max_depth=25)

In [114]:
%%time
regr = DecisionTreeRegressor(min_samples_split=2, max_depth=30)
y_pred_sgd = fit_predict(regr, ngram_range_param=(1,2))
save_res(y_pred_sgd, 'mse_tree_12')

preprocessed
fit


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Wall time: 1h 9min


# Выводы

1. Если использовать в качестве признаков только одиночные буквы (без нграмм) и слабую модель SGDRegressor, но правильно преобразовать целевой признак, то уже удается превысить константный baseline.
1. Недостаток использования нграмм состоит в том, что получается очень большое количество признаков, и сложные модели учатся очень долго.
1. Лучшим вариантом оказался SGDRegressor на нграммах (1, 5), public score = 0.39059
