In [693]:
# Import libraries and set desired options
import pickle
from pathlib2 import Path
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
%config InlineBackend.figure_format = 'retina'

In [46]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [47]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    # Split the data into the training and validation sets
    idx = int(round(X.shape[0] * ratio))
    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed, solver='liblinear').fit(X[:idx, :], y[:idx])
    # Prediction for validation set
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    # Calculate the quality
    score = roc_auc_score(y[idx:], y_pred)
    
    return score

In [48]:
def time_cross_val(X_train, y_train, estimator, n_splits=10):
    time_split = TimeSeriesSplit(n_splits=n_splits)
    cv_scores = cross_val_score(estimator, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=1)
    return cv_scores

In [804]:
def new_features(df_in):
    df = pd.DataFrame()
    df['morning'] = df_in['time1'].apply(lambda ts: 1 if ts.hour >= 7 and ts.hour <= 11 else 0)
    df['day'] = df_in['time1'].apply(lambda ts: 1 if ts.hour >= 12 and ts.hour <= 18 else 0)
    df['evening'] = df_in['time1'].apply(lambda ts: 1 if ts.hour >= 19 and ts.hour <= 23 else 0)
    df['night'] = df_in['time1'].apply(lambda ts: 1 if ts.hour >= 0 and ts.hour <= 6 else 0)
    
    pi = np.pi
    df['hour'] = df_in['time1'].apply(lambda ts: ts.hour)
    df['hour_sin'] = df['hour'].apply(lambda ts: np.sin(2*pi*ts/24.))
    df['hour_cos'] = df['hour'].apply(lambda ts: np.cos(2*pi*ts/24.))
    df['hour_sin_cos'] = df['hour_sin'] * df['hour_cos']
    df['hour_sin'] = StandardScaler().fit_transform(df['hour_sin'].values.reshape(-1, 1))
    df['hour_cos'] = StandardScaler().fit_transform(df['hour_cos'].values.reshape(-1, 1))
    df['hour_sin_cos'] = StandardScaler().fit_transform(df['hour_sin_cos'].values.reshape(-1, 1))
    df = pd.get_dummies(df, columns=['hour'])
    
    
    
    df['start_month'] = df_in['time1'].apply(lambda ts: 100 * ts.year + ts.month).astype('float64')
    df = pd.get_dummies(df, columns=['start_month'])
    df['start_month'] = df_in['time1'].apply(lambda ts: 100 * ts.year + ts.month).astype('float64')
    df['start_month'] = StandardScaler().fit_transform(df[['start_month']])
    
    df['day_of_week'] = df_in['time1'].apply(lambda ts: ts.dayofweek).astype('float64')
    df = pd.get_dummies(df, columns=['day_of_week'])
    
    df['session_lenth'] = (full_times.max(axis=1) - full_times.min(axis=1)).astype('timedelta64[ms]').astype(int)
    df['session_lenth'] = StandardScaler().fit_transform(df['session_lenth'].values.reshape(-1, 1))
    
    #Getting Unique Counts
    unique_count = []
    for row in full_sites.values:
        unique = np.unique(row)
        if 0 in unique:
            unique_count.append(len(unique) - 1)
        else:
            unique_count.append(len(unique))
    unique_count = np.array(unique_count).reshape(-1,1)
    df['unique'] = unique_count 
    #df['unique'] = StandardScaler().fit_transform(df['unique'].values.reshape(-1, 1))
    
    times = ['time%s' % i for i in range(1, 11)]
    for time in times:
        name = 'ps_' + time
        df[name] = df_in[time].apply(lambda ts: ts.timestamp() if isinstance(ts, pd.Timestamp) else 0)
        df[name] = StandardScaler().fit_transform(df[name].values.reshape(-1, 1))
        
    def top_sites_count(sites):
        sites_count = 0
        top_sites = [77, 80, 76, 29, 21, 81, 879, 22, 75, 82, 23, 35, 881, 37, 33, 3000, 733, 30, 78, 941]
        for site in sites:
            if site in top_sites:
                sites_count += 1
        return sites_count
    
    def top_unic_sites_count(sites):
        unic_sites_count = 0
        top_sites = [77, 80, 76, 29, 21, 81, 879, 22, 75, 82, 23, 35, 881, 37, 33, 3000, 733, 30, 78, 941]
        used_sites = []
        for site in sites:
            if site in top_sites and site not in used_sites:
                used_sites.append(site)
                unic_sites_count += 1
        return unic_sites_count
    
    sites = ['site%s' % i for i in range(1, 11)]
    
    df['top_sites'] = df_in[sites].apply(lambda sites: top_sites_count(sites), axis=1)
    df['unic_top_sites'] = df_in[sites].apply(lambda sites: top_unic_sites_count(sites), axis=1)
    
    df['power_top_sites'] = df['top_sites'] * df['unic_top_sites']
    
    #df = df.drop('top_sites', axis=1)
    #df = df.drop('unic_top_sites', axis=1)
    
    df['top_sites'] = StandardScaler().fit_transform(df['top_sites'].values.reshape(-1, 1))
    df['unic_top_sites'] = StandardScaler().fit_transform(df['unic_top_sites'].values.reshape(-1, 1))
    df['power_top_sites'] = StandardScaler().fit_transform(df['power_top_sites'].values.reshape(-1, 1))
     
    return df

In [668]:
def add_feature_and_df_split(base_sparce_df, new_feature_df, split:int):
    X_train = hstack([base_sparce_df[:split,:], new_feature_df[:split]])
    X_test = hstack([base_sparce_df[split:,:], new_feature_df[split:]])
    return X_train, X_test

In [669]:
def delta_times(times_df):
    delta_time_df = pd.DataFrame()
    count_of_colums = len(times_df.columns)
    for i in range(1, count_of_colums):
        column_name = f'd_time{i}'
        delta_time_df[column_name] = times_df.apply(lambda ts_columns: (ts_columns[f'time{i+1}'] - ts_columns[f'time{i}']).total_seconds() if not pd.isnull(ts_columns[f'time{i+1}']) else 0, axis=1)
        delta_time_df[column_name] = StandardScaler().fit_transform(delta_time_df[column_name].values.reshape(-1, 1))
    return delta_time_df

In [670]:
NUM_SITES = 10

In [671]:
# Read the training and test data sets, change paths if needed
PATH_TO_DATA = Path(r'D:\Programming\DS\mlcourse\course\jupyter_english\assignments_fall2019\Alice')

times_train = ['time%s' % i for i in range(1, NUM_SITES + 1)]
train_df = pd.read_csv(PATH_TO_DATA / 'train_sessions3.csv',
                       index_col='session_id', parse_dates=times_train)

#times_drop = ['time%s' % i for i in range(11, NUM_SITES + 1)]
#train_df = train_df.drop(times_drop, axis=1)
#train_df['is_train'] = 1

times_test = ['time%s' % i for i in range(1, 11)]
test_df = pd.read_csv(PATH_TO_DATA / 'test_sessions3.csv',
                      index_col='session_id', parse_dates=times_test)
test_df['is_train'] = 0

# Sort the data by time
train_df = train_df.sort_values(by='time1')

In [672]:
# Change site1, ..., site10 columns type to integer and fill NA-values with zeros
sites = ['site%s' % i for i in range(1, NUM_SITES + 1)]
train_df[sites] = train_df[sites].fillna(0).astype(np.uint16)

# Load websites dictionary
with open(PATH_TO_DATA / 'site_dic.pkl', "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), 
                          columns=['site'])

In [673]:
# Change site1, ..., site10 columns type to integer and fill NA-values with zeros
sites = ['site%s' % i for i in range(1, 11)]
test_df[sites] = test_df[sites].fillna(0).astype(np.uint16)

In [674]:
# United dataframe of the initial data 
sites = ['site%s' % i for i in range(1, NUM_SITES + 1)]
full_df = pd.concat([train_df.drop('target', axis=1), test_df], sort=False)
full_df[sites] = full_df[sites].fillna(0).astype(np.uint16)
# Index to split the training and test data sets
idx_split = train_df.shape[0]

In [675]:
full_sites = full_df[sites]

In [676]:
# create an inverse id _> site mapping
id2site = {v:k for (k, v) in site_dict.items()}
# we treat site with id 0 as "unknown"
id2site[0] = 'unknown'

full_sites_names = full_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                 ' '.join([id2site[i] for i in row]), axis=1).tolist()

In [731]:
# we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
# so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
# to be split into 'mail', 'google' and 'com')

vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}

vectorizer = CountVectorizer(**vectorizer_params)
full_sites_vect = vectorizer.fit_transform(full_sites_names)

y_train = train_df['target'].apply(lambda y: 1 if y > 0 else 0)

In [696]:
# we'll need site visit times for further feature engineering
times = ['time%s' % i for i in range(1, 11)]
full_times = full_df[times]

In [805]:
new_features_df = new_features(full_df)

In [806]:
from sklearn.decomposition import TruncatedSVD
TSVD = TruncatedSVD(n_components=5, n_iter=7, random_state=17)

In [807]:
# we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
# so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
# to be split into 'mail', 'google' and 'com')

vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}

vectorizer_sel = TfidfVectorizer(**vectorizer_params)
full_sites_vect_sel = vectorizer.fit_transform(full_sites_names)

new_full_matrix = pd.DataFrame(TSVD.fit_transform(full_sites_vect_sel), index=full_df.index)

In [808]:
new_features_df = pd.concat([new_features_df, new_full_matrix], axis=1)

In [809]:
X_train, X_test = add_feature_and_df_split(full_sites_vect, new_features_df, idx_split)

In [810]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X_train, y_train)
# display the relative importance of each attribute
print(model.feature_importances_)



[0.00000000e+00 1.25230334e-04 9.24880006e-10 ... 6.27017119e-03
 6.77251049e-03 7.33349977e-03]


In [828]:
from sklearn.feature_selection import RFE

model_lr = LogisticRegression(random_state=17, solver='liblinear')

df_sel  = new_features_df[:idx_split]

# create the RFE model and select 3 attributes
rfe = RFE(model_lr, 15)
rfe = rfe.fit(df_sel, y_train)
# summarize the selection of the attributes

print("Num Features: ", rfe.n_features_ )
print("Selected Features: ", rfe.support_ )
print("Feature Ranking: ", rfe.ranking_ )

Num Features:  15
Selected Features:  [ True False  True False False False False False  True  True  True False
 False False  True  True False False False False False False False False
  True False  True False False False False  True  True  True False False
 False False False False False False False False False False False False
 False False False  True  True False False  True False False False False
 False False False False False False False False False False False False
 False False False False]
Feature Ranking:  [ 1 10  1 58 30 38 39 52  1  1  1 19  5 46  1  1  4  3  2 29 37 22 40 48
  1 27  1 31  9  6 50  1  1  1 47 32  7 42 36 41 59 60 56 57 55 54 62 61
 34  8 28  1  1 43 44  1 23 53 33 25 15 24 17 13 18 16 26 35 21 45 51 14
 12 11 20 49]


In [830]:
features = list(new_features_df.columns)
selected_f = []
f_analyze = {}
for i in range(len(features)):
    f_analyze[features[i]] = rfe.support_[i]
    if rfe.support_[i]:
        selected_f.append(features[i])

In [831]:
f_analyze

{'morning': True,
 'day': False,
 'evening': True,
 'night': False,
 'hour_sin': False,
 'hour_cos': False,
 'hour_sin_cos': False,
 'hour_7': False,
 'hour_8': True,
 'hour_9': True,
 'hour_10': True,
 'hour_11': False,
 'hour_12': False,
 'hour_13': False,
 'hour_14': True,
 'hour_15': True,
 'hour_16': False,
 'hour_17': False,
 'hour_18': False,
 'hour_19': False,
 'hour_20': False,
 'hour_21': False,
 'hour_22': False,
 'hour_23': False,
 'start_month_201301.0': True,
 'start_month_201302.0': False,
 'start_month_201303.0': True,
 'start_month_201304.0': False,
 'start_month_201305.0': False,
 'start_month_201306.0': False,
 'start_month_201307.0': False,
 'start_month_201308.0': True,
 'start_month_201309.0': True,
 'start_month_201310.0': True,
 'start_month_201311.0': False,
 'start_month_201312.0': False,
 'start_month_201401.0': False,
 'start_month_201402.0': False,
 'start_month_201403.0': False,
 'start_month_201404.0': False,
 'start_month_201405.0': False,
 'start_month_

In [832]:
selected_f

['morning',
 'evening',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_14',
 'hour_15',
 'start_month_201301.0',
 'start_month_201303.0',
 'start_month_201308.0',
 'start_month_201309.0',
 'start_month_201310.0',
 'day_of_week_2.0',
 'day_of_week_3.0',
 'day_of_week_6.0']

In [833]:
selected_features_df = new_features_df[selected_f]

In [834]:
X_train, X_test = add_feature_and_df_split(full_sites_vect, selected_features_df, idx_split)

In [835]:
logit = LogisticRegression(C=1.668, random_state=17, solver='liblinear')

In [836]:
%%time
cv_scores = time_cross_val(X_train, y_train, logit)
cv_scores, cv_scores.mean()

Wall time: 57.3 s


(array([0.88044675, 0.88029544, 0.96444604, 0.97337936, 0.92606693,
        0.97172761, 0.9137084 , 0.95841284, 0.90306859, 0.96464815]),
 0.9336200127221399)

In [837]:
c_values = np.logspace(-2, 2, 10)
time_split = TimeSeriesSplit(n_splits=10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

In [838]:
%%time
logit_grid_searcher.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 11.4min finished


Wall time: 11min 29s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.668, class_weight=None,
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_sco

In [839]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_, logit_grid_searcher.best_estimator_

(0.9358787473542577,
 {'C': 0.21544346900318834},
 LogisticRegression(C=0.21544346900318834, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False))

In [840]:
logit_best = logit_grid_searcher.best_estimator_
y_test = logit_best.predict_proba(X_test)[:, 1]

# Write it to the file which could be submitted
write_to_submission_file(y_test, '17___baseline_1.csv')