In [1]:
# Import libraries and set desired options
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib2 import Path
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
sns.set()
%config InlineBackend.figure_format = 'retina'

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to ``out_file`` as a two-column CSV.

    Rows are numbered 1..N in the index column (named ``index_label``);
    the predictions go into a single column named ``target``.

    Parameters
    ----------
    predicted_labels : array-like exposing ``.shape``
        One prediction per row.
    out_file : path or writable buffer
        Passed straight to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the index column.
    """
    n_rows = predicted_labels.shape[0]
    row_ids = np.arange(1, n_rows + 1)  # ids are 1-based
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [3]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    """Hold-out ROC AUC of a logistic regression.

    The first ``ratio`` share of the rows (in their given order) is used for
    fitting, the remaining rows for validation.

    Parameters
    ----------
    X : row-sliceable 2-D matrix
    y : 1-D array of labels aligned with the rows of ``X``
    C : float, inverse regularization strength passed to LogisticRegression
    seed : int, ``random_state`` of the classifier
    ratio : float, fraction of rows used for training

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on the validation rows.
    """
    n_train = int(round(X.shape[0] * ratio))
    X_fit, X_val = X[:n_train, :], X[n_train:, :]
    y_fit, y_val = y[:n_train], y[n_train:]

    model = LogisticRegression(C=C, random_state=seed, solver='liblinear')
    model.fit(X_fit, y_fit)

    # column 1 of predict_proba is the probability of the positive class
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)

In [4]:
def time_cross_val(X_train, y_train, estimator, n_splits=10):
    """Cross-validate ``estimator`` with time-ordered folds.

    Uses ``TimeSeriesSplit(n_splits)`` as the splitter and ROC AUC as the
    metric, running in a single process.

    Returns
    -------
    numpy.ndarray
        One ROC AUC score per fold.
    """
    return cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=TimeSeriesSplit(n_splits=n_splits),
        scoring='roc_auc',
        n_jobs=1,
    )

In [16]:
def new_features(df_in, time_columns=None):
    """Build time-based session features from a sessions dataframe.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain a datetime column ``time1`` (session start). The session
        length feature additionally uses the columns in ``time_columns``.
    time_columns : list of str, optional
        Timestamp columns that span one session. By default every column of
        ``df_in`` named ``time<N>`` is used, so the function no longer
        depends on a notebook-level ``full_times`` variable.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df_in`` with, in order: the 0/1 flags ``morning``,
        ``day``, ``evening``, ``night``; the standardized ``start_month``;
        one-hot ``day_of_week_*`` columns; the standardized ``session_lenth``.
    """
    start = df_in['time1']
    hour = start.dt.hour

    if time_columns is None:
        session_times = df_in.filter(regex=r'^time\d+$')
    else:
        session_times = df_in[time_columns]

    df = pd.DataFrame(index=df_in.index)

    # Part-of-day indicators; the four hour ranges partition 0..23.
    # int64 is requested explicitly so the dtype does not depend on the platform.
    df['morning'] = hour.between(7, 11).astype('int64')
    df['day'] = hour.between(12, 18).astype('int64')
    df['evening'] = hour.between(19, 23).astype('int64')
    df['night'] = hour.between(0, 6).astype('int64')

    # Year-month encoded as YYYYMM, then standardized
    df['start_month'] = (100 * start.dt.year + start.dt.month).astype('float64')
    df['start_month'] = StandardScaler().fit_transform(df[['start_month']]).ravel()

    # Kept as float so the dummy columns are still named day_of_week_0.0, ...
    df['day_of_week'] = start.dt.dayofweek.astype('float64')
    df = pd.get_dummies(df, columns=['day_of_week'])

    # Session length in whole milliseconds: last minus first non-missing timestamp.
    # Floor division by a Timedelta gives integers without relying on
    # version-dependent ``astype('timedelta64[ms]')`` semantics.
    # The column name 'session_lenth' (sic) is kept for compatibility.
    session_span = session_times.max(axis=1) - session_times.min(axis=1)
    df['session_lenth'] = session_span // pd.Timedelta(milliseconds=1)
    df['session_lenth'] = StandardScaler().fit_transform(
        df['session_lenth'].values.reshape(-1, 1)).ravel()

    return df

In [6]:
def add_feature_and_df_split(base_sparce_df, new_feature_df, split:int):
    """Append extra feature columns to a sparse matrix and split it by row.

    Parameters
    ----------
    base_sparce_df : row-sliceable scipy sparse matrix
        Base features, one row per session.
    new_feature_df : DataFrame or 2-D array with the same number of rows
        Additional feature columns, appended to the right of the base ones.
    split : int
        Number of leading rows that form the training part.

    Returns
    -------
    (X_train, X_test)
        CSR matrices holding rows ``[:split]`` and ``[split:]``. CSR is
        requested explicitly because ``hstack`` otherwise returns COO, which
        does not support the row slicing done by ``get_auc_lr_valid``.
    """
    X_train = hstack([base_sparce_df[:split, :], new_feature_df[:split]], format='csr')
    X_test = hstack([base_sparce_df[split:, :], new_feature_df[split:]], format='csr')
    return X_train, X_test

In [7]:
def delta_times(times_df):
    """Standardized gaps between consecutive visit timestamps.

    Parameters
    ----------
    times_df : pandas.DataFrame
        Datetime columns named ``time1`` ... ``timeK`` (K = number of columns).

    Returns
    -------
    pandas.DataFrame
        Columns ``d_time1`` ... ``d_time{K-1}``. ``d_time{i}`` is the number of
        seconds between ``time{i}`` and ``time{i+1}``, set to 0 where
        ``time{i+1}`` is missing, then standardized per column.
    """
    delta_time_df = pd.DataFrame()
    count_of_colums = len(times_df.columns)
    for i in range(1, count_of_colums):
        column_name = f'd_time{i}'
        current_visit = times_df[f'time{i}']
        next_visit = times_df[f'time{i+1}']

        # Vectorized column arithmetic instead of a Python-level row-wise apply
        gap_seconds = (next_visit - current_visit).dt.total_seconds()
        # No following visit -> gap of 0 seconds
        gap_seconds = gap_seconds.where(next_visit.notna(), 0.0)

        scaled = StandardScaler().fit_transform(gap_seconds.values.reshape(-1, 1))
        # Wrap in a Series so the result keeps the index of times_df
        delta_time_df[column_name] = pd.Series(scaled.ravel(), index=times_df.index)
    return delta_time_df

In [8]:
# Read the training and test data sets.
# The data directory can be set through the ALICE_DATA_DIR environment variable;
# when it is not set, the original local path is used as the default.
PATH_TO_DATA = Path(os.environ.get(
    'ALICE_DATA_DIR',
    r'D:\Programming\DS\mlcourse\course\jupyter_english\assignments_fall2019\Alice'))

# time1 ... time10 are parsed as datetimes while reading
times = ['time%s' % i for i in range(1, 11)]
train_df = pd.read_csv(PATH_TO_DATA / 'train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv(PATH_TO_DATA / 'test_sessions.csv',
                      index_col='session_id', parse_dates=times)

# Sort the training sessions chronologically by their first timestamp
train_df = train_df.sort_values(by='time1')

In [9]:
# Stack the training sessions (without the label column) on top of the test sessions
full_df = pd.concat([train_df.drop(columns='target'), test_df])

# Row position in full_df where the test sessions begin
idx_split = len(train_df)

In [10]:
# Change site1, ..., site10 columns type to integer and fill NA-values with zeros
# NOTE(review): this overwrites the site columns of full_df in place, and
# uint16 assumes every site id is below 65536 — TODO confirm.
sites = ['site%s' % i for i in range(1, 11)]
full_df[sites] = full_df[sites].fillna(0).astype(np.uint16)

# Load websites dictionary
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# a site_dic.pkl that comes from a trusted source.
with open(PATH_TO_DATA / 'site_dic.pkl', "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary:
# the dictionary keys fill the 'site' column, the dictionary values form the index
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), 
                          columns=['site'])

In [11]:
# Invert site_dict: look up the site name by its numeric id
id2site = {site_id: site_name for site_name, site_id in site_dict.items()}
# id 0 is used for missing visits, so map it to a placeholder token
id2site[0] = 'unknown'

In [12]:
# Turn every session into one whitespace-separated string of site names
site_ids = full_df[sites].fillna(0).astype('int')
full_sites = site_ids.apply(
    lambda session: ' '.join(id2site[site_id] for site_id in session),
    axis=1,
).tolist()

In [13]:
# Tokenize on whitespace only: the default tokenizer would break a host name
# such as 'mail.google.com' into 'mail', 'google' and 'com'.
vectorizer_params = dict(
    ngram_range=(1, 5),
    max_features=50000,
    tokenizer=lambda s: s.split(),
)

# TF-IDF over sequences of 1 to 5 consecutive sites, fitted on train + test sessions
vectorizer = TfidfVectorizer(**vectorizer_params)
full_sites_vect = vectorizer.fit_transform(full_sites)

# Integer labels of the training sessions
y_train = train_df['target'].astype('int').values

In [14]:
# we'll need site visit times for further feature engineering
# (columns time1 ... time10 of full_df; training rows come first, then test rows)
full_times = full_df[times]

In [17]:
# Time-based features for every session in full_df (train and test together)
new_features_df = new_features(full_df)

In [18]:
# Append the time features to the TF-IDF matrix, then cut at idx_split:
# the first idx_split rows are the training sessions, the rest the test sessions
X_train, X_test = add_feature_and_df_split(full_sites_vect, new_features_df, idx_split)

In [37]:
# Logistic regression with the liblinear solver and a fixed seed.
# NOTE(review): C=1.668 looks like a tuned value; the search that produced it
# is not shown in this notebook — TODO confirm.
logit = LogisticRegression(C=1.668, random_state=17, solver='liblinear')

In [38]:
%%time
# Time-series CV of the logistic regression (time_cross_val defaults to 10 folds);
# the last line displays the per-fold ROC AUC scores and their mean.
# Saved run: mean ROC AUC of about 0.907 in 23.4 s.
cv_scores = time_cross_val(X_train, y_train, logit)
cv_scores, cv_scores.mean()

Wall time: 23.4 s


(array([0.81538376, 0.87141117, 0.87554055, 0.97624506, 0.93791066,
        0.97252473, 0.92442347, 0.95966785, 0.7721333 , 0.96758319]),
 0.9072823744783095)

In [33]:
from sklearn.neural_network import MLPClassifier

# Small multilayer perceptron: hidden layers of 7, 5 and 3 units, Adam solver,
# alpha=1e-5, fixed seed.
NN = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(7,5,3,), random_state=17)

In [34]:
%%time
# Time-series CV of the MLP; displays per-fold ROC AUC and the mean.
# Saved run: mean ROC AUC of about 0.843, wall time 1h 33min 45s — re-running is expensive.
cv_scores = time_cross_val(X_train, y_train, NN)
cv_scores, cv_scores.mean()

Wall time: 1h 33min 45s


(array([0.81870963, 0.78065961, 0.80266712, 0.97692241, 0.87050803,
        0.9138019 , 0.9162999 , 0.93049019, 0.56418961, 0.85900021]),
 0.843324860270401)

In [39]:
from sklearn import svm
# Linear support vector classifier with default settings and a fixed seed
SVM = svm.LinearSVC(random_state=17)

In [40]:
%%time
# Time-series CV of the linear SVM; displays per-fold ROC AUC and the mean.
# Saved run: mean ROC AUC of about 0.894 in 15.7 s.
cv_scores = time_cross_val(X_train, y_train, SVM)
cv_scores, cv_scores.mean()

Wall time: 15.7 s


(array([0.80630067, 0.85105696, 0.84768611, 0.98543051, 0.92968396,
        0.96146408, 0.9372008 , 0.95106432, 0.72142143, 0.9531153 ]),
 0.8944424129186173)

In [41]:
from sklearn.ensemble import RandomForestClassifier
# Random forest: 100 trees, each limited to depth 5, fixed seed
RF = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=17)

In [42]:
%%time
# Time-series CV of the random forest; displays per-fold ROC AUC and the mean.
# Saved run: mean ROC AUC of about 0.860 in 60 s.
cv_scores = time_cross_val(X_train, y_train, RF)
cv_scores, cv_scores.mean()

Wall time: 60 s


(array([0.74509985, 0.72882743, 0.83835484, 0.93183427, 0.85140338,
        0.89973064, 0.93029657, 0.92789995, 0.86043524, 0.88407482]),
 0.8597956990103931)

In [43]:
from sklearn import linear_model
# Linear classifier trained with SGD on the logistic loss (at most 1000 epochs,
# stopping tolerance 1e-3, fixed seed).
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and later
# releases reject 'log' — adjust the value when upgrading scikit-learn.
SGDC = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, loss='log', random_state=17)

In [44]:
%%time
# Time-series CV of the SGD classifier; displays per-fold ROC AUC and the mean.
# Saved run: mean ROC AUC of about 0.886 in 3.15 s.
cv_scores = time_cross_val(X_train, y_train, SGDC)
cv_scores, cv_scores.mean()

Wall time: 3.15 s


(array([0.78204053, 0.86988416, 0.89456299, 0.93114498, 0.92993962,
        0.9567261 , 0.88629256, 0.94955991, 0.71773593, 0.94156989]),
 0.8859456684179297)

In [22]:
import xgboost as xgb
# XGBoost regressor with a linear booster and a logistic objective.
# The first three settings were originally passed positionally as (10, 0.1, 1000).
# They are spelled out as keywords because newer xgboost releases do not accept
# positional constructor arguments.
# NOTE(review): the mapping below assumes the pre-1.0 parameter order
# (max_depth, learning_rate, n_estimators) — TODO confirm against the xgboost
# version this notebook was run with.
# NOTE(review): scale_pos_weight=109 is presumably the negative/positive class
# ratio of the training labels — verify.
XGB_reg = xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=1000,
                           objective='binary:logistic', random_state=17,
                           booster='gblinear', scale_pos_weight=109)

In [23]:
%%time
# Time-series CV of the XGBoost regressor, scored with ROC AUC like the classifiers;
# displays per-fold scores and the mean.
# Saved run: mean ROC AUC of about 0.826, wall time 13min 44s.
cv_scores = time_cross_val(X_train, y_train, XGB_reg)
cv_scores, cv_scores.mean()

Wall time: 13min 44s


(array([0.76879199, 0.80577042, 0.77758492, 0.74046828, 0.86718052,
        0.85215601, 0.88196134, 0.93701612, 0.7507068 , 0.88289932]),
 0.8264535713552238)

In [24]:
# XGBoost classifier with the linear booster; all other settings are library defaults
XGB_clf = xgb.XGBClassifier (random_state = 17, booster = 'gblinear')

In [26]:
%%time
# Time-series CV of the XGBoost classifier; displays per-fold ROC AUC and the mean.
# Saved run: mean ROC AUC of about 0.821 in 1min 33s.
cv_scores = time_cross_val(X_train, y_train, XGB_clf)
cv_scores, cv_scores.mean()

Wall time: 1min 33s


(array([0.73739678, 0.79940628, 0.79207199, 0.73970301, 0.86271045,
        0.83918773, 0.89495261, 0.92673776, 0.74520682, 0.87186432]),
 0.8209237754716991)

In [None]:
from sklearn.ensemble import VotingClassifier

# Aliases for the models defined in earlier cells
clf1 = logit
# NOTE(review): clf4 (the MLP) is assigned here but is not passed to the ensemble below
clf4 = NN
clf2 = RF
clf3 = SGDC

# Soft-voting ensemble of the logistic regression, random forest and SGD classifier
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('sgd', clf3)], voting='soft') 

In [None]:
%%time
# Time-series CV of the voting ensemble; displays per-fold ROC AUC and the mean.
# This cell has no saved output in the notebook.
cv_scores = time_cross_val(X_train, y_train, eclf)
cv_scores, cv_scores.mean()