In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

import re
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, make_scorer
from vowpalwabbit.sklearn_vw import VWClassifier, VW
import itertools
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse import csr_matrix, hstack

In [77]:
def sparsematrix(X):
    """Build a sparse per-row count matrix from a dense 2-D array of integer ids.

    Every row of ``X`` is treated as a bag of non-negative integer ids. Cell
    ``(r, k)`` of the intermediate matrix holds how often id ``k`` occurs in
    row ``r``; column 0 is then dropped, so id ``k`` lands in column ``k - 1``
    of the result (id 0 is presumably the "empty slot" padding value -- confirm
    against the data).

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape`` and row indexing (e.g. numpy array).

    Returns
    -------
    scipy.sparse.csr_matrix of shape ``(X.shape[0], max_id)``.
    """
    row = []
    col = []
    data = []
    for r in range(X.shape[0]):
        for site, num in Counter(X[r]).items():
            row.append(r)
            col.append(site)
            data.append(num)

    # Ids are used directly as column indices, so the width must cover the
    # largest id. The number of *distinct* ids is only large enough when the ids
    # happen to be exactly 0..k-1; otherwise csr_matrix rejects the indices.
    n_cols = int(max(col)) + 1 if col else 1

    print("Sparse Matrix - rows: {} columns: {}".format(X.shape[0], len(set(col))))
    return csr_matrix((data, (row, col)), shape=(X.shape[0], n_cols))[:,1:]


def sites_to_sparse_tfidf(train_data, test_data, target_col, session_length, label_encoder=False):
    """Turn site-id sessions into TF-IDF sparse matrices for train and test.

    Each session (columns ``site1`` .. ``site<session_length>``) becomes a
    whitespace-separated document of ``s_<id>`` tokens (id 0 / NaN is skipped),
    and a single ``TfidfVectorizer(max_df=0.9)`` is fitted on train and test
    together.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame holding the site columns;
        ``train_data`` must also hold ``target_col``.
    target_col : name of the label column in ``train_data``.
    session_length : number of ``site<N>`` columns that make up one session.
    label_encoder : any truthy value requests label encoding of the target.

    Returns
    -------
    list: ``[X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num,
    class_encoder, tfidf, train_duplicates_mask, test_duplicates_mask]``.
    ``y_for_vw`` (1-based encoded labels) and ``class_encoder`` are ``None``
    unless ``label_encoder`` is truthy. The duplicate masks are ``(n, 1)`` int
    arrays flagging rows whose site sequence occurs more than once in the
    same frame.
    """
    # One column list for both the duplicate check and the features, so the
    # function honours session_length instead of assuming 10 sites.
    site_cols = ['site' + str(c) for c in range(1, session_length + 1)]

    # keep=False marks every member of a duplicated group, not only the repeats.
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=site_cols, keep=False)].index)
    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=site_cols, keep=False)].index)
    train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
    test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

    y = train_data[target_col]

    train_test_df = pd.concat([train_data, test_data])
    # Materialise the dense matrix once; converting inside the comprehension
    # rebuilt it for every row.
    sites_matrix = train_test_df[site_cols].fillna(0).astype('int').values
    train_test_df_sites_array = [" ".join("s_" + str(s) for s in session if int(s) != 0)
                                 for session in sites_matrix]

    tfidf = TfidfVectorizer(max_df=0.9).fit(train_test_df_sites_array)
    X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

    X_train_sparse = X_train_test_sparse[:len(train_data)]
    X_test_sparse = X_train_test_sparse[len(train_data):]

    sites_columns_num = X_train_test_sparse.shape[1]

    y_for_vw = None
    class_encoder = None
    if label_encoder:
        class_encoder = LabelEncoder().fit(y.astype('str'))
        # +1 yields 1-based class labels (presumably for Vowpal Wabbit, per the name).
        y_for_vw = class_encoder.transform(y.astype('str')) + 1

    return [X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num, class_encoder, tfidf, \
             train_duplicates_mask, test_duplicates_mask]


def features_to_sparse(train_data, test_data, feature_cols):
    """Pack the chosen feature columns of each frame into a float CSR matrix.

    Column ``j`` of each result holds ``feature_cols[j]`` cast to float; the
    "day_of_week" and "daytime" columns are shifted by +1.

    Returns a two-element list: ``[train_matrix, test_matrix]``.
    """
    shifted_labels = ("day_of_week", "daytime")
    matrices = []
    for frame in (train_data, test_data):
        n_rows = frame.shape[0]
        values = []
        col_idx = []
        n_cols = 0
        for name in feature_cols:
            column = frame[[name]].values.T[0].astype('float')
            if name in shifted_labels:
                column = column + 1
            values.extend(column)
            col_idx.extend([n_cols] * len(column))
            n_cols += 1
        # Values were appended column by column, so the row ids repeat per column.
        row_idx = list(range(n_rows)) * n_cols
        matrices.append(csr_matrix((values, (row_idx, col_idx)),
                                   shape=(n_rows, n_cols), dtype=float))
    return matrices


def calc_site_times_portions(train_data, test_data):
    """Collect, for every session, the accumulated time spent on each site.

    Returns a two-element list ``[train_times, test_times]``; each element maps
    a row's index label to ``{site_id (int): accumulated time_diff}``.
    Despite the name, raw times are stored: the normalisation by
    ``session_timespan`` is commented out below.
    """
    site_times = [{},{}]
    count = 0
    for data in [train_data, test_data]:
        # Columns are picked by integer position (0-9 and 20-29); the loop body
        # then reads them by the names "site<c>" and "time_diff<c>".
        # NOTE(review): list + list on range() and integer-list column selection
        # rely on Python 2 / legacy pandas behaviour -- confirm before porting.
        for r, row in data[:][range(0, 10)+range(20,30)].iterrows():
            rowdic = {}
            # NOTE(review): range(1,10) visits site1..site9 only; site10 is never
            # read. Possibly intentional if there is no matching time_diff10 -- confirm.
            for c, s in [[c, 'site' + str(c)] for c in range(1,10)]:
                # Site id 0 is skipped.
                if row[s] == 0:
                    continue
                # Repeated visits to the same site within a session are summed.
                if row[s] in rowdic:
                    rowdic[int(row[s])] += row["time_diff"+str(c)]
                else:
                    rowdic[int(row[s])] = row["time_diff"+str(c)]
            site_times[count][r] = {}
            for site, time in rowdic.items():
                # A session with a single distinct site keeps its time even when
                # it is not positive; otherwise only positive times are stored.
                if len(rowdic) == 1:
                    site_times[count][r][int(site)] = time #1.0
                    continue
                if time > 0:
                    #site_times[count][r][int(site)] = round(float(time)/row["session_timespan"],3)
                    site_times[count][r][int(site)] = time
        count+=1
    return site_times

def site_times_to_sparse(sitetimes):
    """Stack per-session ``{site_id: time}`` dicts into one float CSR matrix.

    Parameters
    ----------
    sitetimes : sequence of dicts (e.g. ``[train, test]``), each mapping a row
        key to ``{site_id: value}``. Any number of dicts is accepted.

    Returns
    -------
    scipy.sparse.csr_matrix with one row per session, numbered in iteration
    order across the dicts. Site id ``k`` is stored in column ``k - 1``
    (column 0 is dropped). With no entries at all the result has zero columns.

    NOTE(review): rows follow dict iteration order, which is only guaranteed to
    be insertion order on Python 3.7+; callers that slice the result by
    ``len(train_data)`` rely on that order.
    """
    row = []
    col = []
    data = []
    rowcount = 0
    for sitetime in sitetimes:
        for r, sites in sitetime.items():
            for site, p in sites.items():
                col.append(site)
                row.append(rowcount)
                data.append(p)
            rowcount+=1

    # rowcount equals the total number of sessions across all dicts, so the
    # shape no longer assumes exactly two inputs.
    if not col:
        # max() of an empty list would raise; return an all-empty matrix instead.
        return csr_matrix((rowcount, 1), dtype=float)[:,1:]

    site_times_sparse = csr_matrix((data, (row, col)), shape=(rowcount, int(max(col))+1), \
                                   dtype=float)[:,1:]
    return site_times_sparse



def combine_sites_features_sparse(sites_train_sparse, features_train_sparse, \
                                  sites_test_sparse, features_test_sparse, \
                                  train_preds_sparse, test_preds_sparse,
                                  train_duplicates_mask = None, test_duplicates_mask = None, \
                                  train_site_times_sparse = None, test_site_times_sparse = None, \
                                train_sites_sequence=None, test_sites_sequence=None):
    """Horizontally stack the feature blocks into one float CSR matrix per split.

    When both site-time matrices are given the column order is
    ``sites | features | preds | site_times | sites_sequence | duplicates_mask``;
    otherwise it is ``sites | features | duplicates_mask`` (the prediction
    blocks are not used in that case).

    Optional blocks left as ``None`` are skipped instead of being passed to
    ``hstack``.

    Returns
    -------
    list: ``[X_train_sparse, X_test_sparse]``.
    """
    def _stack(blocks):
        # Drop absent optional blocks before stacking.
        return hstack([b for b in blocks if b is not None], dtype=float).tocsr()

    if train_site_times_sparse is not None and test_site_times_sparse is not None:
        train_blocks = [sites_train_sparse, features_train_sparse, train_preds_sparse,
                        train_site_times_sparse, train_sites_sequence]
        test_blocks = [sites_test_sparse, features_test_sparse, test_preds_sparse,
                       test_site_times_sparse, test_sites_sequence]
    else:
        train_blocks = [sites_train_sparse, features_train_sparse]
        test_blocks = [sites_test_sparse, features_test_sparse]

    # The duplicate-row indicator, when present, is always the last column.
    train_blocks.append(train_duplicates_mask)
    test_blocks.append(test_duplicates_mask)

    return [_stack(train_blocks), _stack(test_blocks)]


def sparse_matrix_to_vw(X_sparse_full, sites_columns_num, vocabulary, y=None, weights=None, mark_duplicates=False):
    """Convert the non-zero entries of a combined sparse matrix into per-row text fragments.

    The last column of ``X_sparse_full`` is treated as a duplicate-row indicator
    and excluded from the feature scan. Every remaining non-zero cell is routed,
    by its column index, into one of several dicts keyed by row number.

    The index arithmetic below implies this column layout (NOTE(review): inferred
    from the offsets, confirm against the caller that builds the matrix):
    ``sites (sites_columns_num) | features (len(mycolumns)) | predictions (550) |
    site times (sites_columns_num) | sequence (10) | duplicate flag (1)``.

    Depends on the module-level list ``mycolumns`` (feature column names), which
    must be defined before calling.

    Parameters
    ----------
    X_sparse_full : sparse matrix supporting slicing, ``nonzero()`` and ``[r, c]``.
    sites_columns_num : width of the leading sites block.
    vocabulary : maps a sites-block column index to a value convertible by ``int``.
    y : optional labels indexed by row number; when given, each "sites" fragment
        starts with the label.
    weights : optional per-class weights, indexed by ``y[r] - 1``.
    mark_duplicates : when true, rows flagged as duplicates get the fixed
        importance ``0.3`` instead of ``weights``.

    Returns
    -------
    dict of dicts (row -> text fragment), keyed by feature name.
    NOTE(review): the unique-sites entry is returned under the key
    ``"unique_site"`` (singular).
    """
    sessions = {}
    used = {}
    prediction = {}
    day_of_week = {}
    start_hour = {}
    daytime = {}
    unique_sites = {}
    top30_portion = {}
    fb_portion = {}
    youtube_portion = {}
    bot30_portion = {}
    site_longest_time = {}
    session_timespan = {}
    sitetimes = {}
    sequence = {}
    
    # Everything except the trailing duplicate-flag column.
    X_sparse = X_sparse_full[:,:-1]
    
    add_features = True

    for r, c in zip(X_sparse.nonzero()[0], X_sparse.nonzero()[1]):
        if tuple([r,c]) not in used:
            used[tuple([r, c])] = 1
            if add_features:
                # Prediction block: 550 columns ending sites_columns_num + 10
                # columns before the end of X_sparse.
                if c >= X_sparse.shape[1] - sites_columns_num - 10 - 550 and \
                    c < X_sparse.shape[1] - sites_columns_num - 10:
                #if c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("prediction") - 10:
                    prediction[r] = " |aprediction {}:{}".format(int(c - sites_columns_num - len(mycolumns)+1), int(X_sparse[r,c]))
                    #prediction[r] = " |prediction:100 {}".format(int(X_sparse[r,c]))
                    continue
                # Each branch below matches the single column of one named
                # feature, located through its position in mycolumns.
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("day_of_week") - 10 - 550:
                    day_of_week[r] = " |bday_of_week {}".format(int(X_sparse[r,c]))
                    #day_of_week[r] = " day_of_week:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("start_hour") - 10 - 550:
                    start_hour[r] = " |chour_start {}".format(int(X_sparse[r,c]))
                    #start_hour[r] = " start_hour:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("daytime") - 10 - 550:
                    daytime[r] = " |dtime_of_day {}".format(int(X_sparse[r,c]))
                    #daytime[r] = " daytime:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("session_timespan") - 10 - 550:
                    session_timespan[r] = " |jsession_timespan time:{}".format(int(X_sparse[r,c]))
                    #session_timespan[r] = " session_timespan:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("#unique_sites") - 10 - 550:
                    unique_sites[r] = " unique_sites:{}".format(int(X_sparse[r,c]))
                    #unique_sites[r] = " unique_sites:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("site_longest_time") - 10 - 550:
                    site_longest_time[r] = " |hsite_longest_time {}:{}".format(int(X_sparse[r,c]), 3)
                    #site_longest_time[r] = " site_longest_time:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("top30_portion") - 10 - 550:
                    top30_portion[r] = " top30:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("bot30_portion") - 10 - 550:
                    bot30_portion[r] = " bot30:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("fb_portion") - 10 - 550:
                    fb_portion[r] = " facebook:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("youtube_portion") - 10 - 550:
                    youtube_portion[r] = " youtube:{}".format(X_sparse[r,c])
                    continue
                # Last 10 columns: emitted once per row as the list of their
                # non-zero values.
                elif c >= X_sparse.shape[1] - 10:
                    if r not in sequence:
                        sequence[r] = " |ksequence " + \
                            ' '.join(filter(lambda a: a != "0", X_sparse[r,-10:].todense().astype(int).astype(str).tolist()[0]))
                    continue
                    
            # Leading sites block: builds "<label> [<importance>] |site id:value ...".
            if c < sites_columns_num: #X_sparse.shape[1] - len(mycolumns): 
                if r in sessions:
                    sessions[r] += " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                else:
                    if y is not None:
                        if int(X_sparse_full[r, -1]) and mark_duplicates: # duplicate row indicator
                            sessions[r] = str(y[r]) + ' 0.3' + ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                        else:
                            if weights is not None:
                                sessions[r] = str(y[r]) + ' ' + str(weights[y[r]-1]) + ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                            else:
                                sessions[r] = str(y[r]) + ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                    else:
                        sessions[r] = ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
            # Site-times block: the sites_columns_num columns just before the
            # trailing 10.
            elif c >= X_sparse.shape[1] - sites_columns_num - 10 and c < X_sparse.shape[1] - 10:
                if r in sitetimes:
                    sitetimes[r] += " {}:{}".format(int(c - sites_columns_num - len(mycolumns) - 550 +1), float(X_sparse[r,c]))
                else:
                    sitetimes[r] = ' |isitetime' + " {}:{}".format(int(c - sites_columns_num - len(mycolumns) - 550 +1), float(X_sparse[r,c]))
        
    
    return {"sites": sessions, "prediction": prediction, "day_of_week": day_of_week, \
                      "start_hour": start_hour, "daytime": daytime, \
                     "unique_site": unique_sites, "top30_portion": top30_portion, \
                    "bot30_portion": bot30_portion, "fb_portion": fb_portion, \
                    "youtube_portion": youtube_portion, "site_longest_time": site_longest_time, \
                    "session_timespan": session_timespan, "sitetimes": sitetimes, "sequence": sequence}



def vw_to_file(sites, out_file, features={}, quiet=True):
    """Write one text line per session to ``out_file``.

    Parameters
    ----------
    sites : dict mapping row key -> base line text. Rows are written in
        sorted key order.
    out_file : path of the file to (over)write.
    features : dict mapping feature name -> {row key: text fragment}. Fragments
        of "youtube_portion", "fb_portion", "top30_portion", "bot30_portion"
        and "unique_sites" are collected into a trailing " |features" group;
        fragments of every other feature are appended directly to the line.
        The default dict is only read, never modified.
    quiet : when false, print the feature names first.
    """
    grouped_names = ["youtube_portion", "fb_portion", "top30_portion", "bot30_portion", \
                     "unique_sites"]

    if not quiet:
        print("Features: {}".format(list(features.keys())))

    # The context manager closes the file even if building a line fails midway.
    with open(out_file, 'w') as vw_writer:
        for r in sorted(sites.keys()):
            line = sites[r]
            gen_features = []
            for fname, feature in features.items():
                if r not in feature:
                    continue
                if fname in grouped_names:
                    gen_features.append(feature[r])
                else:
                    line += feature[r]

            if gen_features:
                line += " |features" + "".join(gen_features)

            vw_writer.write(line + "\n")
    
    
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    """Save predictions as a CSV with a 1-based index column.

    Parameters
    ----------
    predicted_labels : array-like of predictions (list, numpy array or Series).
        Values are taken positionally; any index carried by the input is ignored.
    out_file : destination CSV path.
    target : header of the prediction column.
    index_label : header of the index column.
    """
    # Converting to a plain array lets lists through and stops pandas from
    # aligning a Series on its own index against the new 1..n index.
    labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(labels,
                                index = np.arange(1, labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)



In [78]:
def calc_predictions(train_data, test_data, site_dic, user_dic, min_users, max_users, permutations=False):
    """Derive candidate users for train and test sessions from site/user lookups.

    Stage 1 seeds the result from the non-zero values of each frame's
    "prediction" column. Stage 2 adds, for rows not yet covered, a vote count
    per user for every session value whose ``site_dic`` entry has a size in
    the accepted range. With ``permutations=False`` the function returns here.
    Otherwise stage 3 adds candidates from exact duplicate sessions, and
    stage 4 from ordered pairs and triples of values seen together in training.

    Parameters
    ----------
    train_data, test_data : DataFrames with columns ``site1``..``site10`` and
        "prediction"; ``train_data`` also needs "target".
    site_dic : indexed by site id; each value is sized with ``len`` and turned
        into a set of users.
    user_dic : indexed by ``int(target)``; each value is tested for site membership.
    min_users, max_users : bounds on ``len(site_dic[site])`` for a site to vote.
    permutations : enable stages 3 and 4.

    Returns
    -------
    ``(train_row_users, test_row_users)``: dicts keyed by row index label.
    Values written by stages 1-2 are ``{user: count}`` dicts; values written by
    stages 3-4 are sets of users.
    """
    train_row_users = {}
    test_row_users = {}
    
    sites_cols = ['site' + str(c) for c in range(1,10+1)]
    
    # Add predictions from the dataframe (based on uniquely visited site)
    for r, v in train_data[["prediction"]].iterrows():
        if int(v) != 0:
            train_row_users[r] = {int(v): 1}  
    
    for r, v in test_data[["prediction"]].iterrows():
        if int(v) != 0:
            test_row_users[r] = {int(v): 1}
    
    # Add predictions if a website in session was visited by less than num_users_for_prediction
    for r, row in train_data[sites_cols+["target"]].iterrows():
        if r in train_row_users:
            continue
        session_predictions = {}
        # NOTE(review): row also holds the "target" value, so it is looked up
        # in site_dic as if it were a site id -- confirm this is intended.
        for site in row:
            predictions = set([])
            if site in site_dic and site in user_dic[int(row["target"])] \
                          and len(site_dic[site]) in range(min_users, max_users+1):
                predictions = set(site_dic[site])
            if len(predictions):
                for puser in predictions:
                    if puser in session_predictions:
                        session_predictions[puser] +=1
                    else:
                        session_predictions[puser] = 1
                #session_predictions |= predictions
        if len(session_predictions):
            train_row_users[r] = session_predictions
    
    
    for r, row in test_data[sites_cols].iterrows():
        if r in test_row_users:
            continue
        session_predictions = {}
        for site in row:
            predictions = set([])
            # NOTE(review): the upper bound here is exclusive
            # (range(min_users, max_users)) while the train loop above uses
            # max_users+1; with min_users == max_users no test site can match.
            if site in site_dic and len(site_dic[site]) in range(min_users, max_users):
                predictions = set(site_dic[site])
            if len(predictions):
                for puser in predictions:
                    if puser in session_predictions:
                        session_predictions[puser] +=1
                    else:
                        session_predictions[puser] = 1
                #session_predictions |= predictions
        if len(session_predictions):
            test_row_users[r] = session_predictions
    
    if not permutations:
        return train_row_users, test_row_users
    
    #Identify sessions with identical sites sequence
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)

    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]].index)
    
    # user -> {site sequence: occurrences}; filled below but not read afterwards.
    train_user_dup_rows_dict = {}
    # site sequence -> set of users that produced it in training.
    train_dup_row_users_dict = {}

    #test_dup_rows_dict = {} 

    
    
    # NOTE(review): .ix is legacy pandas indexing -- confirm the pandas version
    # before running this branch.
    for r, row in train_data.ix[train_index_dup][sites_cols+["target"]].iterrows():
        if row["target"] in train_user_dup_rows_dict:
            if tuple(row[sites_cols]) in train_user_dup_rows_dict[row["target"]]:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] += 1
            else:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] = 1 
        else:
            train_user_dup_rows_dict[row["target"]] = {tuple(row[sites_cols]): 1}

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            train_dup_row_users_dict[tuple(row[sites_cols])].add(row["target"])
        else:
            train_dup_row_users_dict[tuple(row[sites_cols])] = set([row["target"]])
    
    # Make predictions based on duplicate sessions
    for r, row in train_data.ix[train_index_dup][sites_cols].iterrows():        
        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            if r in train_row_users:
                pass #don't overwrite predictions from the dataframe
                #train_row_users[r] += train_dup_row_users_dict[tuple(row[sites_cols])]
            else:
                # Stores a set here, unlike the {user: count} dicts stored earlier.
                train_row_users[r] = train_dup_row_users_dict[tuple(row[sites_cols])]
    
    for r, row in test_data.ix[test_index_dup][sites_cols].iterrows():  
        #if tuple(row[sites_cols]) in test_dup_rows_dict:
            #test_dup_rows_dict[tuple(row[sites_cols])] += 1
        #else:
            #test_dup_rows_dict[tuple(row[sites_cols])] = 1

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            if r in test_row_users:
                pass #don't overwrite predictions from the dataframe
                #test_row_users[r] += train_dup_row_users_dict[tuple(row[sites_cols])]
            else:
                test_row_users[r] = train_dup_row_users_dict[tuple(row[sites_cols])]
    

    
    
    # Find users who visited 2, 3, 4 websites
    # site_pairs: ordered tuple of 2 or 3 row values -> set of users.
    site_pairs = {}
    for r, row in train_data[sites_cols+["target"]].iterrows():
        # NOTE(review): keys() must be a list for the del/index below
        # (Python 2). unique_sites only gates the branches: the permutations are
        # taken over Counter(row).keys() again, which still contains 0 and the
        # "target" value.
        unique_sites = Counter(row).keys()
        if 0 in unique_sites:
            del unique_sites[unique_sites.index(0)]
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        #if len(unique_sites) > 3:
            #for subset in itertools.permutations(Counter(row).keys(), 4):
                #if tuple(subset) in site_pairs:
                    #site_pairs[tuple(subset)].add(row["target"])
                #else:
                    #site_pairs[tuple(subset)] = set([row["target"]])
    
    # Add predictions to train data based on 2 visited websites
    for r, row in train_data[sites_cols+["target"]].iterrows():
        if r in train_row_users:
            continue
        unique_sites = Counter(row).keys()
        if 0 in unique_sites:
            del unique_sites[unique_sites.index(0)]
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if tuple(subset) in site_pairs:
                    if r in train_row_users:
                        train_row_users[r] |= site_pairs[subset]
                    else:
                        train_row_users[r] = set(site_pairs[subset])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if tuple(subset) in site_pairs:
                    # NOTE(review): membership is tested on test_row_users but
                    # train_row_users is updated; likely meant train_row_users.
                    if r in test_row_users:
                        train_row_users[r] |= site_pairs[subset]
                    else:
                        train_row_users[r] = set(site_pairs[subset])
        #if len(unique_sites) > 3:
            #for subset in itertools.permutations(Counter(row).keys(), 4):
                #if tuple(subset) in site_pairs:
                    #if r in test_row_users:
                        #train_row_users[r].add(site_pairs[subset])
                    #else:
                        #train_row_users[r] = set(site_pairs[subset])
    
    # Add predictions to test data based on 2 visited websites
    for r, row in test_data[sites_cols].iterrows():
        if r in test_row_users:
            continue
        # Unlike the train loops, 0 is not removed from unique_sites here.
        unique_sites = Counter(row).keys()
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if subset in site_pairs:
                    if r in test_row_users:
                        test_row_users[r] |= site_pairs[subset]
                    else:
                        test_row_users[r] = set(site_pairs[subset])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if subset in site_pairs:
                    if r in test_row_users:
                        test_row_users[r] |= site_pairs[subset]
                    else:
                        test_row_users[r] = set(site_pairs[subset])
        #if len(unique_sites) > 3:
            #for subset in itertools.permutations(Counter(row).keys(), 4):
                #if subset in site_pairs:
                    #if r in test_row_users:
                        #test_row_users[r].add(site_pairs[subset])
                    #else:
                        #test_row_users[r] = set(site_pairs[subset])
        
    
    
    return train_row_users, test_row_users

In [5]:
def predictions_to_vw(predictions):
    """Format per-row candidate users as an " |aprediction" text fragment.

    Parameters
    ----------
    predictions : dict mapping row key -> {user: vote count}.

    Returns
    -------
    dict mapping row key -> fragment such as " |aprediction 12:0.5 40:0.5".
    Only rows with 2, 3 or 4 candidates are included. The candidates tied for
    the highest vote count share the weight: "1" for a single winner, "0.5"
    for two, "0.33" for three, "0.25" for four. Winners keep the order in
    which the row's dict yields them.

    A two-candidate row with unequal counts is resolved like the three- and
    four-candidate cases (the leader gets weight 1). The earlier version
    stopped at the first such row and silently dropped every remaining
    two-candidate row.
    """
    # Weights are spelled out so the emitted text stays exactly as before.
    weight_text = {1: "1", 2: "0.5", 3: "0.33", 4: "0.25"}

    new_pred = {}
    for row, candidates in predictions.items():
        # list() makes the items indexable on Python 3 as well as Python 2.
        pred = list(candidates.items())
        if len(pred) not in (2, 3, 4):
            continue
        top_count = max(count for _user, count in pred)
        winners = [user for user, count in pred if count == top_count]
        weight = weight_text[len(winners)]
        new_pred[row] = " |aprediction " + " ".join(str(user) + ":" + weight for user in winners)

    return new_pred

In [6]:
def create_user_site_dic(train_data, site_freq_pkl):
    """Build user->sites and site->users lookups from the training sessions.

    Parameters
    ----------
    train_data : DataFrame with columns ``site1``..``site10`` and ``target``.
    site_freq_pkl : path to a pickled site-frequency object. It is loaded but
        not used by the current logic (the top-sites filter is disabled).

    Returns
    -------
    ``(user_dic, site_dic)`` where ``user_dic[user][site]`` is the number of
    times ``user`` visited ``site`` and ``site_dic[site]`` is the set of users
    who visited it. Site id 0 is ignored.
    """
    user_dic = {}
    site_dic = {}

    # NOTE(review): pickle.load can execute arbitrary code -- only point this at
    # trusted files. The context manager closes the handle, which was
    # previously left open.
    with open(site_freq_pkl, 'rb') as pkl_file:
        site_freq = pickle.load(pkl_file)

    # Built once instead of once per row.
    site_cols = ['site' + str(n) for n in range(1,11)]

    for _, v in train_data.iterrows():
        visits = user_dic.setdefault(v.target, {})
        for site_col in site_cols:
            site = v[site_col]
            if int(site) != 0:
                visits[site] = visits.get(site, 0) + 1
                site_dic.setdefault(site, set()).add(v.target)

    return user_dic, site_dic

In [7]:
def text_classifier(vectorizer, transformer, classifier):
    """Chain the three given estimators into a Pipeline.

    The steps are named "vectorizer", "transformer" and "classifier", in that order.
    """
    step_names = ("vectorizer", "transformer", "classifier")
    steps = list(zip(step_names, (vectorizer, transformer, classifier)))
    return Pipeline(steps)

# Let's Start

In [95]:
%%time
train_data = pd.read_csv('kaggle_data/full_train_w8.csv')
test_data = pd.read_csv('kaggle_data/full_test.csv')

train_site_sequence = csr_matrix(train_data[['site' + str(c) for c in range(1,10+1)]].as_matrix(), dtype=int)
test_site_sequence = csr_matrix(test_data[['site' + str(c) for c in range(1,10+1)]].as_matrix(), dtype=int)

#test_predictions = calc_predictions(train_data, test_data)

# Additionally, let's calculate the percentage of session time spent by every site in session
site_times = calc_site_times_portions(train_data, test_data)

# Convert site times to sparse format
site_times_sparse = site_times_to_sparse(site_times)
train_site_times_sparse = site_times_sparse[:len(train_data)]
test_site_times_sparse = site_times_sparse[len(train_data):]

CPU times: user 40.4 s, sys: 532 ms, total: 41 s
Wall time: 42 s


In [96]:
%%time
# Build the user -> {site: visit count} and site -> {users} lookups from the training sessions.
user_dic, site_dic = create_user_site_dic(train_data, "kaggle_data/site_freq.pkl")

CPU times: user 1min 9s, sys: 348 ms, total: 1min 9s
Wall time: 1min 8s


In [97]:
%%time
# min_users=1 and max_users=1; permutations keeps its default (False), so only the
# "prediction" column and the per-site user lookups contribute candidates.
train_predictions, test_predictions = calc_predictions(train_data, test_data, \
                                                       site_dic, user_dic, 1, 1)

CPU times: user 22 s, sys: 196 ms, total: 22.2 s
Wall time: 22 s


In [98]:
%%time
train_preds = {}
for k, v in train_predictions.items():
    train_preds[k] = {}
    for user, count in v.items():
        if count not in train_preds[k]:
            train_preds[k][count] = [user]
        else:
            train_preds[k][count].append(user)
    train_preds[k] = train_preds[k][np.max(train_preds[k].keys())]

CPU times: user 332 ms, sys: 4 ms, total: 336 ms
Wall time: 334 ms


In [99]:
%%time
test_preds = {}
for k, v in test_predictions.items():
    test_preds[k] = {}
    for user, count in v.items():
        if count not in test_preds[k]:
            test_preds[k][count] = [user]
        else:
            test_preds[k][count].append(user)
    test_preds[k] = test_preds[k][np.max(test_preds[k].keys())]

CPU times: user 96 ms, sys: 8 ms, total: 104 ms
Wall time: 95.2 ms


In [100]:
# Number of train rows that received at least one candidate user.
len(train_preds)

22621

In [17]:
# Store every unambiguous (single-candidate) prediction in the second-to-last
# column of train_data. Row keys are used as positions, exactly as the former
# set_value(..., takeable=True) call did; set_value is gone from current pandas.
# NOTE(review): column -2 is presumably "prediction" -- confirm against the CSV layout.
single_train_preds = [(k, v[0]) for k, v in train_preds.items() if len(v)==1]
if single_train_preds:
    train_rows, train_values = zip(*single_train_preds)
    train_data.iloc[list(train_rows), -2] = list(train_values)

In [28]:
# Store every unambiguous (single-candidate) prediction in the last column of
# test_data. Row keys are used as positions, exactly as the former
# set_value(..., takeable=True) call did; set_value is gone from current pandas.
# NOTE(review): column -1 is presumably "prediction" -- confirm against the CSV layout.
single_test_preds = [(k, v[0]) for k, v in test_preds.items() if len(v)==1]
if single_test_preds:
    test_rows, test_values = zip(*single_test_preds)
    test_data.iloc[list(test_rows), -1] = list(test_values)

In [29]:
# Number of test rows with exactly one candidate user.
sum(1 for v in test_preds.values() if len(v) == 1)

11754

In [101]:
# Number of train rows whose "prediction" column is positive.
int((train_data.prediction > 0).sum())

22621

In [19]:
# Number of test sessions.
test_data.shape[0]

41177

In [21]:
#train_add_predictions = predictions_to_vw(train_predictions)
#test_add_predictions = predictions_to_vw(test_predictions)

In [102]:
%%time
# Build the tf-idf site matrix, the duplicate-session flags and the VW labels
# for train and test together.
session_length = 10
site_columns = ['site' + str(c) for c in range(1, session_length + 1)]

train_test_df = pd.concat([train_data, test_data])
train_index_full = list(train_data.index)
test_index_full = list(test_data.index)

# Index labels of sessions whose full site sequence occurs more than once
# (keep=False flags every member of a duplicate group, not only the repeats).
train_index_dup = list(train_data[train_data.duplicated(subset=site_columns, keep=False)].index)
test_index_dup = list(test_data[test_data.duplicated(subset=site_columns, keep=False)].index)

# Column vectors (n_rows x 1): 1 for a duplicated session, 0 otherwise.
train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

y = train_data["target"]

# One "document" per session: space-separated "s_" + site-id tokens.
# Missing sites are filled with 0 and then dropped from the document.
train_test_df_sites = train_test_df[site_columns].fillna(0).astype('int')
sites_matrix = train_test_df_sites.values  # converted once, not once per row
train_test_df_sites_array = [" ".join("s_" + str(s) for s in session if int(s) != 0)
                             for session in sites_matrix]

# NOTE: with a callable `analyzer`, scikit-learn ignores `ngram_range`, so the
# vocabulary holds single site tokens only. The regex below relies on that.
tfidf = TfidfVectorizer(analyzer=str.split, max_df=0.95, ngram_range=(1,3)).fit(train_test_df_sites_array)
X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

# Rows keep the concat order: train first, then test.
X_train_sparse = X_train_test_sparse[:len(train_data)]
X_test_sparse = X_train_test_sparse[len(train_data):]

# VW multiclass labels are 1-based, hence the +1 on the encoder index.
class_encoder = LabelEncoder().fit(y.astype('str'))
y_for_vw = class_encoder.transform(y.astype('str')) + 1

sites_columns_num = X_train_test_sparse.shape[1]
# Map tf-idf column index -> original integer site id.
inv_vocabulary = {v: int(re.search(r"s_(\d+)$", k).group(1)) for k, v in tfidf.vocabulary_.items()}

# Uniform per-class weights; 550 is the class count passed to `vw --oaa=550`.
# Disabled alternative from earlier experiments: inverse-frequency weights,
# round(max_class_count / float(class_count), 3) per class.
y_weights = [1] * 550

CPU times: user 4.99 s, sys: 92 ms, total: 5.08 s
Wall time: 5.08 s


In [103]:
def _single_preds_to_sparse(preds, n_rows, encoder, n_classes=550):
    """One-hot encode rows that have exactly one candidate prediction.

    preds     -- mapping row position -> sequence of candidate class labels
    n_rows    -- number of rows of the resulting matrix
    encoder   -- fitted label encoder; its index + 1 is the 1-based class column
    n_classes -- number of classes (columns of the result)

    Returns (matrix, cols): an (n_rows x n_classes) CSR matrix holding a 1 in
    the predicted class column, and the 1-based class column of each encoded
    row. Rows with zero or several candidates stay all-zero.
    """
    single = [(r, str(p[0])) for r, p in preds.items() if len(p) == 1]
    rows = [r for r, _ in single]
    labels = [label for _, label in single]
    # One batched transform instead of one encoder call per row.
    cols = encoder.transform(labels) + 1 if labels else []
    ones = [1] * len(rows)
    # Column 0 is never used by the 1-based labels, so it is sliced away.
    matrix = csr_matrix((ones, (rows, cols)), shape=(n_rows, n_classes + 1))[:, 1:]
    return matrix, cols


train_preds_sparse, train_pred_cols = _single_preds_to_sparse(train_preds, train_data.shape[0], class_encoder)
# Sanity check: the largest class column must not exceed the number of classes.
print(max(train_pred_cols))

test_preds_sparse, _ = _single_preds_to_sparse(test_preds, test_data.shape[0], class_encoder)

550


In [56]:
# Spot check: 1-based VW label (encoder index + 1) assigned to target class "280".
np.array(class_encoder.transform([str(280)])+1)[0]

315

In [104]:
%%time
# Labels of the extra (non-site) feature columns: everything from column
# position 20 onward in test_data.
mycolumns = list(test_data.columns[20:])

train_features, test_features = features_to_sparse(train_data, test_data, mycolumns)

# NOTE(review): this rebinds X_train_sparse / X_test_sparse to the combined
# matrices, so the cell is not idempotent. Re-run the tf-idf cell first if it
# has to be executed again.
X_train_sparse, X_test_sparse = combine_sites_features_sparse(X_train_sparse, train_features,
                                                              X_test_sparse, test_features,
                                                              train_preds_sparse, test_preds_sparse,
                                                              train_duplicates_mask, test_duplicates_mask,
                                                              train_site_times_sparse, test_site_times_sparse,
                                                              train_site_sequence, test_site_sequence)

# Stratified 70/30 hold-out. The fixed seed keeps the split, and therefore the
# validation accuracy, comparable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3,
                                                      stratify=y_for_vw, random_state=42)

# Uniform per-class weights (550 classes). The frequency-based weights that
# used to be computed here were overwritten immediately, so the dead
# computation was removed.
y_train_weights = [1] * 550

CPU times: user 41.7 s, sys: 308 ms, total: 42 s
Wall time: 41.9 s


In [105]:
# Display the shape and sparsity of the training split matrix.
X_train

<76515x48685 sparse matrix of type '<type 'numpy.float64'>'
	with 2451355 stored elements in Compressed Sparse Row format>

In [106]:
# Sanity check: expected total column count of the combined matrix; it should
# equal the column count of X_train.
# NOTE(review): the terms presumably follow the stacking order used by
# combine_sites_features_sparse -- TODO confirm against that function.
sites_columns_num + 20 + 550 + 1 + sites_columns_num + 10

48685

In [107]:
%%time
# Convert each sparse matrix with sparse_matrix_to_vw (defined elsewhere in the
# notebook). Labels are passed for the labelled sets, weights only for the two
# training sets, and the test set gets neither.
# The results are indexed by group name ("sites", "prediction", ...) when they
# are written to disk in a later cell.
train_part_vw = sparse_matrix_to_vw(X_train, sites_columns_num, inv_vocabulary, y_train, weights=y_train_weights)
valid_vw = sparse_matrix_to_vw(X_valid, sites_columns_num, inv_vocabulary, y_valid)
train_vw = sparse_matrix_to_vw(X_train_sparse, sites_columns_num, inv_vocabulary, y_for_vw, weights=y_weights)
test_vw = sparse_matrix_to_vw(X_test_sparse, sites_columns_num, inv_vocabulary)

CPU times: user 6min 35s, sys: 544 ms, total: 6min 36s
Wall time: 6min 36s


# Handler and Folder

- a: prediction
- b: day_of_week 
- c: hour_start
- d: time_of_day
- e:
- f: features
- g: 
- h: site_longest_time
- i: sitetimes
- j: session_timespan
- k: sequence

In [108]:
# Output directory and filename suffix shared by every VW file of this run.
folder = 'kaggle_data/'
handler = '_idf_w8_pred'

# Feature groups exported next to the "sites" group.
keys = ['day_of_week', 'daytime', 'prediction', 'start_hour', 'youtube_portion', 'fb_portion', 'sitetimes', 'sequence']

# Write the four datasets in a fixed order: train_part, valid, train, test.
vw_splits = [('train_part', train_part_vw), ('valid', valid_vw), ('train', train_vw), ('test', test_vw)]
for split_name, split_vw in vw_splits:
    split_features = {key: split_vw[key] for key in keys}
    vw_to_file(split_vw["sites"], folder + split_name + handler + '.vw', features=split_features, quiet=False)

Features: ['youtube_portion', 'sequence', 'sitetimes', 'fb_portion', 'start_hour', 'prediction', 'daytime', 'day_of_week']
Features: ['youtube_portion', 'sequence', 'sitetimes', 'fb_portion', 'start_hour', 'prediction', 'daytime', 'day_of_week']
Features: ['youtube_portion', 'sequence', 'sitetimes', 'fb_portion', 'start_hour', 'prediction', 'daytime', 'day_of_week']
Features: ['youtube_portion', 'sequence', 'sitetimes', 'fb_portion', 'start_hour', 'prediction', 'daytime', 'day_of_week']


In [109]:
def _read_vw_lines(path):
    """Return every line of the file at `path`; the handle is closed even if reading fails."""
    with open(path) as vw_file:
        return vw_file.readlines()


# Load the four VW files written for this folder/handler back into memory.
train_part_file = _read_vw_lines(folder + 'train_part' + handler + '.vw')
train_file = _read_vw_lines(folder + 'train' + handler + '.vw')
valid_file = _read_vw_lines(folder + 'valid' + handler + '.vw')
test_file = _read_vw_lines(folder + 'test' + handler + '.vw')

In [90]:
# 3-fold stratified splitter with shuffling (unseeded, so folds differ per run).
# NOTE(review): in the visible part of the notebook it is referenced only by
# the commented-out cross_val_score code in the Hyperopt section.
skf = StratifiedKFold(n_splits=3, shuffle=True)

In [110]:
%%time
#9
# Train a one-against-all VW model over 550 classes on the train_part split.
# -f saves the model, -b sets the number of weight bits, -c uses a cache file
# for the multiple passes (-k is expected to rebuild it -- see the VW docs),
# -q / --cubic add quadratic / cubic namespace interactions, and --keep limits
# training to namespaces beginning with s, b, c, d and a.
!vw --oaa=550 -d {folder}train_part{handler}.vw \
-f {folder}initial_model{handler}.model -b 27 -c -k \
--passes=5 -l 0.45 --decay_learning_rate=0.9 --l1=1e-11 --l2=1e-11 \
-q "sd" -q "sb" --cubic="sbc" \
--keep "s" --keep "b" --keep "c" --keep "d" --keep "a"

# Score the held-out validation split with the saved model
# (-t: test only, -p: file that receives the predicted labels).
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

# Compare the predicted labels with the 1-based validation labels.
vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using namespaces beginning with: s b c d a 
using l1 regularization = 1e-11
using l2 regularization = 1e-11
final_regressor = kaggle_data/initial_model_idf_w8_pred.model
Num weight bits = 27
learning rate = 0.45
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_part_idf_w8_pred.vw.cache
Reading datafile = kaggle_data/train_part_idf_w8_pred.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      340        1       28
1.000000 1.000000            2            2.0      308      340       20
1.000000 1.000000            4            4.0      462      340       16
1.000000 1.000000            8            8.0      386      308       45
1.000000 1.000000           16           16.0      167       14      

In [32]:
%%time
# Train on the train_part split with a fixed set of tuned-looking values:
# 30 passes, 29 weight bits, squared loss, and the same namespace
# interactions / --keep filter as the other training cells.
# The model is written to the same initial_model{handler}.model path that the
# other training cells use, so each training run overwrites the previous one.
!vw --oaa=550 -d {folder}train_part{handler}.vw \
-f {folder}initial_model{handler}.model -b 29 -c -k \
--passes=30 --decay_learning_rate 0.9 --initial_t 0.002337045080352835 \
-l 0.5416950450219994 \
--power_t 0.5 --loss_function='squared' --l1 1e-11 --l2 1e-11 \
-q "sd" -q "sb" --cubic="sbc"  \
--keep "s" --keep "b" --keep "c" --keep "d" --keep "a" 
#--stage_poly --batch_sz {len(train_part_file)/6} --batch_sz_no_doubling

creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using namespaces beginning with: s b c d a 
using l1 regularization = 1e-11
using l2 regularization = 1e-11
final_regressor = vw/initial_model_idf_w8_1.model
Num weight bits = 29
learning rate = 0.541695
initial_t = 0.00233705
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = vw/train_part_idf_w8_1.vw.cache
Reading datafile = vw/train_part_idf_w8_1.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            2            2.0      349      509       28
1.000000 1.000000            4            4.0      182      550       20
1.000000 1.000000            9            9.0      318      182       24
1.000000 1.000000           18           17.9      527       94       25
1.000000 1.000000           37           36.9      124       58       36
0.986664 0.973638  

average loss = 0.406958 h

In [34]:
%%time
# Predict the validation split with the most recently saved model
# (-t: test only, -p: file that receives the predicted labels).
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

CPU times: user 72 ms, sys: 48 ms, total: 120 ms
Wall time: 3.45 s


In [35]:
# Validation accuracy: predicted labels (one per line, no header) against y_valid.
vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy_score(y_valid, vw_valid_pred.values)

0.56859695666758148

weights1: 0.58872320312261761

valid: 0.56661482633488858 -q "sd" -q "sb" --cubic="sbc"

### Submission

In [52]:
# Shuffle the lines of the full training file in place, so the order VW reads
# them in no longer follows the original row order.
# The shuffle is unseeded, so every run produces a different order.
train_path = folder + 'train' + handler + '.vw'
with open(train_path) as f:
    trainvw = f.readlines()
np.random.shuffle(trainvw)
# Lines keep their own trailing newline, so they are written back unchanged.
with open(train_path, "wb") as f:
    f.writelines(trainvw)

In [53]:
%%time
# Train the submission model on the full training file.
# It is saved to the same initial_model{handler}.model path as the validation
# runs, so it replaces the model trained on train_part.
# NOTE(review): --batch_sz is derived from the line count of train_part_file,
# not of the full training file, and `/` is integer division under Python 2.
# Confirm that this is intended.
!vw --oaa=550 -d {folder}train{handler}.vw \
-f {folder}initial_model{handler}.model -b 29 -c -k \
--passes=30 --decay_learning_rate 0.9 --initial_t 0.002337045080352835 \
-l 0.5416950450219994 \
--power_t 0.5 --loss_function='logistic' --l1 1e-11 --l2 1e-11 \
-q "sd" -q "sb" --cubic="sbc"  \
--keep "s" --keep "b" --keep "c" --keep "d" --keep "a" \
--stage_poly --batch_sz {len(train_part_file)/6} --batch_sz_no_doubling

creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using namespaces beginning with: s b c d a 
using l1 regularization = 1e-11
using l2 regularization = 1e-11
final_regressor = vw/initial_model_idf_w8_1.model
Num weight bits = 29
learning rate = 0.541695
initial_t = 0.00233705
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = vw/train_idf_w8_1.vw.cache
Reading datafile = vw/train_idf_w8_1.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            2            2.0      404      192       16
1.000000 1.000000            4            4.0       14      404       32
1.000000 1.000000            9            9.0      530      404       36
1.000000 1.000000           19           18.9      335      504       34
0.949693 0.901881           39           38.7      109      404       20
0.936493 0.923319           7

average loss = 0.396340 h

In [291]:
%%time
#Prediction on VALID
# NOTE(review): if the saved model is the one trained on the full training
# file, the validation rows were part of its training data (X_valid is split
# from the same matrix that train_vw is built from), so this accuracy is
# optimistically biased.
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

CPU times: user 364 ms, sys: 80 ms, total: 444 ms
Wall time: 20.2 s


In [292]:
# Accuracy of the saved predictions on the validation split (see the caveat on
# the prediction cell above the file read: it may include training leakage).
vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy_score(y_valid, vw_valid_pred.values)

0.8428323117738542

In [54]:
%%time
# Prediction on TEST!
# Runs the saved model in test-only mode (-t) over the unlabelled test file
# and writes one predicted label per line to the -p file.
!vw -i {folder}initial_model{handler}.model -t -d {folder}test{handler}.vw \
-p {folder}vw_test_pred{handler}.csv


creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
only testing
predictions = vw/vw_test_pred_idf_w8_1.csv
Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = vw/test_idf_w8_1.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0  unknown      469       10
1.000000 1.000000            2            2.0  unknown      517       44
1.000000 1.000000            4            4.0  unknown      168       12
1.000000 1.000000            8            8.0  unknown       24       52
1.000000 1.000000           16           16.0  unknown      328       28
1.000000 1.000000           32           32.0  unknown      460       48
1.000000 1.000000           64           64.0  unknown      514       49
1.000000 1.000000          128          128.0  unknown      

In [55]:
# VW labels are 1-based (encoder index + 1, as built for y_for_vw), so 1 is
# subtracted before mapping back to the original target values.
vw_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
vw_subm = class_encoder.inverse_transform(vw_pred-1)

In [56]:
# Write the decoded predictions to the submission CSV via
# write_to_submission_file (defined elsewhere in the notebook).
write_to_submission_file(vw_subm,
                         folder+'28vw_submission'+handler+'.csv')

In [57]:
# Copy the submission file to the Google Cloud Storage bucket (hard-coded path).
!gsutil cp {folder}28vw_submission{handler}.csv gs://smartandnimble/identifyme

Copying file://vw/28vw_submission_idf_w8_1.csv [Content-Type=text/csv]...
/ [1 files][419.8 KiB/419.8 KiB]                                                
Operation completed over 1 objects/419.8 KiB.                                    


Score: 0.57276

# Hyperopt

In [138]:
%%time
def hyperopt_train_test(params):
    """Train a VW model with `params` on train_part and return validation accuracy.

    params -- dict of VW keyword arguments plus a "type" entry that selects the
              multiclass reduction: "ect" uses ect=550, any other value uses
              oaa=550. The dict is NOT modified; a copy is used internally.

    The train_part and valid files for the current `folder` / `handler` are
    re-read on every call. The return value is the accuracy of the fitted
    model's predictions on the valid file, scored against the global `y_valid`.

    Raises KeyError if `params` has no "type" entry.
    """
    with open(folder+'train_part'+handler+'.vw') as f:
        train_part_file = f.readlines()

    with open(folder+'valid'+handler+'.vw') as f:
        valid_file = f.readlines()

    # Work on a copy so the caller's dict still holds "type" after the call.
    vw_params = dict(params)
    clas_type = vw_params.pop("type")

    # The two reductions differ only in the keyword that carries the class count.
    reduction = "ect" if clas_type == "ect" else "oaa"
    vw_params[reduction] = 550

    model = VW(passes=30, b=26, convert_to_vw=False, sort_features=True, **vw_params)
    model.fit(train_part_file)
    return accuracy_score(y_valid, model.predict(valid_file))

# Hyperopt search space for the VW sweep. The keys are passed on as VW keyword
# arguments, except "type", which selects the multiclass reduction.
space4knn = dict(
    # Multiclass reduction.
    type=hp.choice('type', ['oaa', 'ect']),
    # Learning-rate schedule; loguniform(a, b) samples exp(uniform(a, b)).
    l=hp.loguniform('l', -5, 3),
    initial_t=hp.loguniform('initial_t', -10, 1),
    power_t=hp.choice('power_t', [0.5, 1]),
    decay_learning_rate=hp.uniform('decay_learning_rate', 0.001, 1),
    # Regularisation strengths.
    l1=hp.loguniform('l1', -20, -9),
    l2=hp.loguniform('l2', -20, -9),
    # Loss and optimiser switches.
    loss_function=hp.choice('loss_function', ["logistic", "hinge", "squared"]),
    ftrl=hp.choice('ftrl', [True, False]),
    noconstant=hp.choice('noconstant', [True, False]),
    # Namespace interactions.
    q=hp.choice('q', ["sb", "sc", "sd", "si"]),
    cubic=hp.choice('cubic', ['sbc', 'ibc']),
)

def f(params):
    """Hyperopt objective: log the trial, evaluate it, and return the negated accuracy as loss."""
    print("Testing with params:")
    print(params)
    score = hyperopt_train_test(params)
    # Same text as before: label, value, a space, then a blank line.
    print("Accuracy: %s \n" % score)
    outcome = {'status': STATUS_OK, 'loss': -score}
    return outcome

# Run up to 100 TPE evaluations of the objective `f`, keeping the trial history
# in memory.
#trials_wide_range = MongoTrials('mongo://localhost:1234/mydb/jobs', exp_key='exp1')
trials_wide_range = Trials()
best = fmin(
    fn=f,
    space=space4knn,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials_wide_range,
)
print('best:')
print(best)

Testing with params:
{'cubic': 'sbc', 'ftrl': True, 'decay_learning_rate': 0.867574166625138, 'initial_t': 0.2776239270739265, 'l': 0.0434341264970275, 'q': 'si', 'power_t': 1, 'noconstant': True, 'l2': 5.248698547405331e-09, 'loss_function': 'squared', 'l1': 1.541908660931064e-08, 'type': 'ect'}
Accuracy: 0.412161131949 

Testing with params:
{'cubic': 'sbc', 'ftrl': True, 'decay_learning_rate': 0.4380919176573933, 'initial_t': 0.08681977356754463, 'l': 0.2790283913381128, 'q': 'sc', 'power_t': 0.5, 'noconstant': True, 'l2': 1.4656938601889154e-08, 'loss_function': 'hinge', 'l1': 2.5435275529929987e-07, 'type': 'oaa'}
Accuracy: 0.528496935322 

Testing with params:
{'cubic': 'ibc', 'ftrl': False, 'decay_learning_rate': 0.10244638809961329, 'initial_t': 1.0419999810170089, 'l': 0.07462834369874273, 'q': 'si', 'power_t': 1, 'noconstant': False, 'l2': 1.1388346589835574e-06, 'loss_function': 'squared', 'l1': 5.803833976036988e-09, 'type': 'oaa'}
Accuracy: 0.289513005824 

Testing with pa

KeyboardInterrupt: 