In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

import re
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, make_scorer
from vowpalwabbit.sklearn_vw import VWClassifier, VW
import itertools
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse import csr_matrix, hstack
import imblearn
from glob import glob

In [4]:
def sparsematrix(X):
    """Build a sparse per-row count matrix from a dense 2-D array of site ids.

    For every row ``r`` of ``X`` the occurrences of each distinct value are
    counted; value ``s`` is used directly as a column index, so the
    intermediate matrix holds ``count(s in X[r])`` at ``(r, s)``. Column 0 is
    dropped before returning, so id ``s`` ends up in column ``s - 1``.

    Parameters
    ----------
    X : 2-D array-like with ``.shape`` and integer-indexable rows.

    Returns
    -------
    scipy.sparse.csr_matrix of shape ``(X.shape[0], max_id)``.
    """
    row = []
    col = []
    data = []
    for r in range(X.shape[0]):
        for site, num in Counter(X[r]).items():
            row.append(r)
            col.append(site)
            data.append(num)

    # Size the matrix by the largest id, not by the number of distinct ids:
    # with any gap in the ids (e.g. no 0 padding present) the distinct count
    # is smaller than the largest column index and csr_matrix raises.
    n_cols = max(col) + 1 if col else 0

    # Single-argument print() produces the same text on Python 2 and 3.
    print("Sparse Matrix - rows: {} columns: {}".format(X.shape[0], len(set(col))))
    return csr_matrix((data, (row, col)), shape=(X.shape[0], n_cols))[:, 1:]


def sites_to_sparse_tfidf(train_data, test_data, target_col, session_length, label_encoder=False):
    """Vectorise the visited sites of train and test sessions with TF-IDF.

    Every session is turned into a whitespace-separated document of tokens
    ``s_<site id>`` (zeros are skipped); a single ``TfidfVectorizer`` is
    fitted on train and test together and applied to both.

    Parameters
    ----------
    train_data, test_data : DataFrames with columns ``site1`` .. ``site10``.
    target_col : name of the label column in ``train_data``.
    session_length : number of leading site columns used to detect duplicated
        sessions. NOTE(review): the TF-IDF documents always use ``site1`` ..
        ``site10`` regardless of this value -- confirm that is intended.
    label_encoder : when true, also fit a ``LabelEncoder`` on the stringified
        labels and return 1-based encoded labels.

    Returns
    -------
    list
        ``[X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num,
        class_encoder, tfidf, train_duplicates_mask, test_duplicates_mask]``.
        The masks are ``(n_rows, 1)`` integer arrays with 1 for sessions whose
        first ``session_length`` sites occur more than once in the same frame.
    """
    train_test_df = pd.concat([train_data, test_data])

    # Duplicate detection only needs the site columns; the mask is still
    # computed through index labels, exactly as before.
    dup_cols = ['site' + str(c) for c in range(1, session_length + 1)]
    train_dup_labels = train_data.index[train_data.duplicated(subset=dup_cols, keep=False).values]
    test_dup_labels = test_data.index[test_data.duplicated(subset=dup_cols, keep=False).values]
    train_duplicates_mask = np.transpose([np.in1d(list(train_data.index), list(train_dup_labels)).astype(int)])
    test_duplicates_mask = np.transpose([np.in1d(list(test_data.index), list(test_dup_labels)).astype(int)])

    y = train_data[target_col]

    site_cols = ['site' + str(c) for c in range(1, 10 + 1)]
    # `.values` replaces the removed `.as_matrix()` and is materialised once
    # instead of once per row.
    site_values = train_test_df[site_cols].fillna(0).astype('int').values
    train_test_df_sites_array = [" ".join("s_" + str(s) for s in session if int(s) != 0)
                                 for session in site_values]

    tfidf = TfidfVectorizer(max_df=0.9).fit(train_test_df_sites_array)
    X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

    X_train_sparse = X_train_test_sparse[:len(train_data)]
    X_test_sparse = X_train_test_sparse[len(train_data):]

    sites_columns_num = X_train_test_sparse.shape[1]

    y_for_vw = None
    class_encoder = None
    if label_encoder:
        class_encoder = LabelEncoder().fit(y.astype('str'))
        # +1 shifts the encoded classes to start at 1 instead of 0.
        y_for_vw = class_encoder.transform(y.astype('str')) + 1

    return [X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num, class_encoder, tfidf,
            train_duplicates_mask, test_duplicates_mask]


def features_to_sparse(train_data, test_data, feature_cols):
    """Pack the selected feature columns of both frames into sparse matrices.

    Column ``j`` of each result holds ``feature_cols[j]`` cast to float; the
    ``day_of_week`` and ``daytime`` columns are shifted by +1.

    Returns a two-element list ``[train_matrix, test_matrix]`` of
    ``csr_matrix`` objects with shape ``(n_rows, len(feature_cols))``.
    """
    shifted_labels = ["day_of_week", "daytime"]
    matrices = []
    for frame in (train_data, test_data):
        n_rows = frame.shape[0]
        values = []
        col_idx = []
        n_cols = 0
        for label in feature_cols:
            column = frame[[label]].values.T[0].astype('float')
            if label in shifted_labels:
                column = column + 1
            values.extend(column)
            col_idx.extend([n_cols] * len(column))
            n_cols += 1
        # Column-major layout: the row indices 0..n_rows-1 repeat once per column.
        row_idx = list(range(n_rows)) * n_cols
        matrices.append(csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols), dtype=float))
    return matrices


def calc_site_times_portions(train_data, test_data):
    """Compute, per session, the share of the session time spent on each site.

    For every row the ``time_diff<c>`` values are summed per site over
    positions 1..9. A session with a single distinct site gets portion
    ``1.0``; otherwise each site with a positive total gets
    ``round(total / session_timespan, 5)`` (sites with a zero total are
    omitted).

    NOTE(review): position 10 is not processed (the loop covers 1..9 as in
    the original) -- presumably because the last visit has no following
    time difference; confirm.

    Returns
    -------
    list of two dicts ``[train, test]`` mapping index label ->
    ``{site id (int): portion}``.
    """
    positions = range(1, 10)
    site_cols = ['site' + str(c) for c in positions]
    time_cols = ['time_diff' + str(c) for c in positions]

    site_times = [{}, {}]
    for count, data in enumerate([train_data, test_data]):
        # Select by name: the former `data[:][range(0, 10)+range(20,30)]`
        # needs Python 2 list ranges and the positional fallback of
        # DataFrame.__getitem__, neither of which exists any more.
        wanted = site_cols + time_cols
        if "session_timespan" in data.columns:
            wanted = wanted + ["session_timespan"]
        for r, row in data[wanted].iterrows():
            rowdic = {}
            for s, t in zip(site_cols, time_cols):
                if row[s] == 0:
                    continue
                site = int(row[s])
                if site in rowdic:
                    rowdic[site] += row[t]
                else:
                    rowdic[site] = row[t]

            portions = {}
            for site, time in rowdic.items():
                if len(rowdic) == 1:
                    portions[site] = 1.0
                    continue
                if time > 0:
                    portions[site] = round(float(time) / row["session_timespan"], 5)
            site_times[count][r] = portions
    return site_times

def calc_site_user_relevance(train_data, test_data, site_dict):
    """Score each site of a session by how few users are known to visit it.

    A site found in ``site_dict`` scores ``round(1 / len(site_dict[site]), 5)``;
    a site missing from ``site_dict`` scores 0. All scores of a session are
    then divided by ``(number of missing sites + 1)``. Zero site ids are
    skipped.

    Parameters
    ----------
    train_data, test_data : DataFrames with columns ``site1`` .. ``site10``.
    site_dict : mapping site id -> sized collection of users.

    Returns
    -------
    list of two dicts ``[train, test]`` mapping index label ->
    ``{site id (int): score}`` (empty dict for sessions without sites).
    """
    site_cols = ['site' + str(c) for c in range(1, 11)]

    site_rels = [{}, {}]
    for count, data in enumerate([train_data, test_data]):
        # Select the site columns by name; the former `data[:][range(0, 10)]`
        # depended on Python 2 ranges and positional column lookup.
        for r, row in data[site_cols].iterrows():
            rowdic = {}
            notinsitedict = 0
            for s in site_cols:
                if row[s] == 0:
                    continue
                site = int(row[s])
                if site in rowdic:
                    continue
                if site in site_dict:
                    rowdic[site] = round(1. / len(site_dict[site]), 5)
                else:
                    notinsitedict += 1
                    rowdic[site] = 0
            # Build a fresh dict instead of updating `rowdic` from its own items.
            penalty = float(notinsitedict + 1)
            site_rels[count][r] = dict((site, rel / penalty) for site, rel in rowdic.items())
    return site_rels

def site_rels_to_sparse(siterels):
    """Stack per-session ``{site: score}`` dicts into one sparse matrix.

    ``siterels`` is a sequence of dicts (index label -> ``{site: score}``).
    Sessions are laid out set after set, each set ordered by index label, one
    matrix row per session. Site ids are used as column indices and column 0
    is dropped before returning.

    Every session occupies a row, including sessions with no scores: the
    previous version skipped those without advancing the row counter, which
    shifted all following sessions up and left the matrix misaligned with the
    other per-session matrices.

    Returns
    -------
    scipy.sparse.csr_matrix of shape ``(total sessions, max site id)``.
    """
    row = []
    col = []
    data = []
    rowcount = 0
    for siterel in siterels:
        for r in sorted(siterel):
            for site, p in siterel[r].items():
                col.append(site)
                row.append(rowcount)
                data.append(p)
            rowcount += 1

    # Keep at least the (dropped) column 0 so empty input does not hit max([]).
    n_cols = max(col) + 1 if col else 1
    return csr_matrix((data, (row, col)), shape=(rowcount, n_cols), dtype=float)[:, 1:]
    
    
def site_times_to_sparse(sitetimes):
    """Stack per-session ``{site: portion}`` dicts into one sparse matrix.

    Sessions are laid out set after set, each set ordered by its keys, one
    row per session. Site ids are the column indices; column 0 is dropped
    from the returned ``csr_matrix``. The row count is the combined size of
    the first two sets.
    """
    ordered_sessions = [per_set[key] for per_set in sitetimes for key in sorted(per_set)]

    row_idx = []
    col_idx = []
    portions = []
    for row_no, session_sites in enumerate(ordered_sessions):
        for site_id, portion in session_sites.items():
            row_idx.append(row_no)
            col_idx.append(site_id)
            portions.append(portion)

    n_rows = len(sitetimes[0]) + len(sitetimes[1])
    n_cols = max(col_idx) + 1
    full_matrix = csr_matrix((portions, (row_idx, col_idx)), shape=(n_rows, n_cols), dtype=float)
    return full_matrix[:, 1:]


def combine_sites_features_sparse(sites_train_sparse, sites_seq_train_sparse, features_train_sparse, \
                                  sites_test_sparse, sites_seq_test_sparse, features_test_sparse, test_preds_sparse=None,\
                                  train_site_times_sparse = None, test_site_times_sparse = None, \
                                  train_site_rels_sparse = None, test_site_rels_sparse = None, \
                                train_sites_sequence=None, test_sites_sequence=None, train_preds_sparse=None):
    """Horizontally stack the per-session feature matrices for train and test.

    When both ``*_site_times_sparse`` matrices are given, the result is
    ``[sites | features | preds | site_times | site_rels]`` (the sequence
    matrices are not used in this mode). Otherwise it is
    ``[sites | sites_seq | features]``. Blocks that are ``None`` are left out.

    ``train_preds_sparse`` is a new trailing keyword: the function used to
    read that name without having it as a parameter. For backward
    compatibility, when it is not passed the function falls back to a
    module-level variable of the same name, if one exists.

    ``train_sites_sequence`` and ``test_sites_sequence`` are accepted but not
    used.

    Returns
    -------
    list ``[X_train_sparse, X_test_sparse]`` of float ``csr_matrix`` objects.
    """
    if train_site_times_sparse is not None and test_site_times_sparse is not None:
        if train_preds_sparse is None:
            # Legacy behaviour: earlier versions picked this up from the
            # notebook's global namespace.
            train_preds_sparse = globals().get("train_preds_sparse")
        train_blocks = [sites_train_sparse, features_train_sparse, train_preds_sparse,
                        train_site_times_sparse, train_site_rels_sparse]
        test_blocks = [sites_test_sparse, features_test_sparse, test_preds_sparse,
                       test_site_times_sparse, test_site_rels_sparse]
    else:
        train_blocks = [sites_train_sparse, sites_seq_train_sparse, features_train_sparse]
        test_blocks = [sites_test_sparse, sites_seq_test_sparse, features_test_sparse]

    # Drop absent blocks explicitly rather than relying on hstack to skip None.
    X_train_sparse = hstack([b for b in train_blocks if b is not None], dtype=float).tocsr()
    X_test_sparse = hstack([b for b in test_blocks if b is not None], dtype=float).tocsr()
    return [X_train_sparse, X_test_sparse]


def sparse_matrix_to_vw(X_sparse, sites_columns_num, sites_seq_columns_num, y=None, weights=None, mark_duplicates=False, mycolumns=[]):
    """Convert the non-zero entries of a combined feature matrix to VW text fragments.

    Expected column layout of ``X_sparse``:

    * columns ``[0, sites_columns_num)``: site features, emitted in the
      ``|site`` namespace as ``<col+1>:<value>``;
    * the next ``sites_seq_columns_num`` columns: sequence features, emitted
      in the ``|zsequence`` namespace as ``<col+1>:<value>``;
    * the last ``len(mycolumns)`` columns: named scalar features, in the
      order given by ``mycolumns``.

    Recognised names in ``mycolumns`` are ``prediction``, ``day_of_week``,
    ``start_hour``, ``daytime``, ``session_timespan``, ``#unique_sites``,
    ``site_longest_time``, ``top30_portion``, ``bot30_portion``,
    ``fb_portion`` and ``youtube_portion``. Names that are absent are simply
    not emitted (the previous version raised ``ValueError`` from
    ``list.index`` unless all of them were present); unrecognised names are
    ignored.

    Parameters
    ----------
    X_sparse : scipy sparse matrix supporting ``nonzero()`` and ``[r, c]``.
    y : optional labels indexed by row position; stored as strings in
        ``"lables"`` for rows that have at least one site feature.
    weights : optional sequence; the weight of a row is ``weights[y[r] - 1]``.
    mark_duplicates : accepted for compatibility, not used.
    mycolumns : names of the trailing scalar-feature columns.

    Returns
    -------
    dict mapping fragment name -> ``{row: text}``. The ``"sitetimes"`` and
    ``"siterels"`` entries are always empty; ``"#unique_sites"`` is returned
    under the key ``"unique_site"``.
    """
    # (name in `mycolumns`, key in the returned dict, formatter of the cell value)
    feature_specs = [
        ("prediction", "prediction", lambda v: " |aprediction {}:{}".format(int(v), 100)),
        ("day_of_week", "day_of_week", lambda v: " |bday_of_week {}".format(int(v))),
        ("start_hour", "start_hour", lambda v: " |chour_start {}".format(int(v))),
        ("daytime", "daytime", lambda v: " |dtime_of_day {}".format(int(v))),
        ("session_timespan", "session_timespan", lambda v: " |jsession_timespan time:{}".format(int(v))),
        ("#unique_sites", "unique_site", lambda v: " unique_sites:{}".format(int(v))),
        ("site_longest_time", "site_longest_time", lambda v: " |hsite_longest_time {}:{}".format(int(v), 3)),
        ("top30_portion", "top30_portion", lambda v: " top30:{}".format(v)),
        ("bot30_portion", "bot30_portion", lambda v: " bot30:{}".format(v)),
        ("fb_portion", "fb_portion", lambda v: " facebook:{}".format(v)),
        ("youtube_portion", "youtube_portion", lambda v: " youtube:{}".format(v)),
    ]

    result = {"sites": {}, "lables": {}, "lable_weights": {}, "sitetimes": {}, "siterels": {}, "sequence": {}}
    for _, key, _ in feature_specs:
        result[key] = {}

    # Map absolute column index -> (target dict, formatter) for the names present.
    column_names = list(mycolumns)
    first_feature_col = X_sparse.shape[1] - len(column_names)
    feature_at = {}
    for name, key, formatter in feature_specs:
        if name in column_names:
            feature_at[first_feature_col + column_names.index(name)] = (result[key], formatter)

    sessions = result["sites"]
    sequence = result["sequence"]
    lables = result["lables"]
    lable_weights = result["lable_weights"]

    rows, cols = X_sparse.nonzero()
    seen = set()
    for r, c in zip(rows, cols):
        if (r, c) in seen:
            continue
        seen.add((r, c))
        value = X_sparse[r, c]

        # Named scalar features take precedence over the positional ranges.
        if c in feature_at:
            store, formatter = feature_at[c]
            store[r] = formatter(value)
            continue

        if c < sites_columns_num:
            if r not in sessions:
                sessions[r] = ' |site'
                # Label and weight are recorded once, with the first site feature.
                if y is not None:
                    lables[r] = str(y[r])
                    if weights is not None:
                        lable_weights[r] = str(weights[y[r]-1])
            sessions[r] += " {}:{}".format(int(c+1), value)
        elif c < sites_columns_num + sites_seq_columns_num:
            if r not in sequence:
                sequence[r] = ' |zsequence'
            sequence[r] += " {}:{}".format(int(c+1), value)

    return result



def vw_to_file(sites, out_file, features={}, lables={}, lable_weights={},  quiet=True):
    """Write one Vowpal Wabbit example per session to ``out_file``.

    Each line is assembled as
    ``<label> <weight><sites fragment><namespaced features> |features<generic features>``
    where label, weight and every feature are included only when present for
    that row. Rows are written in sorted order of the keys of ``sites``.

    Parameters
    ----------
    sites : dict row -> text fragment (already carrying its namespace).
    out_file : path of the file to (over)write.
    features : dict feature name -> ``{row: text fragment}``. Fragments of
        the portion / unique-sites features have no namespace of their own
        and are collected under a shared `` |features`` namespace.
    lables, lable_weights : dict row -> text.
    quiet : when false, print the feature names first.
    """
    # "unique_site" is the key under which sparse_matrix_to_vw returns that
    # feature; only "unique_sites" was listed before, so the fragment was
    # appended outside the `|features` namespace. Both spellings are accepted.
    generic_names = ("youtube_portion", "fb_portion", "top30_portion", "bot30_portion",
                     "unique_sites", "unique_site")

    if not quiet:
        # Single-argument print() produces the same text on Python 2 and 3.
        print("Features: {}".format(list(features.keys())))

    # `with` closes the file even when assembling a line raises.
    with open(out_file, 'w') as vw_writer:
        for r in sorted(sites.keys()):
            line = lables[r] if r in lables else ""
            if r in lable_weights:
                line += " {}".format(lable_weights[r])
            line += sites[r]

            gen_features = []
            for fname, feature in features.items():
                if r not in feature:
                    continue
                if fname in generic_names:
                    gen_features.append(feature[r])
                else:
                    line += feature[r]

            if gen_features:
                line += " |features" + "".join(gen_features)

            vw_writer.write(line + "\n")
    
    
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    """Save predictions as a CSV submission file.

    The rows are numbered 1..N (written under the ``index_label`` header) and
    the predictions go into a single column named ``target``.
    """
    n_sessions = predicted_labels.shape[0]
    session_ids = np.arange(1, n_sessions + 1)
    submission = pd.DataFrame(predicted_labels, index=session_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)



In [5]:
def calc_predictions(train_data, test_data):
    """Collect candidate users for test sessions from three heuristics.

    1. A non-zero value in the ``prediction`` column of ``test_data``.
    2. Test sessions (duplicated within the test set) whose ten-site tuple
       also occurs among the duplicated train sessions.
    3. Test sessions sharing a 2-, 3- or 4-permutation of values with a
       train session.

    An entry already present in ``test_row_users`` is never overwritten.

    Returns ``(test_row_users, site_pairs)`` where ``test_row_users`` maps a
    test index label to a list or set of candidate users and ``site_pairs``
    maps value tuples to the set of train targets seen with them.

    NOTE(review): this definition is shadowed by the later
    ``calc_predictions(train_data, test_data, site_dic, user_dic, ...)``
    defined in the next cell, so on a top-to-bottom run this version is
    never the one that gets called.
    NOTE(review): the ``del unique_sites[unique_sites.index(0)]`` lines need
    ``dict.keys()`` to return a list, i.e. Python 2.
    """
    test_row_users = {}
    # NOTE(review): train_row_users is never filled or returned here.
    train_row_users = {}
    
    # Add predictions from the dataframe (based on uniquely visited site)
    for r, v in test_data[["prediction"]].iterrows():
        if int(v) != 0:
            test_row_users[r] = [int(v)]
    
    
    #Identify sessions with identical sites sequence
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)

    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]].index)
    
    # target -> {site tuple: count}; built below but not read afterwards
    train_user_dup_rows_dict = {}
    # site tuple -> set of targets that produced this exact session
    train_dup_row_users_dict = {}

    # site tuple -> count among duplicated test sessions; not read afterwards
    test_dup_rows_dict = {}
    

    sites_cols = ['site' + str(c) for c in range(1,10+1)]
    
    # NOTE(review): train_index_dup holds index *labels* but is passed to the
    # positional .iloc -- confirm labels and positions coincide.
    for r, row in train_data.iloc[train_index_dup][sites_cols+["target"]].iterrows():
        if row["target"] in train_user_dup_rows_dict:
            if tuple(row[sites_cols]) in train_user_dup_rows_dict[row["target"]]:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] += 1
            else:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] = 1 
        else:
            train_user_dup_rows_dict[row["target"]] = {tuple(row[sites_cols]): 1}

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            train_dup_row_users_dict[tuple(row[sites_cols])].add(row["target"])
        else:
            train_dup_row_users_dict[tuple(row[sites_cols])] = set([row["target"]])

    for r, row in test_data.iloc[test_index_dup][sites_cols].iterrows():  
        if tuple(row[sites_cols]) in test_dup_rows_dict:
            test_dup_rows_dict[tuple(row[sites_cols])] += 1
        else:
            test_dup_rows_dict[tuple(row[sites_cols])] = 1

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            if r in test_row_users:
                pass #don't overwright predictions from the dataframe
                #test_row_users[r] += train_dup_row_users_dict[tuple(row[sites_cols])]
            else:
                # stores the shared set object itself, not a copy
                test_row_users[r] = train_dup_row_users_dict[tuple(row[sites_cols])]
        
    # Find users who visited 2 websites
    # (the code below also records 3- and 4-permutations in the same dict)
    site_pairs = {}
    for r, row in train_data[sites_cols+["target"]].iterrows():
        unique_sites = Counter(row).keys()
        if 0 in unique_sites:
            del unique_sites[unique_sites.index(0)]
        # NOTE(review): `row` includes the "target" column, and the
        # permutations are taken from Counter(row).keys() rather than from
        # `unique_sites`, so the tuples can contain 0 and the target value.
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        if len(unique_sites) > 3:
            for subset in itertools.permutations(Counter(row).keys(), 4):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
    
    
    # Add predictions to test data based on 2 visited websites
    # (3- and 4-permutations are looked up as well; 0 is not removed from
    # unique_sites in this loop, unlike the train loop above)
    for r, row in test_data[sites_cols].iterrows():
        unique_sites = Counter(row).keys()
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if subset in site_pairs:
                    if r in test_row_users:
                        pass
                    else:
                        test_row_users[r] = list(site_pairs[subset])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if subset in site_pairs:
                    if r in test_row_users:
                        pass
                    else:
                        test_row_users[r] = list(site_pairs[subset])
        if len(unique_sites) > 3:
            for subset in itertools.permutations(Counter(row).keys(), 4):
                if subset in site_pairs:
                    if r in test_row_users:
                        pass
                    else:
                        test_row_users[r] = list(site_pairs[subset])
        
    
    
    return test_row_users, site_pairs

In [6]:
def _session_site_rows(frame, sites_cols):
    """Yield ``(index label, tuple of site ids)`` for every session of ``frame``."""
    site_values = frame[sites_cols].values
    for position, label in enumerate(frame.index):
        yield label, tuple(site_values[position])


def _rare_site_votes(sites, site_dic, min_users, max_users, visited=None):
    """Count, per user, the visits in ``sites`` to sites owned by min..max users."""
    votes = {}
    for site in sites:
        if site not in site_dic:
            continue
        if visited is not None and site not in visited:
            continue
        owners = site_dic[site]
        if not (min_users <= len(owners) <= max_users):
            continue
        for user in set(owners):
            votes[user] = votes.get(user, 0) + 1
    return votes


def _site_ngrams(sites, sizes=(2, 3)):
    """Yield sorted tuples of distinct non-zero sites of the given sizes."""
    distinct = sorted(set(site for site in sites if site != 0))
    for size in sizes:
        for combo in itertools.combinations(distinct, size):
            yield combo


def calc_predictions(train_data, test_data, site_dic, user_dic, min_users, max_users, permutations=False):
    """Collect candidate users for every train and test session.

    Heuristics, applied in order; a session that already has candidates is
    never overwritten by a later heuristic:

    1. A non-zero ``prediction`` column value -> ``{user: 1}``.
    2. Rare sites: every visit to a site that ``site_dic`` attributes to
       between ``min_users`` and ``max_users`` users (both inclusive) adds one
       vote for each of those users -> ``{user: votes}``. For train sessions
       the site must also be in ``user_dic`` of the session's target.
    3. (``permutations=True`` only) Duplicate sessions: a session repeated
       within its own frame gets the set of train users who produced the same
       ten-site tuple among the duplicated train sessions.
    4. (``permutations=True`` only) Shared site pairs/triples: the union of
       train users whose sessions contain the same pair or triple of sites.

    Fixes relative to the previous version:

    * the train triple branch tested ``test_row_users`` before updating
      ``train_row_users`` (KeyError or silent overwrite);
    * the test rare-site range excluded ``max_users`` while the train range
      included it;
    * candidates were drawn from ``Counter(row).keys()``, which included the
      0 padding and, for train rows, the target value itself;
    * ``.ix`` and list operations on ``dict.keys()`` are replaced by code
      that runs on current pandas and on Python 2 and 3.

    Parameters
    ----------
    train_data : DataFrame with ``site1`` .. ``site10``, ``target`` and
        ``prediction`` columns.
    test_data : DataFrame with ``site1`` .. ``site10`` and ``prediction``.
    site_dic : dict site -> collection of users.
    user_dic : dict user -> container of visited sites.

    Returns
    -------
    ``(train_row_users, test_row_users)``: dicts keyed by index label whose
    values are ``{user: count}`` dicts (heuristics 1-2) or sets of users
    (heuristics 3-4).
    """
    sites_cols = ['site' + str(c) for c in range(1,10+1)]

    train_sessions = list(_session_site_rows(train_data, sites_cols))
    test_sessions = list(_session_site_rows(test_data, sites_cols))
    train_targets = list(train_data["target"].values)

    # 1. Predictions already present in the dataframe.
    train_row_users = dict((r, {int(v): 1})
                           for r, v in zip(train_data.index, train_data["prediction"].values)
                           if int(v) != 0)
    test_row_users = dict((r, {int(v): 1})
                          for r, v in zip(test_data.index, test_data["prediction"].values)
                          if int(v) != 0)

    # 2. Votes from sites visited by only a few users.
    for (r, sites), target in zip(train_sessions, train_targets):
        if r in train_row_users:
            continue
        votes = _rare_site_votes(sites, site_dic, min_users, max_users,
                                 visited=user_dic.get(int(target), {}))
        if votes:
            train_row_users[r] = votes

    for r, sites in test_sessions:
        if r in test_row_users:
            continue
        votes = _rare_site_votes(sites, site_dic, min_users, max_users)
        if votes:
            test_row_users[r] = votes

    if not permutations:
        return train_row_users, test_row_users

    # 3. Sessions with an identical site sequence.
    train_is_dup = train_data.duplicated(subset=sites_cols, keep=False).values
    test_is_dup = test_data.duplicated(subset=sites_cols, keep=False).values

    dup_session_users = {}
    for is_dup, (r, sites), target in zip(train_is_dup, train_sessions, train_targets):
        if is_dup:
            dup_session_users.setdefault(sites, set()).add(target)

    for is_dup, (r, sites) in zip(train_is_dup, train_sessions):
        if is_dup and r not in train_row_users:
            # Copy so that rows never share one mutable set.
            train_row_users[r] = set(dup_session_users[sites])

    for is_dup, (r, sites) in zip(test_is_dup, test_sessions):
        if is_dup and sites in dup_session_users and r not in test_row_users:
            test_row_users[r] = set(dup_session_users[sites])

    # 4. Users who visited the same 2 or 3 websites within one session.
    # Sorted combinations are used instead of all permutations: membership is
    # order-independent, so one canonical tuple per site set is enough.
    # NOTE(review): under union semantics a triple can never contribute a
    # user that its pairs do not; size 3 is kept for parity with the original.
    ngram_users = {}
    for (r, sites), target in zip(train_sessions, train_targets):
        for ngram in _site_ngrams(sites):
            ngram_users.setdefault(ngram, set()).add(target)

    for sessions, row_users in ((train_sessions, train_row_users), (test_sessions, test_row_users)):
        for r, sites in sessions:
            if r in row_users:
                continue
            matched = set()
            for ngram in _site_ngrams(sites):
                if ngram in ngram_users:
                    matched |= ngram_users[ngram]
            if matched:
                row_users[r] = matched

    return train_row_users, test_row_users

In [7]:
def create_user_site_dic(train_data, site_freq_pkl):
    """Build user->sites and site->users lookup tables from the train sessions.

    Parameters
    ----------
    train_data : DataFrame with ``site1`` .. ``site10`` and ``target`` columns.
    site_freq_pkl : kept for backward compatibility and ignored. The previous
        version opened and unpickled this file but never used its content
        (and never closed the handle); dropping the load removes the leak
        and an unnecessary ``pickle.load``.

    Returns
    -------
    (user_dic, site_dic)
        ``user_dic[user][site]`` is the number of times ``user`` visited
        ``site``; ``site_dic[site]`` is the set of users who visited it.
        Zero site ids are skipped. Every target gets an entry in
        ``user_dic``, even when all of its sites are zero.
    """
    user_dic = {}
    site_dic = {}
    site_cols = ['site' + str(c) for c in range(1, 11)]

    for _, session in train_data.iterrows():
        user = session.target
        visits = user_dic.setdefault(user, {})
        for col in site_cols:
            site = session[col]
            if int(site) == 0:
                continue
            visits[site] = visits.get(site, 0) + 1
            site_dic.setdefault(site, set()).add(user)

    return user_dic, site_dic

In [8]:
def text_classifier(vectorizer, transformer, classifier):
    """Chain the three estimators into one sklearn Pipeline.

    The steps are named "vectorizer", "transformer" and "classifier", in
    that order, so their parameters can be addressed through those prefixes.
    """
    step_names = ("vectorizer", "transformer", "classifier")
    step_objects = (vectorizer, transformer, classifier)
    return Pipeline(list(zip(step_names, step_objects)))


# Let's Start

In [22]:
%%time
#1
# Load both session tables indexed by session_id. Every second column from
# position 2 through 20 is parsed as a datetime (presumably the per-site
# timestamp columns -- confirm against the CSV header). Missing values are
# replaced by 0, which later cells treat as "no site".
train_data = pd.read_csv('train_sessions.csv', index_col="session_id", parse_dates=range(2,22,2), infer_datetime_format=True).fillna(0)
test_data = pd.read_csv('test_sessions.csv', index_col="session_id", parse_dates=range(2,22,2), infer_datetime_format=True).fillna(0)

CPU times: user 1.61 s, sys: 180 ms, total: 1.79 s
Wall time: 1.79 s


In [10]:
def sdh(row, n):
    """Return the "<site>z<day_of_week>z<start_hour>" token for site slot n.

    All three parts are cast to int before formatting. An empty string is
    returned when the slot holds site 0.
    """
    site_id = int(row["site" + str(n)])
    if site_id == 0:
        return ""
    parts = [site_id, int(row["day_of_week"]), int(row["start_hour"])]
    return "z".join(str(part) for part in parts)
def sd(row, n):
    """Return the "<site>z<day_of_week>" token for site slot n.

    Both parts are cast to int before formatting. An empty string is
    returned when the slot holds site 0.
    """
    site_id = int(row["site" + str(n)])
    if site_id == 0:
        return ""
    return "z".join([str(site_id), str(int(row["day_of_week"]))])
def sh(row, n):
    """Return the "<site>z<start_hour>" token for site slot n.

    Both parts are cast to int before formatting. An empty string is
    returned when the slot holds site 0.
    """
    site_id = int(row["site" + str(n)])
    if site_id == 0:
        return ""
    return "z".join([str(site_id), str(int(row["start_hour"]))])

In [11]:
%%time
# Add columns sdh1..sdh10 to train_data: site/day-of-week/start-hour tokens,
# computed row by row (axis=1) with sdh().
for n in range(1, 11):
    train_data["sdh"+str(n)] = train_data.apply(lambda x: sdh(x, n), axis=1)

CPU times: user 59.8 s, sys: 700 ms, total: 1min
Wall time: 1min


In [12]:
%%time
# Add columns sd1..sd10 to train_data: site/day-of-week tokens,
# computed row by row (axis=1) with sd().
for n in range(1, 11):
    train_data["sd"+str(n)] = train_data.apply(lambda x: sd(x, n), axis=1)

CPU times: user 37.3 s, sys: 348 ms, total: 37.6 s
Wall time: 37.6 s


In [13]:
%%time
# Add columns sh1..sh10 to train_data: site/start-hour tokens,
# computed row by row (axis=1) with sh().
for n in range(1, 11):
    train_data["sh"+str(n)] = train_data.apply(lambda x: sh(x, n), axis=1)

CPU times: user 50.6 s, sys: 592 ms, total: 51.2 s
Wall time: 51.1 s


In [14]:
%%time
# Add columns sdh1..sdh10 to test_data: site/day-of-week/start-hour tokens,
# computed row by row (axis=1) with sdh().
for n in range(1, 11):
    test_data["sdh"+str(n)] = test_data.apply(lambda x: sdh(x, n), axis=1)

CPU times: user 23.4 s, sys: 60 ms, total: 23.5 s
Wall time: 23.4 s


In [15]:
%%time
# Add columns sd1..sd10 to test_data: site/day-of-week tokens,
# computed row by row (axis=1) with sd().
for n in range(1, 11):
    test_data["sd"+str(n)] = test_data.apply(lambda x: sd(x, n), axis=1)

CPU times: user 16.2 s, sys: 68 ms, total: 16.2 s
Wall time: 16.2 s


In [16]:
%%time
# Add columns sh1..sh10 to test_data: site/start-hour tokens,
# computed row by row (axis=1) with sh().
for n in range(1, 11):
    test_data["sh"+str(n)] = test_data.apply(lambda x: sh(x, n), axis=1)

CPU times: user 16.4 s, sys: 28 ms, total: 16.4 s
Wall time: 16.4 s


In [17]:
# Sentinel key marking the end of the chain built by make_trie.
_end = '_end_'
def make_trie(session):
    """Build a nested-dict chain from one session.

    Each element of ``session`` becomes a key whose value is the dict for the
    next element; the innermost dict holds the ``_end`` sentinel.
    """
    root = dict()
    current_dict = root
    for site in session:
        current_dict = current_dict.setdefault(site, {})
    current_dict[_end] = _end
    return root

def in_trie(trie, sequence):
    """Tell whether ``sequence`` occurs, in order, along the trie chain.

    Elements need not be adjacent: nodes that do not match the wanted site
    are skipped by descending into the first child. Returns False for an
    empty ``sequence`` and whenever the end sentinel is reached before every
    element has been matched.
    """
    current_dict = trie
    found = False
    counter = 0
    for site in sequence:
        while _end not in current_dict:
            if site in current_dict:
                found = True
                counter += 1
                current_dict = current_dict[site]
                break
            found = False
            # next(iter(...)) works on Python 2 and 3 alike, unlike
            # dict.itervalues().next().
            current_dict = next(iter(current_dict.values()))
            if _end in current_dict:
                return False
    # counter falls short of len(sequence) when the chain ended early.
    return found and counter == len(sequence)

In [21]:
# Build a demo trie from the first row of train_test_df_sites. That frame is
# only created in a later cell, so on a fresh kernel this cell raises the
# NameError shown below.
trie = make_trie(train_test_df_sites.iloc[0].as_matrix().astype(str))
trie

NameError: name 'train_test_df_sites' is not defined

In [620]:
in_trie(trie, ['3537', '5759'])

True

In [35]:
train_data[['sdh' + str(c) for c in range(1,10+1)]]

Unnamed: 0,sdh1,sdh2,sdh3,sdh4,sdh5,sdh6,sdh7,sdh8,sdh9,sdh10
0,3537z4z14,3537z4z14,3537z4z14,5759z4z14,37z4z14,16z4z14,3537z4z14,47z4z14,17z4z14,53z4z14
1,47z4z14,37z4z14,53z4z14,56z4z14,3537z4z14,47z4z14,53z4z14,47z4z14,17z4z14,53z4z14
2,22z4z14,53z4z14,17z4z14,6z4z14,22z4z14,22z4z14,22z4z14,22z4z14,90z4z14,22z4z14
3,22z4z14,87z4z14,22z4z14,22z4z14,14326z4z14,14326z4z14,87z4z14,14326z4z14,14334z4z14,14326z4z14
4,14334z4z14,14326z4z14,,,,,,,,
5,540z4z15,56z4z15,16z4z15,47z4z15,47z4z15,17z4z15,53z4z15,1027z4z15,347z4z15,16z4z15
6,347z4z16,347z4z16,90z4z16,22z4z16,680z4z16,690z4z16,662z4z16,690z4z16,662z4z16,662z4z16
7,690z4z16,38z4z16,680z4z16,690z4z16,662z4z16,680z4z16,690z4z16,662z4z16,662z4z16,690z4z16
8,680z4z16,680z4z16,662z4z16,690z4z16,662z4z16,662z4z16,882z4z16,690z4z16,680z4z16,690z4z16
9,2415z4z16,680z4z16,690z4z16,2415z4z16,662z4z16,882z4z16,2415z4z16,690z4z16,690z4z16,882z4z16


In [None]:
%%time
# Let's find frequent patterns is sessions for every user
train_df_sites = train_data[['sd' + str(c) for c in range(1,10+1)]+["target"]]
user_seq_dic = {}

for r in range(len(train_df_sites)):
    session = train_df_sites.iloc[r,:-1].as_matrix().astype(str)
    session = np.delete(session, np.where(session.astype(str) == "")[0])
    sequence = []
    new_seq = True
    if train_df_sites.iloc[r,-1] not in user_seq_dic:
        user_seq_dic[train_df_sites.iloc[r,-1]] = {}
    seq_dic = user_seq_dic[train_df_sites.iloc[r,-1]]

    ses_seq_set = set()   

    it = np.nditer(session, flags=['f_index'])
    while not it.finished:
        ses_seq_set.add(tuple([str(it[0])]))
        asession = session[it.index+1:].copy()
        if len(asession) > 0:      
            ita = np.nditer(asession, flags=['f_index'])
            while not ita.finished:
                ses_seq_set.add(tuple([str(it[0]), str(ita[0])]))
                bsession = asession[ita.index+1:]
                if len(bsession) > 0:
                    itb = np.nditer(bsession, flags=['f_index'])
                    while not itb.finished:
                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0])]))
                        csession = bsession[itb.index+1:]
                        if len(csession) > 0:
                            itc = np.nditer(csession, flags=['f_index'])
                            while not itc.finished:
                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0])]))
                                dsession = csession[itc.index+1:]
                                if len(dsession) > 0:
                                    itd = np.nditer(dsession, flags=['f_index'])
                                    while not itd.finished:
                                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0])]))
                                        esession = dsession[itd.index+1:]
                                        if len(esession) > 0:
                                            ite = np.nditer(esession, flags=['f_index'])
                                            while not ite.finished:
                                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0])]))
                                                fsession = esession[ite.index+1:]
                                                if len(fsession) > 0:
                                                    itf = np.nditer(fsession, flags=['f_index'])
                                                    while not itf.finished:
                                                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0]), str(itf[0])]))
                                                        gsession = fsession[itf.index+1:]
                                                        if len(gsession) > 0:
                                                            itg = np.nditer(gsession, flags=['f_index'])
                                                            while not itg.finished:
                                                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0]), str(itf[0]), str(itg[0])]))
                                                                itg.iternext()
                                                        itf.iternext()
                                                ite.iternext()
                                        itd.iternext()
                                itc.iternext()
                        itb.iternext()
                ita.iternext()
        it.iternext()
    
    for seq in ses_seq_set:
        if seq in seq_dic:
            seq_dic[seq] +=1
        else:
            seq_dic[seq] = 1

CPU times: user 15min 46s, sys: 10.2 s, total: 15min 56s
Wall time: 15min 55s


In [20]:
%%time
# Early Pruning
for us, ses in user_seq_dic.items():
    for seq, sup in ses.items():
        if sup < 2 or len(seq) < 2:
            del ses[seq]

CPU times: user 55.2 s, sys: 48 ms, total: 55.2 s
Wall time: 55.2 s


In [None]:
%%time
seq_user_dic = {}
for us, ses in user_seq_dic.items():
    for seq, sup in ses.items():
        if seq in seq_user_dic:
            seq_user_dic[seq][us] = sup
        else:
            seq_user_dic[seq] = {us: sup}
        del user_seq_dic[us][seq]

In [None]:
Counter([len(k) for k, v in seq_user_dic.items()])

In [None]:
len(seq_user_dic)

In [None]:
Counter([len(v) for k, v in seq_user_dic.items()])

In [None]:
%%time
for seq, us in seq_user_dic.items():
    if max(us.values()) < 3:
        del seq_user_dic[seq]

In [None]:
%%time
for seq, us in seq_user_dic.items():
    if len(us) > 1:
        del seq_user_dic[seq]

In [51]:
seq_user_dic

{('5341z2', '5354z2', '82z2'): {1531: 3},
 ('2367z0', '2184z0', '15137z0', '15137z0', '15137z0'): {3165: 4},
 ('934z1', '903z1', '893z1'): {106: 11},
 ('1447z2', '22z2', '71z2', '1447z2'): {2824: 3},
 ('7z0', '67z0', '2172z0', '11z0'): {537: 3},
 ('21131z1', '21127z1', '21127z1'): {944: 6},
 ('466z5', '32z5', '170z5'): {2897: 4},
 ('71z0', '77z0', '77z0', '22z0', '77z0'): {2824: 3},
 ('11z1', '888z1', '887z1', '934z1'): {106: 5},
 ('1447z2', '64z2', '71z2', '64z2', '71z2'): {2336: 3},
 ('32z1', '4159z1', '4159z1'): {2630: 3},
 ('8z1', '888z1', '893z1', '887z1'): {106: 5},
 ('16167z5', '2099z5', '16154z5'): {3165: 7},
 ('8443z5', '8z5', '466z5'): {2897: 4},
 ('388z0', '3804z0', '388z0'): {537: 4},
 ('3398z0', '3398z0', '3731z0'): {537: 7},
 ('6024z6', '82z6', '6024z6', '79z6', '6024z6'): {380: 3},
 ('8383z2', '17271z2', '369z2', '8383z2', '369z2'): {3324: 6},
 ('17671z1', '11z1'): {3375: 4},
 ('17699z1', '7z1'): {3375: 6},
 ('6239z1', '6239z1', '6239z1', '6239z1', '961z1'): {1924: 6},
 

In [57]:
%%time
train_df_sites = train_data[['sd' + str(c) for c in range(1,10+1)]].astype('str')
train_data["sequence"] = ""
for r in range(len(train_df_sites)):
    session = train_df_sites.iloc[r].as_matrix().astype(str)
    session = np.delete(session, np.where(session.astype(str) == "")[0])

    ses_seq = []

    ses_seq_set = set()
    

    it = np.nditer(session, flags=['f_index'])
    while not it.finished:
        ses_seq_set.add(tuple([str(it[0])]))
        asession = session[it.index+1:].copy()
        if len(asession) > 0:      
            ita = np.nditer(asession, flags=['f_index'])
            while not ita.finished:
                ses_seq_set.add(tuple([str(it[0]), str(ita[0])]))
                bsession = asession[ita.index+1:]
                if len(bsession) > 0:
                    itb = np.nditer(bsession, flags=['f_index'])
                    while not itb.finished:
                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0])]))
                        csession = bsession[itb.index+1:]
                        if len(csession) > 0:
                            itc = np.nditer(csession, flags=['f_index'])
                            while not itc.finished:
                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0])]))
                                dsession = csession[itc.index+1:]
                                if len(dsession) > 0:
                                    itd = np.nditer(dsession, flags=['f_index'])
                                    while not itd.finished:
                                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0])]))
                                        esession = dsession[itd.index+1:]
                                        if len(esession) > 0:
                                            ite = np.nditer(esession, flags=['f_index'])
                                            while not ite.finished:
                                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0])]))
                                                fsession = esession[ite.index+1:]
                                                if len(fsession) > 0:
                                                    itf = np.nditer(fsession, flags=['f_index'])
                                                    while not itf.finished:
                                                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0]), str(itf[0])]))
                                                        gsession = fsession[itf.index+1:]
                                                        if len(gsession) > 0:
                                                            itg = np.nditer(gsession, flags=['f_index'])
                                                            while not itg.finished:
                                                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0]), str(itf[0]), str(itg[0])]))
                                                                itg.iternext()
                                                        itf.iternext()
                                                ite.iternext()
                                        itd.iternext()
                                itc.iternext()
                        itb.iternext()
                ita.iternext()
        it.iternext()
    
    for seq in ses_seq_set:
        if seq in seq_user_dic:
            ses_seq.append("_".join(np.array(seq).astype(str).tolist()))
                          
        if len(ses_seq):
            train_data.set_value(r, -1, " ".join(ses_seq), takeable=True)

CPU times: user 11min 4s, sys: 620 ms, total: 11min 5s
Wall time: 11min 4s


In [15]:
%%time
test_df_sites = test_data[['sd' + str(c) for c in range(1,10+1)]].astype('str')
test_data["sequence"] = ""
for r in range(len(test_df_sites)):
    session = test_df_sites.iloc[r].as_matrix().astype(str)
    session = np.delete(session, np.where(session.astype(str) == "")[0])

    ses_seq = []

    ses_seq_set = set()
    

    it = np.nditer(session, flags=['f_index'])
    while not it.finished:
        ses_seq_set.add(tuple([str(it[0])]))
        asession = session[it.index+1:].copy()
        if len(asession) > 0:      
            ita = np.nditer(asession, flags=['f_index'])
            while not ita.finished:
                ses_seq_set.add(tuple([str(it[0]), str(ita[0])]))
                bsession = asession[ita.index+1:]
                if len(bsession) > 0:
                    itb = np.nditer(bsession, flags=['f_index'])
                    while not itb.finished:
                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0])]))
                        csession = bsession[itb.index+1:]
                        if len(csession) > 0:
                            itc = np.nditer(csession, flags=['f_index'])
                            while not itc.finished:
                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0])]))
                                dsession = csession[itc.index+1:]
                                if len(dsession) > 0:
                                    itd = np.nditer(dsession, flags=['f_index'])
                                    while not itd.finished:
                                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0])]))
                                        esession = dsession[itd.index+1:]
                                        if len(esession) > 0:
                                            ite = np.nditer(esession, flags=['f_index'])
                                            while not ite.finished:
                                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0])]))
                                                fsession = esession[ite.index+1:]
                                                if len(fsession) > 0:
                                                    itf = np.nditer(fsession, flags=['f_index'])
                                                    while not itf.finished:
                                                        ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0]), str(itf[0])]))
                                                        gsession = fsession[itf.index+1:]
                                                        if len(gsession) > 0:
                                                            itg = np.nditer(gsession, flags=['f_index'])
                                                            while not itg.finished:
                                                                ses_seq_set.add(tuple([str(it[0]), str(ita[0]), str(itb[0]), str(itc[0]), str(itd[0]), str(ite[0]), str(itf[0]), str(itg[0])]))
                                                                itg.iternext()
                                                        itf.iternext()
                                                ite.iternext()
                                        itd.iternext()
                                itc.iternext()
                        itb.iternext()
                ita.iternext()
        it.iternext()
    
    for seq in ses_seq_set:
        if seq in seq_user_dic:
            ses_seq.append("_".join(np.array(seq).astype(str).tolist()))
                          
        if len(ses_seq):
            test_data.set_value(r, -1, " ".join(ses_seq), takeable=True)

CPU times: user 3min 34s, sys: 260 ms, total: 3min 35s
Wall time: 3min 34s


In [None]:
#sd test

In [77]:
%%time
#DON'T RUN TOO LONG
train_data["sequence"] = ""
for r, row in train_data.iterrows():
    trie = make_trie(train_data[['sd' + str(c) for c in range(1,10+1)]].iloc[r].as_matrix().astype(str))
    ses_seq = set()
    #for us, hses in user_seq_dic.items():
    for seq, us in seq_user_dic.items():
        #if row["target"] in us:
        sequence = np.array(seq).astype(str).tolist()
        if in_trie(trie, sequence):
            ses_seq |= set(["_".join(sequence)])
    if len(ses_seq):
        train_data.set_value(r, -1, " ".join(ses_seq), takeable=True)

KeyboardInterrupt: 

In [58]:
# Fill the whole "sequence" column of test_data with the string "-1".
test_data["sequence"] = "-1"

In [None]:
#DON'T RUN TOO LONG
%%time
test_data["sequence"] = ""
for r, row in test_data.iterrows():
    trie = make_trie(test_data[['sdh' + str(c) for c in range(1,10+1)]].iloc[r].as_matrix().astype(str))
    ses_seq = []
    for seq, sup in reduced_seq_dic.items():
        sequence = np.array(seq).astype(str).tolist()
        if in_trie(trie, sequence):
            ses_seq.append("_".join(sequence))
    if len(ses_seq):
        test_data.set_value(r, -1, " ".join(ses_seq), takeable=True)

In [60]:
train_data[train_data.sequence != ""]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,sh2,sh3,sh4,sh5,sh6,sh7,sh8,sh9,sh10,sequence
8,680,680,662,690,662,662,882,690,680,690,...,680z16,662z16,690z16,662z16,662z16,882z16,690z16,680z16,690z16,662z4_690z4_882z4_690z4_680z4
10,690,662,882,882,690,680,882,690,2415,680,...,662z16,882z16,882z16,690z16,680z16,882z16,690z16,2415z16,680z16,662z4_690z4_882z4_690z4_680z4
12,680,680,662,690,882,662,690,680,662,662,...,680z16,662z16,690z16,882z16,662z16,690z16,680z16,662z16,662z16,662z4_690z4_882z4_690z4_680z4
14,20,32,662,690,32,690,662,690,662,32,...,32z16,662z16,690z16,32z16,690z16,662z16,690z16,662z16,32z16,662z4_32z4_690z4_32z4 662z4_32z4_690z4_690z4_3...
15,662,32,662,690,690,32,662,662,32,32,...,32z16,662z16,690z16,690z16,32z16,662z16,662z16,32z16,32z16,32z4_662z4_690z4_32z4_32z4 662z4_32z4_690z4_32...
21,825,2483,2497,690,32,662,662,690,32,32,...,2483z16,2497z16,690z16,32z16,662z16,662z16,690z16,32z16,32z16,32z4_662z4_690z4_32z4_32z4
22,32,32,690,662,690,690,32,690,690,32,...,32z16,690z16,662z16,690z16,690z16,32z16,690z16,690z16,32z16,32z4_662z4_690z4_32z4_32z4 662z4_32z4_690z4_32...
33,307,280,307,307,307,307,307,381,8,55,...,280z15,307z15,307z15,307z15,307z15,307z15,381z15,8z15,55z15,280z4_307z4_381z4_8z4_55z4 280z4_381z4_8z4_55z4
144,280,307,307,8,245,38,381,8,55,307,...,307z13,307z13,8z13,245z13,38z13,381z13,8z13,55z13,307z13,280z4_381z4_8z4_55z4 280z4_307z4_381z4_8z4_55z4
186,280,307,38,381,307,8,55,280,307,280,...,307z14,38z14,381z14,307z14,8z14,55z14,280z14,307z14,280z14,280z4_381z4_8z4_55z4 280z4_307z4_381z4_8z4_55z4


In [51]:
%%time
for r, row in train_data[train_data.sequence == ""].iterrows():
    session = train_data[['sdh' + str(c) for c in range(1,10+1)]].iloc[r].as_matrix()
    session = np.delete(session, np.where(session == "")[0])
    train_data.set_value(r, -1, " ".join(session.astype(str).tolist()), takeable=True)

CPU times: user 1min 46s, sys: 3.61 s, total: 1min 50s
Wall time: 1min 50s


In [17]:
%%time
for r, row in test_data[test_data.sequence == ""].iterrows():
    session = test_data[['sdh' + str(c) for c in range(1,10+1)]].iloc[r].as_matrix()
    session = np.delete(session, np.where(session == "")[0])
    test_data.set_value(r, -1, " ".join(session.astype(str).tolist()), takeable=True)

CPU times: user 2.12 s, sys: 1.41 s, total: 3.54 s
Wall time: 3.53 s


In [61]:
# Persist both frames including the new token and "sequence" columns.
# index=False means the session_id index is NOT written to these files.
train_data.to_csv("full_train_w8_seq_sd_clean.csv", index=False)
test_data.to_csv("full_test_w8_seq_sd_clean.csv", index=False)

In [62]:
# Stack train on top of test so one vocabulary covers both.
train_test_df = pd.concat([train_data, test_data])

session_length = 10

y = train_data["target"]
# Uniform weight for each of the 550 classes.
y_weights = [1.0] * 550

train_test_df_sites = train_test_df.sequence

# Bag-of-patterns counts over the "sequence" strings. Despite its name,
# `tfidf` is a plain CountVectorizer. The raw-string pattern avoids the
# invalid "\w" escape; fit_transform does the fit and the transform in one pass.
tfidf = CountVectorizer(token_pattern=r'\w+')
X_train_test_seq_sparse = tfidf.fit_transform(train_test_df_sites.values)

#train_test_df_sites = train_test_df[['site' + str(c) for c in range(1,10+1)]].fillna(0).astype('int')
#train_test_df_sites_array = [" ".join([str(s) for s in train_test_df_sites.as_matrix()[i].astype(str) if int(s) != 0]) \
 #                                                             for i in range(train_test_df_sites.shape[0])]


In [63]:
# Split the stacked matrix back into its train rows (first len(train_data))
# and test rows (the rest).
X_train_sites_seq_sparse = X_train_test_seq_sparse[:len(train_data)]
X_test_sites_seq_sparse = X_train_test_seq_sparse[len(train_data):]

# Encode the targets as strings and shift the codes by one, so labels start
# at 1 instead of 0.
class_encoder = LabelEncoder().fit(y.astype('str'))
y_for_vw = class_encoder.transform(y.astype('str')) + 1

# Number of distinct sequence tokens (columns of the count matrix).
sites_seq_columns_num = X_train_test_seq_sparse.shape[1]

In [64]:
%%time
#5
#train_test_df = pd.concat([train_data, test_data])

#session_length = 10
#train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                       #[['site' + str(c) for c in range(1,10+1)]+["target"]].index)

#test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                       #[['site' + str(c) for c in range(1,10+1)]].index)
#train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
#test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

#y = train_data["target"]

train_test_df_sites = train_test_df[['site' + str(c) for c in range(1,10+1)]].fillna(0).astype('int')
train_test_df_sites_array = [" ".join([str(s) for s in train_test_df_sites.as_matrix()[i].astype(str) if int(s) != 0]) \
                                                             for i in range(train_test_df_sites.shape[0])]

tfidf2 = TfidfVectorizer(token_pattern='\w+').fit(train_test_df_sites_array) #TfidfVectorizer()
X_train_test_sparse = tfidf2.transform(train_test_df_sites_array)

X_train_sites_sparse = X_train_test_sparse[:len(train_data)]
X_test_sites_sparse = X_train_test_sparse[len(train_data):]

#class_encoder = LabelEncoder().fit(y.astype('str'))
#y_for_vw = class_encoder.transform(y.astype('str')) + 1

sites_columns_num = X_train_test_sparse.shape[1]
#inv_vocabulary = {v: int(re.search("s_(\d+)$", k).group(1)) for k, v in tfidf.vocabulary_.iteritems()}

#y_weights = [(np.sum(Counter(y_for_vw).values()) - v + min((Counter(y_for_vw).values())))/ \
            #float(np.sum(Counter(y_for_vw).values())) for k, v in sorted(Counter(y_for_vw).items())]

#y_weights = [1.0] * 550

CPU times: user 8.18 s, sys: 88 ms, total: 8.27 s
Wall time: 8.3 s


In [65]:
%%time
#6
mycolumns = [label for label in test_data[range(20, 40)]]

train_features, test_features = features_to_sparse(train_data, test_data, mycolumns)

X_train_sparse, X_test_sparse = combine_sites_features_sparse(X_train_sites_sparse, X_train_sites_seq_sparse, train_features,\
                                                             X_test_sites_sparse, X_test_sites_seq_sparse, test_features)#, test_preds_sparse, \
                                                              #train_site_times_sparse, test_site_times_sparse, \
                                                              #train_site_rels_sparse, test_site_rels_sparse)
                                                             #train_site_sequence, test_site_sequence)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3, stratify=y_for_vw)

#y_train_weights = [(np.sum(Counter(y_train).values()) - v + min((Counter(y_train).values()))) / \
                   #float(np.sum(Counter(y_train).values())) for k, v in sorted(Counter(y_train).items())]

y_train_weights = [1.0] * 550

CPU times: user 3.71 s, sys: 72 ms, total: 3.78 s
Wall time: 3.79 s


In [67]:
%%time
#7
# Convert the four sparse matrices (train part, validation, full train, test)
# with sparse_matrix_to_vw, which is defined outside this part of the notebook.
# Judging by the key listing printed in a later cell, each result is a
# dict-like object keyed by feature group (e.g. "sites", "sequence", "lables").
# Labels are passed for the three labelled sets, weights only for the two
# training sets.
train_part_vw = sparse_matrix_to_vw(X_train, sites_columns_num, sites_seq_columns_num, y_train, weights=y_train_weights, mycolumns=mycolumns)
valid_vw = sparse_matrix_to_vw(X_valid, sites_columns_num, sites_seq_columns_num, y_valid, mycolumns=mycolumns)
train_vw = sparse_matrix_to_vw(X_train_sparse, sites_columns_num, sites_seq_columns_num, y_for_vw, weights=y_weights, mycolumns=mycolumns)
test_vw = sparse_matrix_to_vw(X_test_sparse, sites_columns_num, sites_seq_columns_num, mycolumns=mycolumns)

CPU times: user 9min 22s, sys: 896 ms, total: 9min 23s
Wall time: 9min 22s


In [70]:
# Print each feature group of the train-part examples with its number of
# entries, in alphabetical order of the group name.
for key, val in sorted(train_part_vw.items()):
    print key, len(val)

bot30_portion 29103
day_of_week 76515
daytime 76515
fb_portion 9906
lable_weights 76515
lables 76515
prediction 15823
sequence 31477
session_timespan 75065
site_longest_time 76515
siterels 0
sites 76515
sitetimes 0
start_hour 76515
top30_portion 53953
unique_site 76515
youtube_portion 4242


In [71]:
# Output directory and file-name suffix shared by all VW files, pickles and
# models written by the cells below.
folder = 'vw/'
handler = '_idf_w8_seq'

In [19]:
# Pickle the VW example dictionaries. Only the full training set is
# currently written; the other dumps are commented out.
print "Saving vw files"
#with open(folder+'train_part'+handler+'.pkl', 'wb') as f:
    #pickle.dump(train_part_vw, f)
#with open(folder+'valid'+handler+'.pkl', 'wb') as f:
    #pickle.dump(valid_vw, f)
with open(folder+'train'+handler+'.pkl', 'wb') as f:
    pickle.dump(train_vw, f)
#with open(folder+'test'+handler+'.pkl', 'wb') as f:
    #pickle.dump(test_vw, f)
#with open(folder+'class_encoder'+handler+'.pkl', 'wb') as f:
    #pickle.dump(class_encoder, f)

#y.to_csv(folder+'y'+handler+'.csv', index=False, header=False)
#pd.DataFrame(y_train).to_csv(folder+'y_train'+handler+'.csv', index=False, header=False)
#pd.DataFrame(y_valid).to_csv(folder+'y_valid'+handler+'.csv', index=False, header=False)

Saving vw files


KeyboardInterrupt: 

In [72]:
%%time
#8

# Feature groups written next to the "sites" group in every .vw file.
keys = ['day_of_week', 'start_hour', 'sequence', 'prediction']

# vw_to_file is defined outside this part of the notebook. Labels are passed
# for the three labelled sets, label weights only for the two training sets;
# the test file gets neither.
vw_to_file(train_part_vw["sites"], folder+'train_part'+handler+'.vw', \
           features={x:train_part_vw[x] for x in keys}, \
           lables=train_part_vw["lables"], lable_weights=train_part_vw["lable_weights"], quiet=True)
vw_to_file(valid_vw["sites"], folder+'valid'+handler+'.vw', features={x:valid_vw[x] for x in keys}, \
           lables=valid_vw["lables"], quiet=True)
vw_to_file(train_vw["sites"], folder+'train'+handler+'.vw', features={x:train_vw[x] for x in keys}, \
           lables=train_vw["lables"], lable_weights=train_vw["lable_weights"], quiet=True)
vw_to_file(test_vw["sites"], folder+'test'+handler+'.vw', features={x:test_vw[x] for x in keys}, quiet=True)

CPU times: user 1.16 s, sys: 80 ms, total: 1.24 s
Wall time: 1.24 s


In [420]:
# Read the four VW files back as lists of lines (one example per line).
# `with` closes each handle even when reading fails.
with open(folder+'train_part'+handler+'.vw') as f:
    train_part_file = f.readlines()

with open(folder+'train'+handler+'.vw') as f:
    train_file = f.readlines()

with open(folder+'valid'+handler+'.vw') as f:
    valid_file = f.readlines()

with open(folder+'test'+handler+'.vw') as f:
    test_file = f.readlines()

In [75]:
len(valid_file)

32749

In [121]:
%%time
# Fit the sklearn-style VW wrapper on the already formatted train-part lines
# (convert_to_vw=False, so the input is used as given), predict the
# validation lines and print the hold-out accuracy.
model = VW(oaa=550, passes=20, b=28, convert_to_vw=False, \
          cubic="sbc", q="sd", quiet=False, l=0.45, decay_learning_rate=0.9, loss_function="squared")
model.fit(train_part_file)
vw_pred = model.predict(valid_file)
print accuracy_score(y_valid, vw_pred)

0.540471311475
CPU times: user 43 s, sys: 9.55 s, total: 52.6 s
Wall time: 48.7 s


In [153]:
# Replace vw_pred with a previously written submission file. The table
# displayed further down shows session_id and user_id columns.
vw_pred = pd.read_csv("kaggle_data/28vw_submission_idf_w8_1.csv")

In [None]:
# One-column frame ("true") holding the hold-out labels.
validation = pd.DataFrame(np.transpose([y_valid]), columns=["true"])

In [58]:
# Wrap the predictions into a one-column integer frame named "vw_pred".
vw_pred = pd.DataFrame(np.transpose([vw_pred]).astype(int), columns=["vw_pred"])

In [126]:
# Look up the original target behind encoded class 516.
class_encoder.inverse_transform(516)

'783'

In [154]:
#validation = pd.concat([validation, pd.DataFrame(np.transpose([vw_pred]).astype(int), columns=["vw_pred"])], axis=1)
# For every test row, look for a "<label>:100" entry in its "prediction"
# feature string. If present, decode the 1-based label back to the original
# user id; otherwise record 0, meaning "no manual prediction".
ind = []
val = []
for i in range(len(test_file)):
    m = None
    if i in test_vw["prediction"]:
        m = re.search(r"(\d+):100", test_vw["prediction"][i])
    ind.append(i)
    if m:
        val.append(class_encoder.inverse_transform(int(m.group(1))-1))
        #val.append(int(m.group(1)))
    else:
        val.append(0)
preds = pd.DataFrame(np.transpose([val]), columns=["mypred"])
# Put the model predictions and the manual ones side by side, row by row.
validation = pd.concat([vw_pred, preds], axis=1)


In [156]:
def result(row):
    """Pick the final user id for one row.

    The manual prediction ``mypred`` wins, returned as a string, unless it
    is 0; in that case the row's ``user_id`` is returned unchanged.
    """
    has_manual_pred = int(row.mypred) != 0
    return str(row["mypred"]) if has_manual_pred else row["user_id"]

In [157]:
validation

Unnamed: 0,session_id,user_id,mypred
0,1,537,0
1,2,783,783
2,3,1645,0
3,4,1962,0
4,5,631,631
5,6,1647,0
6,7,537,0
7,8,1113,0
8,9,3342,0
9,10,1505,0


In [155]:
validation[(validation.mypred.astype(int) != 0) & (validation.user_id.astype(int) != validation.mypred.astype(int))]

Unnamed: 0,session_id,user_id,mypred
31,32,494,3075
328,329,558,3324
356,357,2336,231
396,397,537,783
465,466,2902,3165
869,870,2738,2639
971,972,2238,2693
1154,1155,467,2942
1162,1163,2946,3065
1185,1186,2971,860


In [158]:
# Final answer per row: the manual prediction when present, else the model's user_id.
validation["result"] = validation.apply(lambda row: result(row), axis=1)

In [159]:
# Drop the two source columns and write what remains, relabelled as
# session_id / user_id, as the submission file.
validation.drop(["user_id", "mypred"], axis=1).to_csv("kaggle_data/43_vw_submission_manual_pred.csv", index=False, header=["session_id", "user_id"])

In [83]:
# Shift the "result" values back to 0-based codes and decode them with the
# label encoder.
# NOTE(review): this treats "result" as 1-based encoded labels, whereas the
# result() helper returns user ids -- confirm which state of `validation`
# this cell was meant to run on.
vw_test_pred = pd.DataFrame(validation["result"])
t_submission = pd.DataFrame(vw_test_pred.astype(int)-1)
vw_subm = class_encoder.inverse_transform(t_submission)

In [85]:
# write_to_submission_file is defined outside this part of the notebook.
write_to_submission_file(vw_subm,
             'kaggle_data/40vw_submission_pred.csv')
print "Finished creating submission.\n"

Finished creating submission.



In [73]:
%%time
#9
# Train a 550-class one-against-all model with the vw command line tool on
# the train-part file, then predict the validation file in test-only mode
# (-t) and score the predictions against y_valid.
!vw --oaa=550 -d {folder}train_part{handler}.vw \
-f {folder}initial_model{handler}.model -b 27 -c -k \
--passes=5 -l 0.45 --decay_learning_rate=0.9 \
--keep "b" --keep "c" --keep "z" --keep "a" --keep "s" --cubic "sbc"

!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

creating cubic features for triples: sbc 
using namespaces beginning with: b c z a s 
final_regressor = vw/initial_model_idf_w8_seq.model
Num weight bits = 27
learning rate = 0.45
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = vw/train_part_idf_w8_seq.vw.cache
Reading datafile = vw/train_part_idf_w8_seq.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      159        1       15
1.000000 1.000000            2            2.0       12      159      183
1.000000 1.000000            4            4.0      413      159       13
1.000000 1.000000            8            8.0      141      159       19
1.000000 1.000000           16           16.0      491      141        9
0.968750 0.937500           32           32.0      246      159       15
0.937500 0.906250           64           64.0      509      386

In [282]:
%%time
# Tuned one-against-all run on the training split: 28 weight bits, 30 passes, logistic
# loss, very small L1/L2 regularisation, cubic s*b*c features, namespaces s/b/c/l/a only,
# and stage_poly with a fixed batch size of one sixth of train_part_file.
# NOTE(review): the learning-rate / initial_t values look like the output of a
# hyper-parameter search — confirm their origin. {len(train_part_file)/6} relies on
# Python 2 integer division; under Python 3 it would interpolate a float.
!vw --oaa=550 -d {folder}train_part{handler}.vw \
-f {folder}initial_model{handler}.model -b 28 -c -k \
--passes=30 --decay_learning_rate 0.9 --initial_t 0.002337045080352835 \
-l 0.5416950450219994 \
--power_t 0.5 --loss_function='logistic' --l1 1e-11 --l2 1e-11 \
--cubic="sbc"  \
--keep "s" --keep "b" --keep "c" --keep "l" --keep "a" \
--stage_poly --batch_sz {len(train_part_file)/6} --batch_sz_no_doubling

creating cubic features for triples: sbc 
using namespaces beginning with: s b c l a 
using l1 regularization = 1e-11
using l2 regularization = 1e-11
final_regressor = kaggle_data/initial_model_idf_w8_seq.model
Num weight bits = 28
learning rate = 0.541695
initial_t = 0.00233705
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_part_idf_w8_seq.vw.cache
Reading datafile = kaggle_data/train_part_idf_w8_seq.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0       69        1       55
1.000000 1.000000            2            2.0      276       69       11
1.000000 1.000000            4            4.0      364      276       11
1.000000 1.000000            8            8.0      145       12       51
1.000000 1.000000           16           16.0      291      276       45
1.000000 1.000000           32  

In [408]:
# Score the validation file with the most recently saved model (-t: test only) and write
# the predictions to {folder}vw_valid_pred{handler}.csv.
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

# Load the predicted labels (no header row) and measure accuracy against y_valid.
vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

Accuracy: 0.596499252889


## Confusion Matrix

In [404]:
# Number of training examples per label in y_train; used later to annotate confused pairs.
countery = Counter(y_train)
# (true index, predicted index) -> count; filled in by the confusion-matrix cell.
confusion = {}

In [405]:
# Confusion matrix of the validation predictions: entry [i, j] counts examples whose true
# label is the i-th class and whose predicted label is the j-th class.
M = confusion_matrix(y_valid, vw_valid_pred)
# Row-normalised copy: each row is divided by the number of examples with that true label.
M_normalized = M.astype('float') / M.sum(axis=1)[:, np.newaxis]
# Rebuild the dict here instead of relying on the earlier `confusion = {}` cell, so that
# re-running this cell after new predictions cannot keep stale (true, predicted) entries.
# NOTE(review): the keys are positions in the matrix; they equal encoded class ids only
# if every class occurs in y_valid or in the predictions — confirm, or pass labels=.
confusion = {}
for (t, f), value in np.ndenumerate(M):
    if t != f and value > 0:
        confusion[(t, f)] = value

In [371]:
# Directed confusions whose mirror image (predicted -> true) never occurs.
one_confusion = dict(
    (edge, hits) for edge, hits in confusion.items()
    if edge[::-1] not in confusion
)

# Mutual confusions: both directions occur in `confusion`. Only the first direction met
# while iterating is stored (with that direction's count); its mirror is then skipped.
two_confusion = {}
for edge, hits in confusion.items():
    mirror = edge[::-1]
    if mirror not in confusion or mirror in two_confusion:
        continue
    two_confusion[edge] = hits

In [376]:
train_data

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,site_longest_time,start_hour,day_of_week,daytime,fb_portion,youtube_portion,top30_portion,bot30_portion,prediction,target
0,3537,3537,3537,5759,37,16,3537,47,17,53,...,5759,14,4,1,0.000000,0.0,0.000000,0.500000,0,280
1,47,37,53,56,3537,47,53,47,17,53,...,56,14,4,1,0.000000,0.0,0.000000,0.000000,0,280
2,22,53,17,6,22,22,22,22,90,22,...,17,14,4,1,0.000000,0.0,0.029412,0.000000,0,280
3,22,87,22,22,14326,14326,87,14326,14334,14326,...,22,14,4,1,0.000000,0.0,0.428571,0.571429,0,280
4,14334,14326,0,0,0,0,0,0,0,0,...,14334,14,4,1,0.000000,0.0,0.000000,1.000000,0,280
5,540,56,16,47,47,17,53,1027,347,16,...,53,15,4,1,0.000000,0.0,0.000000,0.000000,0,280
6,347,347,90,22,680,690,662,690,662,662,...,347,16,4,1,0.000000,0.0,0.115385,0.000000,0,280
7,690,38,680,690,662,680,690,662,662,690,...,38,16,4,1,0.000000,0.0,0.615385,0.000000,0,280
8,680,680,662,690,662,662,882,690,680,690,...,662,16,4,1,0.000000,0.0,0.000000,0.000000,0,280
9,2415,680,690,2415,662,882,2415,690,690,882,...,882,16,4,1,0.000000,0.0,0.000000,0.000000,0,280


In [378]:
# Remove every user that takes part in a mutually-confused pair. All ids are collected
# first and filtered out with a single mask; the original re-filtered and copied the whole
# frame twice per pair, and its sort by count had no effect on the result.
# NOTE(review): the keys of two_confusion are confusion-matrix indices, which other cells
# decode with class_encoder.inverse_transform, while train_data.target is queried with raw
# ids such as 2336 elsewhere in the notebook. Confirm that `target` holds encoded labels
# at this point; otherwise the wrong users are removed.
confused_ids = set()
for a, b in two_confusion:
    confused_ids.update((a, b))
train_data = train_data[~train_data.target.isin(list(confused_ids))].copy()

226 317
351 235
316 137
329 248
363 178
510 251
266 140
34 371
112 405
245 539
306 195
138 380
548 154
82 240
502 314
72 496
376 134
116 228
461 17
4 342
242 399
359 17
238 441
49 468
416 385
445 375
291 94
97 17
353 378
272 321
123 298
416 468
296 342
37 378
3 56
296 4
144 486
541 493
180 168
289 346
461 97
385 468
533 499
9 269
403 422
533 123
124 147
175 307
472 317
71 129
469 40
298 533
517 17
353 37
476 293
547 334
49 416
184 63
415 63
307 293
261 191
424 108
135 299
472 226
419 270
422 298
323 171
385 49
19 214
270 213
159 394
159 181
285 518
270 470
47 401
311 512
87 327
419 470
517 457
297 58
293 175
544 268
185 22
293 171
250 419
282 517
403 17
215 522
417 501
219 534
400 389
538 352
499 123
268 226
472 268
501 462
62 117
385 363
220 419
415 519
54 198
353 371
268 317
177 436
178 214
11 213
293 133
175 17
421 152
148 230
286 471
132 169
207 514
26 499
336 270
443 7
326 385
426 378
406 167
426 457
106 303
179 26
424 403
446 322
326 195
88 443
34 353
324 172
214 91
34 457
459 23

In [383]:
len(Counter(train_data.target))

477

In [374]:
# Average count over the mutually-confused pairs. list(...) keeps this working when
# .values() returns a dict view (Python 3) instead of a list.
np.mean(list(two_confusion.values()))

4.1379310344827589

In [375]:
# For each mutually-confused pair, most frequent first: [index, countery[index + 1]] for
# both members, followed by the confusion count. The +1 shifts the 0-based matrix index to
# the key used in countery — presumably the 1-based VW class label; confirm.
[[[tf[0], countery[tf[0]+1]], [tf[1], countery[tf[1]+1]], [val]] for tf, val in sorted(two_confusion.items(), \
                key=lambda t: t[1], reverse = True)]

[[[226, 944], [317, 942], [193]],
 [[351, 398], [235, 395], [124]],
 [[316, 505], [137, 502], [109]],
 [[329, 431], [248, 428], [103]],
 [[363, 412], [178, 414], [103]],
 [[510, 329], [251, 322], [87]],
 [[266, 316], [140, 317], [78]],
 [[34, 351], [371, 353], [69]],
 [[112, 232], [405, 232], [52]],
 [[245, 272], [539, 269], [49]],
 [[306, 147], [195, 146], [44]],
 [[138, 187], [380, 187], [43]],
 [[548, 170], [154, 171], [42]],
 [[82, 150], [240, 150], [40]],
 [[502, 170], [314, 170], [37]],
 [[72, 115], [496, 115], [33]],
 [[376, 137], [134, 135], [28]],
 [[116, 146], [228, 146], [26]],
 [[461, 186], [17, 377], [24]],
 [[4, 135], [342, 137], [23]],
 [[242, 87], [399, 87], [22]],
 [[359, 113], [17, 377], [22]],
 [[238, 106], [441, 104], [22]],
 [[49, 725], [468, 1586], [21]],
 [[416, 466], [385, 2691], [20]],
 [[445, 81], [375, 80], [20]],
 [[291, 69], [94, 68], [19]],
 [[97, 182], [17, 377], [19]],
 [[353, 235], [378, 153], [18]],
 [[272, 94], [321, 94], [17]],
 [[123, 199], [298, 26

In [406]:
# Decode every confused (true index, predicted index) entry through class_encoder,
# ordered from the most frequent confusion to the least frequent one.
pairs = [
    [class_encoder.inverse_transform(true_idx), class_encoder.inverse_transform(pred_idx)]
    for (true_idx, pred_idx), _hits in sorted(
        confusion.items(), key=lambda entry: entry[1], reverse=True
    )
]

In [407]:
pairs

[['2824', '2336'],
 ['2336', '2824'],
 ['2820', '1807'],
 ['1807', '2820'],
 ['2366', '2971'],
 ['2434', '2874'],
 ['3060', '2004'],
 ['2971', '2366'],
 ['244', '762'],
 ['2874', '2434'],
 ['2004', '3060'],
 ['1812', '2524'],
 ['762', '244'],
 ['1180', '3102'],
 ['2524', '1812'],
 ['1665', '3260'],
 ['3102', '1180'],
 ['2414', '940'],
 ['3260', '1665'],
 ['940', '2414'],
 ['1808', '313'],
 ['974', '1877'],
 ['313', '1808'],
 ['2342', '17'],
 ['239', '1505'],
 ['728', '280'],
 ['1505', '239'],
 ['280', '728'],
 ['2746', '2161'],
 ['1877', '974'],
 ['17', '2342'],
 ['121', '538'],
 ['1413', '692'],
 ['178', '3119'],
 ['2991', '1200'],
 ['3119', '178'],
 ['692', '1413'],
 ['2383', '387'],
 ['1985', '2773'],
 ['2824', '557'],
 ['2642', '2954'],
 ['2681', '1737'],
 ['572', '2773'],
 ['2161', '2746'],
 ['538', '121'],
 ['387', '2383'],
 ['3165', '3328'],
 ['2336', '557'],
 ['197', '2773'],
 ['2943', '267'],
 ['3165', '537'],
 ['3219', '2394'],
 ['537', '3165'],
 ['2842', '2561'],
 ['1273', '

In [363]:
class_encoder.inverse_transform(226), class_encoder.inverse_transform(317)

('2336', '2824')

In [213]:
class_encoder.transform([str(673)]) + 1

array([494])

In [356]:
train_data[train_data.target == 2336]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,site_longest_time,start_hour,day_of_week,daytime,fb_portion,youtube_portion,top30_portion,bot30_portion,prediction,target
33258,22,22,90,69,22,87,77,77,22,22,...,87,13,0,1,0.0,0.0,0.333333,0.000000,0,2336
33259,22,22,87,22,22,22,22,22,1447,22,...,87,13,0,1,0.0,0.0,0.054795,0.000000,0,2336
33260,64,1447,70,22,64,22,64,70,71,22,...,22,13,0,1,0.0,0.0,0.666667,0.000000,0,2336
33261,71,22,63,22,64,63,70,70,63,64,...,63,13,0,1,0.0,0.0,0.333333,0.000000,0,2336
33262,71,70,63,64,70,70,64,71,71,63,...,64,13,0,1,0.0,0.0,0.000000,0.000000,0,2336
33263,71,64,63,63,22,64,64,22,63,70,...,64,13,0,1,0.0,0.0,0.250000,0.000000,0,2336
33264,22,70,63,70,71,64,63,64,63,70,...,70,13,0,1,0.0,0.0,0.500000,0.000000,0,2336
33265,64,70,64,22,70,63,64,64,63,70,...,64,13,0,1,0.0,0.0,0.111111,0.000000,0,2336
33266,63,70,64,71,70,71,22,71,63,71,...,71,13,0,1,0.0,0.0,0.375000,0.000000,0,2336
33267,63,70,71,64,22,63,70,63,70,64,...,63,13,0,1,0.0,0.0,0.333333,0.000000,0,2336


In [355]:
train_data[train_data.target == 2824]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,site_longest_time,start_hour,day_of_week,daytime,fb_portion,youtube_portion,top30_portion,bot30_portion,prediction,target
99472,22,22,22,90,69,87,77,22,22,87,...,87,13,0,1,0.0,0.0,0.360000,0.000000,0,2824
99473,22,87,22,69,77,77,22,22,22,69,...,69,13,0,1,0.0,0.0,1.000000,0.000000,0,2824
99474,22,69,22,64,1447,70,22,71,64,70,...,22,13,0,1,0.0,0.0,1.000000,0.000000,0,2824
99475,71,22,63,64,71,63,70,64,71,63,...,64,13,0,1,0.0,0.0,0.000000,0.000000,0,2824
99476,71,63,71,22,64,63,70,63,70,64,...,63,13,0,1,0.0,0.0,0.500000,0.000000,0,2824
99477,63,70,22,71,63,64,69,70,70,63,...,70,13,0,1,0.0,0.0,0.333333,0.000000,0,2824
99478,70,63,70,64,63,63,22,64,71,71,...,71,13,0,1,0.0,0.0,0.333333,0.000000,0,2824
99479,64,64,22,71,63,70,63,63,70,63,...,64,13,0,1,0.0,0.0,0.166667,0.000000,0,2824
99480,70,64,71,63,63,64,22,71,63,70,...,64,13,0,1,0.0,0.0,0.222222,0.000000,0,2824
99481,63,71,71,64,63,70,64,71,63,70,...,71,13,0,1,0.0,0.0,0.000000,0.000000,0,2824


In [43]:
df= pd.read_csv('kaggle_data/train_sessions.csv')


In [45]:
df[df.user_id == 2631]

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
5491,5492,509,2014-03-29 11:06:24,858.0,2014-03-29 11:06:24,11.0,2014-03-29 11:06:24,65.0,2014-03-29 11:06:24,268.0,...,2014-03-29 11:06:25,82.0,2014-03-29 11:06:25,241.0,2014-03-29 11:06:31,265.0,2014-03-29 11:06:32,241.0,2014-03-29 11:06:32,2631
6571,6572,32,2014-08-02 10:04:39,32.0,2014-08-02 10:04:40,8.0,2014-08-02 10:04:41,8.0,2014-08-02 10:04:53,32.0,...,2014-08-02 10:04:55,3087.0,2014-08-02 10:14:37,521.0,2014-08-02 10:33:42,523.0,2014-08-02 10:33:42,32.0,2014-08-02 10:33:42,2631
8631,8632,65,2014-01-25 10:30:10,260.0,2014-01-25 10:36:59,412.0,2014-01-25 10:36:59,1361.0,2014-01-25 10:41:10,77.0,...,2014-01-25 10:41:10,1321.0,2014-01-25 10:41:11,1378.0,2014-01-25 10:41:11,1104.0,2014-01-25 10:41:12,2891.0,2014-01-25 10:41:12,2631
12124,12125,523,2014-08-02 10:33:43,524.0,2014-08-02 10:33:43,526.0,2014-08-02 10:33:44,32.0,2014-08-02 10:34:05,13832.0,...,2014-08-02 10:35:01,13836.0,2014-08-02 10:35:52,32.0,2014-08-02 10:35:52,13836.0,2014-08-02 10:35:53,342.0,2014-08-02 10:35:53,2631
14795,14796,32,2014-08-02 11:27:57,,,,,,,,...,,,,,,,,,,2631
15614,15615,32,2014-08-02 11:10:39,32.0,2014-08-02 11:10:42,13834.0,2014-08-02 11:10:43,11583.0,2014-08-02 11:10:44,13834.0,...,2014-08-02 11:10:45,11.0,2014-08-02 11:10:45,625.0,2014-08-02 11:10:45,184.0,2014-08-02 11:10:46,625.0,2014-08-02 11:10:46,2631
16642,16643,2889,2014-02-22 11:26:25,67.0,2014-02-22 11:27:57,7.0,2014-02-22 11:27:58,2889.0,2014-02-22 11:27:58,38.0,...,2014-02-22 11:27:58,88.0,2014-02-22 11:30:31,38.0,2014-02-22 11:30:31,7.0,2014-02-22 11:30:31,2889.0,2014-02-22 11:30:31,2631
16952,16953,544,2014-05-13 08:05:30,525.0,2014-05-13 08:05:52,15.0,2014-05-13 08:05:57,479.0,2014-05-13 08:10:54,5.0,...,2014-05-13 08:12:55,525.0,2014-05-13 08:14:13,544.0,2014-05-13 08:14:13,525.0,2014-05-13 08:14:22,14.0,2014-05-13 08:14:54,2631
20707,20708,8,2014-01-25 10:55:50,32.0,2014-01-25 10:57:02,32.0,2014-01-25 11:01:14,8.0,2014-01-25 11:01:15,32.0,...,2014-01-25 11:01:33,65.0,2014-01-25 11:01:34,13828.0,2014-01-25 11:01:34,38.0,2014-01-25 11:01:35,85.0,2014-01-25 11:01:35,2631
21597,21598,13840,2014-01-25 11:01:35,13828.0,2014-01-25 11:01:35,3430.0,2014-01-25 11:01:35,13833.0,2014-01-25 11:01:35,11.0,...,2014-01-25 11:01:36,13828.0,2014-01-25 11:01:36,8.0,2014-01-25 11:01:37,38.0,2014-01-25 11:01:37,55.0,2014-01-25 11:01:37,2631


In [46]:
df[df.user_id == 786]

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
2502,2503,1955,2014-08-02 11:06:25,38.0,2014-08-02 11:06:26,38.0,2014-08-02 11:06:27,55.0,2014-08-02 11:06:27,1955.0,...,2014-08-02 11:06:29,1955.0,2014-08-02 11:06:31,1955.0,2014-08-02 11:06:37,32.0,2014-08-02 11:07:45,1946.0,2014-08-02 11:07:46,786
3384,3385,184,2014-01-17 14:31:24,53.0,2014-01-17 14:31:27,47.0,2014-01-17 14:31:27,17.0,2014-01-17 14:31:27,8.0,...,2014-01-17 14:32:43,2141.0,2014-01-17 14:36:02,32.0,2014-01-17 14:38:34,32.0,2014-01-17 14:38:48,,,786
4684,4685,106,2014-03-29 10:03:15,106.0,2014-03-29 10:03:30,106.0,2014-03-29 10:03:31,32.0,2014-03-29 10:03:31,106.0,...,2014-03-29 10:03:34,13839.0,2014-03-29 10:07:08,13839.0,2014-03-29 10:07:10,3087.0,2014-03-29 10:23:30,,,786
5462,5463,32,2014-05-13 15:04:22,32.0,2014-05-13 15:04:23,32.0,2014-05-13 15:04:26,106.0,2014-05-13 15:04:31,32.0,...,2014-05-13 15:05:00,322.0,2014-05-13 15:05:01,322.0,2014-05-13 15:08:05,305.0,2014-05-13 15:08:08,322.0,2014-05-13 15:08:37,786
5553,5554,65,2014-01-25 11:01:34,13840.0,2014-01-25 11:01:35,13828.0,2014-01-25 11:01:35,3430.0,2014-01-25 11:01:35,88.0,...,2014-01-25 11:01:36,8.0,2014-01-25 11:01:36,55.0,2014-01-25 11:01:37,38.0,2014-01-25 11:01:37,13828.0,2014-01-25 11:01:38,786
7182,7183,32,2014-08-02 10:04:39,32.0,2014-08-02 10:04:40,8.0,2014-08-02 10:04:41,32.0,2014-08-02 10:04:54,32.0,...,2014-08-02 10:14:37,32.0,2014-08-02 10:33:29,32.0,2014-08-02 10:33:42,523.0,2014-08-02 10:33:42,521.0,2014-08-02 10:33:42,786
9003,9004,82,2014-03-29 11:06:25,11.0,2014-03-29 11:06:25,268.0,2014-03-29 11:06:30,268.0,2014-03-29 11:06:31,265.0,...,2014-03-29 11:06:31,265.0,2014-03-29 11:06:32,49.0,2014-03-29 11:06:32,8.0,2014-03-29 11:06:32,11.0,2014-03-29 11:06:33,786
10851,10852,32,2014-08-02 11:10:42,11583.0,2014-08-02 11:10:44,13834.0,2014-08-02 11:10:45,11.0,2014-08-02 11:10:45,625.0,...,2014-08-02 11:10:45,184.0,2014-08-02 11:10:46,11.0,2014-08-02 11:10:46,13834.0,2014-08-02 11:10:46,625.0,2014-08-02 11:10:46,786
13176,13177,322,2014-05-13 15:09:17,322.0,2014-05-13 15:29:13,322.0,2014-05-13 15:29:19,,,,...,,,,,,,,,,786
14755,14756,32,2014-08-02 11:07:46,1980.0,2014-08-02 11:07:47,111.0,2014-08-02 11:07:47,7.0,2014-08-02 11:07:48,55.0,...,2014-08-02 11:07:48,38.0,2014-08-02 11:07:48,88.0,2014-08-02 11:07:48,111.0,2014-08-02 11:07:48,78.0,2014-08-02 11:07:49,786


In [76]:
# Test sessions whose time1..time{session_length} values are shared with at least one other
# session (keep=False flags every member of a duplicate group), shown with their ten
# site and ten time columns.
test_data[test_data.duplicated(subset=['time' + str(c) for c in range(1,session_length+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+['time' + str(c) for c in range(1,10+1)]]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
868,38,0,0,0,0,0,0,0,0,0,2013-11-28 13:36:53,0,0,0,0,0,0,0,0,0
2694,27,0,0,0,0,0,0,0,0,0,2014-02-17 17:01:28,0,0,0,0,0,0,0,0,0
3840,869,0,0,0,0,0,0,0,0,0,2014-03-18 08:18:39,0,0,0,0,0,0,0,0,0
6829,307,0,0,0,0,0,0,0,0,0,2014-08-04 16:15:38,0,0,0,0,0,0,0,0,0
7492,27,9,0,0,0,0,0,0,0,0,2013-11-21 14:08:05,2013-11-21 14:08:05,0,0,0,0,0,0,0,0
7985,869,0,0,0,0,0,0,0,0,0,2014-03-16 18:48:02,0,0,0,0,0,0,0,0,0
8388,58,25,0,0,0,0,0,0,0,0,2013-12-18 10:18:28,2013-12-18 10:18:28,0,0,0,0,0,0,0,0
9065,1945,0,0,0,0,0,0,0,0,0,2014-02-21 15:17:45,0,0,0,0,0,0,0,0,0
10144,9,0,0,0,0,0,0,0,0,0,2014-03-25 08:34:39,0,0,0,0,0,0,0,0,0
10833,27,9,0,0,0,0,0,0,0,0,2013-11-21 14:08:05,2013-11-21 14:08:05,0,0,0,0,0,0,0,0


In [287]:
# Column names of a prepared session row: site1..site10, time1..time10,
# time_diff1..time_diff9, then the aggregate features, 'prediction' and 'target'.
feature_names = (
    ['site%d' % i for i in range(1, 11)]
    + ['time%d' % i for i in range(1, 11)]
    + ['time_diff%d' % j for j in range(1, 10)]
    + [
        'session_timespan', '#unique_sites', 'site_longest_time',
        'start_hour', 'day_of_week', 'daytime',
        'fb_portion', 'youtube_portion', 'top30_portion', 'bot30_portion',
        'prediction', 'target',
    ]
)

In [288]:
# Find duplicates
def delete_dups_max(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    """Rewrite the logs of confused user pairs, dropping row positions that look alike.

    For every ``[user_a, user_b]`` entry of the notebook-level ``pairs`` list, the logs
    ``kaggle_data/train/user<id>.csv`` of both users are read, sorted by timestamp and
    placed side by side, so row i of one user sits next to row i of the other. A row
    position survives only when BOTH the timestamps and the sites differ. The surviving
    (timestamp, site) rows of each user are written to
    ``kaggle_data/new_train/user<id>.csv``.

    A user is written at most once, by the first pair in which it appears. Later pairs
    still read that user's original log for the comparison but do not overwrite the
    output.

    None of the parameters influences the result. They are kept so existing calls keep
    working. The earlier ``glob(csv_files_mask)`` result and the user id parsed from the
    file name were never used and have been removed.

    NOTE(review): the comparison is positional, so it is only meaningful when the two
    logs are nearly identical row for row — confirm this is the intended criterion.

    Returns:
        None. The function's effect is the CSV files it writes.
    """
    written = set()

    for pair in pairs:
        if pair[0] in written and pair[1] in written:
            continue

        # Load both logs with position-suffixed column names (timestamp1/site1, ...).
        frames = []
        for position, user in enumerate(pair[:2], start=1):
            suffix = str(position)
            data = pd.read_csv('kaggle_data/train/user' + user + '.csv',
                               parse_dates=[0], infer_datetime_format=True,
                               names=["timestamp" + suffix, "site" + suffix], header=0)
            data = data.sort_values("timestamp" + suffix).reset_index(drop=True)
            frames.append(data)
        pair_data = pd.concat(frames, axis=1)

        # Rows beyond the end of the shorter log hold NaN/NaT on that side. They compare
        # as "different", stay in the frame, and are removed per user by dropna() below.
        differs = (pair_data.timestamp1 != pair_data.timestamp2) & \
                  (pair_data.site1 != pair_data.site2)
        pair_data = pair_data[differs]

        for position, user in enumerate(pair[:2], start=1):
            if user in written:
                continue
            suffix = str(position)
            user_rows = pair_data[["timestamp" + suffix, "site" + suffix]].dropna()
            user_rows.to_csv('kaggle_data/new_train/user' + user + '.csv',
                             index=False, header=["timestamp", "site"])
            written.add(user)
    return None

In [289]:
delete_dups_max('kaggle_data/train/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

In [218]:
# Find duplicates
def delete_dups_medium(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    """Rewrite the logs of confused user pairs, comparing site, ISO weekday and hour.

    For every ``[user_a, user_b]`` entry of the notebook-level ``pairs`` list, the logs
    ``kaggle_data/train/user<id>.csv`` of both users are read, sorted by timestamp and
    placed side by side, so row i of one user sits next to row i of the other. A row
    position survives only when the weekday, the site AND the hour all differ between
    the two users. The surviving (timestamp, site) rows of each user are written to
    ``kaggle_data/new_train/user<id>.csv``.

    A user is written at most once, by the first pair in which it appears.

    None of the parameters influences the result. They are kept so existing calls keep
    working. The earlier ``glob(csv_files_mask)`` result and the user id parsed from the
    file name were never used and have been removed.

    Returns:
        None. The function's effect is the CSV files it writes.
    """
    written = set()

    for pair in pairs:
        if pair[0] in written and pair[1] in written:
            continue

        # Load both logs with position-suffixed columns and derive weekday/hour per row.
        frames = []
        for position, user in enumerate(pair[:2], start=1):
            suffix = str(position)
            ts_col = "timestamp" + suffix
            data = pd.read_csv('kaggle_data/train/user' + user + '.csv',
                               parse_dates=[0], infer_datetime_format=True,
                               names=[ts_col, "site" + suffix], header=0)
            data = data.sort_values(ts_col).reset_index(drop=True)
            data["weekday" + suffix] = data[ts_col].apply(lambda ts: int(ts.isoweekday()))
            data["hour" + suffix] = data[ts_col].apply(lambda ts: int(ts.hour))
            frames.append(data)
        pair_data = pd.concat(frames, axis=1)

        # Rows beyond the end of the shorter log hold NaN on that side. They compare as
        # "different", stay in the frame, and are removed per user by dropna() below.
        differs = (pair_data.weekday1 != pair_data.weekday2) & \
                  (pair_data.site1 != pair_data.site2) & \
                  (pair_data.hour1 != pair_data.hour2)
        pair_data = pair_data[differs]

        for position, user in enumerate(pair[:2], start=1):
            if user in written:
                continue
            suffix = str(position)
            user_rows = pair_data[["timestamp" + suffix, "site" + suffix]].dropna()
            user_rows.to_csv('kaggle_data/new_train/user' + user + '.csv',
                             index=False, header=["timestamp", "site"])
            written.add(user)
    return None

In [217]:
delete_dups_medium('kaggle_data/train/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

In [416]:
# Find duplicates
def delete_dups_minimal2(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    """Rewrite the logs of confused user pairs without rows that share (site, timestamp).

    For every ``[user_a, user_b]`` entry of the notebook-level ``pairs`` list, the logs
    ``kaggle_data/train/user<id>.csv`` of both users are stacked. Every row whose
    (site, timestamp) combination occurs more than once in the stacked frame is removed.
    This covers rows shared between the two users and also repeats inside one user's own
    log. The remaining (timestamp, site) rows of each user are written to
    ``kaggle_data/new_train/user<id>.csv``.

    A user is written at most once, by the first pair in which it appears.

    Fixes relative to the previous version:
      * the stacked frame is rebuilt for every pair. Before, it kept growing across
        pairs, so a user that appears in several pairs was compared against its own
        earlier copy, and all previously loaded users took part in every comparison.
      * duplicates are removed with a boolean mask. Before, rows were dropped by index
        label while both users shared the labels 0..n-1, so dropping a label removed
        the row at that position for BOTH users, duplicate or not.

    None of the parameters influences the result. They are kept so existing calls keep
    working.

    Returns:
        None. The function's effect is the CSV files it writes.
    """
    written = set()

    for pair in pairs:
        if pair[0] in written and pair[1] in written:
            continue

        frames = []
        for user in pair[:2]:
            data = pd.read_csv('kaggle_data/train/user' + user + '.csv',
                               parse_dates=[0], infer_datetime_format=True)
            data = data.sort_values("timestamp").reset_index(drop=True)
            data["target"] = int(user)
            frames.append(data)
        # ignore_index gives every stacked row its own label.
        pair_data = pd.concat(frames, axis=0, ignore_index=True)

        # keep=False flags every member of a duplicate group, not only the repeats.
        shared = pair_data.duplicated(subset=["site", "timestamp"], keep=False)
        pair_data = pair_data[~shared]

        for user in pair[:2]:
            if user in written:
                continue
            user_rows = pair_data.loc[pair_data.target == int(user), ["timestamp", "site"]]
            user_rows.to_csv('kaggle_data/new_train/user' + user + '.csv', index=False)
            written.add(user)
    return None

In [417]:
delete_dups_minimal2('kaggle_data/train/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

In [None]:
# Find duplicates
def delete_dups_minimal(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    """Debug variant: de-duplicate ONE confused pair from the toy data set, with tracing.

    The first entry of the notebook-level ``pairs`` list is processed and the loop then
    stops (``break``). The logs ``kaggle_data/train_toy/user<id>.csv`` of both users are
    stacked, and every row whose (site, ISO weekday, hour) combination occurs more than
    once in the stacked frame is removed. The remaining (timestamp, site) rows of each
    user are written to ``kaggle_data/new_train/user<id>.csv``. Progress is printed along
    the way.

    Fix relative to the previous version: duplicates are removed with a boolean mask on
    a frame with unique row labels. Before, rows were dropped by index label while both
    users shared the labels 0..n-1, so dropping a label removed the row at that position
    for BOTH users, duplicate or not.

    None of the parameters influences the result. They are kept so existing calls keep
    working.

    Returns:
        None. The function's effect is the CSV files it writes and the printed trace.
    """
    # A dict rather than a set, so the printed trace keeps its original form.
    used = {}

    for pair in pairs:
        print(pair)

        if pair[0] in used and pair[1] in used:
            continue

        frames = []
        for user in pair[:2]:
            data = pd.read_csv('kaggle_data/train_toy/user' + user + '.csv',
                               parse_dates=[0], infer_datetime_format=True)
            data = data.sort_values("timestamp").reset_index(drop=True)
            data["weekday"] = data.timestamp.apply(lambda ts: int(ts.isoweekday()))
            data["hour"] = data.timestamp.apply(lambda ts: int(ts.hour))
            data["target"] = int(user)
            print("%s %s" % (user, len(data)))
            frames.append(data)
        # ignore_index gives every stacked row its own label.
        pair_data = pd.concat(frames, axis=0, ignore_index=True)

        # keep=False flags every member of a duplicate group, not only the repeats.
        shared = pair_data.duplicated(subset=["site", "weekday", "hour"], keep=False)
        pair_data = pair_data[~shared]
        print(len(pair_data))

        data1 = pair_data.loc[pair_data.target == int(pair[0]), ["timestamp", "site"]]
        data2 = pair_data.loc[pair_data.target == int(pair[1]), ["timestamp", "site"]]

        if pair[0] not in used:
            print('im here')
            data1.to_csv('kaggle_data/new_train/user' + pair[0] + '.csv', index=False)
            used[pair[0]] = 0
        if pair[1] not in used:
            data2.to_csv('kaggle_data/new_train/user' + pair[1] + '.csv', index=False)
            used[pair[1]] = 0

        print(used)

        # Deliberate: this toy/debug version stops after the first processed pair.
        break
    return None

In [190]:
delete_dups_minimal('kaggle_data/train_toy/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

['2631', '786']
2631 357
786 357
4
im here
{'786': 0, '2631': 0}


In [151]:
dataf[(dataf.timestamp1 != dataf.timestamp2) & (dataf.site1 != dataf.site2)]

Unnamed: 0,timestamp1,site1,timestamp2,site2
9,2014-01-25 10:29:31,fpdownload2.macromedia.com,2014-01-25 10:29:38,www.iegallery.com
10,2014-01-25 10:29:33,go.microsoft.com,2014-01-25 10:29:39,www.iegallery.com
12,2014-01-25 10:29:43,www.iegallery.com,2014-01-25 10:29:47,www.google.fr
13,2014-01-25 10:29:47,www.google.fr,2014-01-25 10:29:48,www.google.com
15,2014-01-25 10:29:48,www.google.fr,2014-01-25 10:30:05,www.google.com
17,2014-01-25 10:30:05,www.google.com,2014-01-25 10:30:10,www.google.fr
19,2014-01-25 10:30:10,ajax.googleapis.com,2014-01-25 10:36:59,office14client.microsoft.com
21,2014-01-25 10:36:59,office14client.microsoft.com,2014-01-25 10:41:10,rr.office.microsoft.com
22,2014-01-25 10:41:10,api.bing.com,2014-01-25 10:41:11,integrate.factiva.com
23,2014-01-25 10:41:10,rr.office.microsoft.com,2014-01-25 10:41:12,www.microsofttranslator.com


In [142]:
dataf

Unnamed: 0,timestamp1,site1,timestamp2,site2
0,2014-01-17 14:31:24,fpdownload2.macromedia.com,2014-01-17 14:31:24,fpdownload2.macromedia.com
1,2014-01-17 14:31:27,js.microsoft.com,2014-01-17 14:31:27,windows.microsoft.com
2,2014-01-17 14:31:27,go.microsoft.com,2014-01-17 14:31:27,res2.windows.microsoft.com
3,2014-01-17 14:31:27,ajax.microsoft.com,2014-01-17 14:31:27,js.microsoft.com
4,2014-01-17 14:31:31,www.google.com,2014-01-17 14:31:31,www.google.com
5,2014-01-17 14:32:43,ieonline.microsoft.com,2014-01-17 14:32:43,ieonline.microsoft.com
6,2014-01-17 14:36:02,dl.javafx.com,2014-01-17 14:36:02,dl.javafx.com
7,2014-01-17 14:38:34,www.google.fr,2014-01-17 14:38:34,www.google.fr
8,2014-01-17 14:38:48,www.google.fr,2014-01-17 14:38:48,www.google.fr
9,2014-01-25 10:29:31,fpdownload2.macromedia.com,2014-01-25 10:29:38,www.iegallery.com


# Submission

In [353]:
# Shuffle the full training file in place (input and -o output are the same path).
# NOTE(review): no --random-source is given, so the example order — and any model trained
# on it — differs from run to run; the original order is lost after this cell.
!shuf {folder}train{handler}.vw -o {folder}train{handler}.vw

In [355]:
# Quick
# Fast one-against-all run (550 classes) on the FULL training file: 28 weight bits,
# 5 passes, namespaces s/b/c only, cubic s*b*c features.
!vw --oaa=550 -d {folder}train{handler}.vw \
-f {folder}initial_model{handler}.model -b 28 -c -k \
--passes=5 -l 0.45 --decay_learning_rate=0.9 \
--cubic "sbc" \
--keep "s" --keep "b" --keep "c"

# Score the validation file with the saved model (-t: test only).
# NOTE(review): the model above was fitted on train{handler}.vw, not train_part. If the
# validation sessions are part of that file, the accuracy below is optimistic — confirm.
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

# Load the predicted labels (no header row) and measure accuracy against y_valid.
vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

creating cubic features for triples: sbc 
using namespaces beginning with: s b c 
final_regressor = kaggle_data/initial_model_idf_w8_seq.model
Num weight bits = 28
learning rate = 0.45
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_idf_w8_seq.vw.cache
Reading datafile = kaggle_data/train_idf_w8_seq.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      175        1       11
1.000000 1.000000            2            2.0      530      175       61
1.000000 1.000000            4            4.0       53      175       11
1.000000 1.000000            8            8.0      545       53       21
1.000000 1.000000           16           16.0      390      545       21
0.968750 0.937500           32           32.0      349      545       21
0.953125 0.937500           64           64.0     

In [410]:
%%time
# Tuned one-against-all run on the FULL training file: same options as the tuned
# train_part run (28 weight bits, 30 passes, logistic loss, tiny L1/L2, cubic s*b*c,
# namespaces s/b/c/l/a, stage_poly), with the batch size taken from train_file.
# {len(train_file)/6} relies on Python 2 integer division.
!vw --oaa=550 -d {folder}train{handler}.vw \
-f {folder}initial_model{handler}.model -b 28 -c -k \
--passes=30 --decay_learning_rate 0.9 --initial_t 0.002337045080352835 \
-l 0.5416950450219994 \
--power_t 0.5 --loss_function='logistic' --l1 1e-11 --l2 1e-11 \
--cubic="sbc"  \
--keep "s" --keep "b" --keep "c" --keep "l" --keep "a" \
--stage_poly --batch_sz {len(train_file)/6} --batch_sz_no_doubling

creating cubic features for triples: sbc 
using namespaces beginning with: s b c l a 
using l1 regularization = 1e-11
using l2 regularization = 1e-11
final_regressor = kaggle_data/initial_model_idf_w8_pred.model
Num weight bits = 28
learning rate = 0.541695
initial_t = 0.00233705
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_idf_w8_pred.vw.cache
Reading datafile = kaggle_data/train_idf_w8_pred.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            2            2.0       50      282       33
1.000000 1.000000            4            4.0      367      282       12
1.000000 1.000000            9            9.0      531      282       16
1.000000 1.000000           19           18.9      326      367       18
1.000000 1.000000           39           38.8      386      531       12
1.000000 1.000000           78         

In [411]:
# Prediction on VALID: score valid{handler}.vw with the saved model (-t: test only).
# NOTE(review): the preceding training cells fit on train{handler}.vw (the full file). If
# the validation sessions are contained in it, this accuracy is not an out-of-sample
# estimate — compare with the score obtained after training on train_part only.
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

# Load the predicted labels (no header row) and measure accuracy against y_valid.
vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

Accuracy: 0.801177080474


In [356]:
# Prediction on TEST: score test{handler}.vw with the saved model (-t: test only) and
# write the predictions to {folder}vw_test_pred{handler}.csv.
!vw -i {folder}initial_model{handler}.model  -t -d {folder}test{handler}.vw \
-p {folder}vw_test_pred{handler}.csv

# Load the predicted labels (no header row), shift them by -1 and decode them through
# class_encoder into the values written to the submission.
vw_test_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
t_submission = pd.DataFrame(vw_test_pred.astype(int)-1)
vw_subm = class_encoder.inverse_transform(t_submission)

creating cubic features for triples: sbc 
only testing
predictions = kaggle_data/vw_test_pred_idf_w8_seq.csv
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = kaggle_data/test_idf_w8_seq.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0  unknown      328        5
1.000000 1.000000            2            2.0  unknown      517       19
1.000000 1.000000            4            4.0  unknown      168        9
1.000000 1.000000            8            8.0  unknown        1      495
1.000000 1.000000           16           16.0  unknown      328       19
1.000000 1.000000           32           32.0  unknown      460        9
1.000000 1.000000           64           64.0  unknown      150       25
1.000000 1.000000          128          128.0  unknown      167       75
1.000000 1.

In [357]:
# Inspect the labels returned by class_encoder.inverse_transform for the test predictions.
vw_subm

array([['2861'],
       ['783'],
       ['2336'],
       ..., 
       ['1949'],
       ['839'],
       ['1845']], dtype=object)

In [358]:
write_to_submission_file(vw_subm,
             'kaggle_data/46vw_submission_idf_w8_seq.csv')
print "Finished creating submission.\n"

Finished creating submission.



In [29]:
%%time
#9
# Step 1 -- train a VW one-against-all model (--oaa=550; presumably 550 target
# classes -- confirm against class_encoder) on the training file and save it via -f.
# As echoed in VW's log for this run: -b 28 sets the number of weight bits,
# -c creates a cache file, -l / --decay_learning_rate set the learning rate and
# its decay, --l1 / --l2 set the regularisation strengths, and -q / --cubic add
# quadratic (sd, sb) and cubic (sbc) namespace interactions.
# -k, --passes, --holdout_period and --early_terminate are not echoed in the log;
# see the VW command-line reference for their exact semantics.
!vw --oaa=550 -d {folder}train{handler}.vw \
-f {folder}initial_model{handler}.model -b 28 -c -k \
--passes=5 -l 0.45 --decay_learning_rate=0.9 --l1=4e-8 --l2=4e-8 \
-q "sd" -q "sb" --cubic="sbc" --holdout_period 5 --early_terminate 2
# Disabled flags: --keep would restrict training to the listed namespaces.
#--keep "s" --keep "b" --keep "c" --keep "d" --keep "a"

# Step 2 -- load the saved model (-i), run test-only (-t) over the test file and
# write the predictions to CSV (-p); --quiet suppresses VW's progress output.
!vw -i {folder}initial_model{handler}.model  -t -d {folder}test{handler}.vw \
-p {folder}vw_test_pred{handler}.csv --quiet

# Step 3 -- read the predictions back as a single unnamed column, shift them down
# by one (presumably 1-based VW labels -> 0-based encoder indices; TODO confirm)
# and map them back through class_encoder.
vw_test_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
t_submission = pd.DataFrame(vw_test_pred.astype(int)-1)
vw_subm = class_encoder.inverse_transform(t_submission)


creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using l1 regularization = 4e-08
using l2 regularization = 4e-08
final_regressor = kaggle_data/initial_model_idf_w8_bal.model
Num weight bits = 28
learning rate = 0.45
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_idf_w8_bal.vw.cache
Reading datafile = kaggle_data/train_idf_w8_bal.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      330        1       33
1.000000 1.000000            2            2.0      180      330       12
1.000000 1.000000            4            4.0      349       54       23
1.000000 1.000000            8            8.0      198      330       25
1.000000 1.000000           16           16.0      548      433       33
1.000000 1.000000           32           32.0      51

In [58]:
# Reload the test predictions that the `vw -p` run wrote to CSV (single unnamed column).
vw_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
# Cast to int before shifting and decoding, as the other decoding cells in this
# notebook do: the values are used as indices by class_encoder, so a column that
# pandas parsed as float must not reach inverse_transform unchanged.
# The -1 presumably maps 1-based VW labels to 0-based encoder indices (TODO confirm).
vw_subm = class_encoder.inverse_transform(vw_pred.astype(int) - 1)

In [67]:
write_to_submission_file(vw_subm,
             'kaggle_data/31vw_submission_exp.csv')
print "Finished creating submission.\n"

Finished creating submission.



In [65]:
# Re-cast every label through float -> int -> str and store the result as an
# object array (presumably to normalise float-formatted labels; TODO confirm).
vw_subm = np.copy(
    vw_subm
    .astype(float)
    .astype(int)
    .astype(str)
    .astype(object)
)

In [66]:
# Inspect vw_subm after the float -> int -> str -> object re-cast.
vw_subm

array([['2885'],
       ['783'],
       ['2294'],
       ..., 
       ['3027'],
       ['3118'],
       ['2708']], dtype=object)