In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

import re
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, make_scorer
from vowpalwabbit.sklearn_vw import VWClassifier, VW
import itertools
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse import csr_matrix, hstack
import imblearn
from glob import glob

In [3]:
def sparsematrix(X):
    row = []
    col = []
    data = []
    for r in range(X.shape[0]):
        row_counter = Counter(X[r])
        for site, num in row_counter.items():
            row.append(r)
            col.append(site)
            data.append(num)
    print "Sparse Matrix - rows:", X.shape[0], "columns:", len(set(col))
    return csr_matrix((data, (row, col)), shape=(X.shape[0], len(set(col))))[:,1:]


def sites_to_sparse_tfidf(train_data, test_data, target_col, session_length, label_encoder=False):
    train_test_df = pd.concat([train_data, test_data])
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)
    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]].index)
    train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
    test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

    y = train_data[target_col]

    train_test_df_sites = train_test_df[['site' + str(c) for c in range(1,10+1)]].fillna(0).astype('int')
    train_test_df_sites_array = [" ".join(["s_"+str(s) for s in train_test_df_sites.as_matrix()[i] if int(s) != 0]) \
                                                                  for i in range(train_test_df_sites.shape[0])]

    tfidf = TfidfVectorizer(max_df=0.9).fit(train_test_df_sites_array) #TfidfVectorizer()
    X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

    X_train_sparse = X_train_test_sparse[:len(train_data)]
    X_test_sparse = X_train_test_sparse[len(train_data):]
    
    sites_columns_num = X_train_test_sparse.shape[1]
    
    y_for_vw = None
    class_encoder = None
    if label_encoder:
        class_encoder = LabelEncoder().fit(y.astype('str'))
        y_for_vw = class_encoder.transform(y.astype('str')) + 1
    
    return [X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num, class_encoder, tfidf, \
             train_duplicates_mask, test_duplicates_mask]


def features_to_sparse(train_data, test_data, feature_cols):
    features_matrix = []
    for df in [train_data, test_data]:
        num_cols = 0
        data = []
        rows = []
        cols = []
        for label in feature_cols:
            if label in ["day_of_week", "daytime"]:
                coldata = list(df[[label]].values.T[0].astype('float') + 1)
            else:
                coldata = list(df[[label]].values.T[0].astype('float'))
            if len(data):
                data += coldata
            else:
                data = list(coldata)
            if len(cols):
                cols += [num_cols] * len(coldata)
            else:
                cols = [num_cols] * len(coldata)
            num_cols += 1
        rows = [r for r in range(df.shape[0])] * num_cols
        features = csr_matrix((data, (rows, cols)), shape=(df.shape[0], num_cols), dtype=float)
        features_matrix.append(features)
    return features_matrix


def calc_site_times_portions(train_data, test_data):
    site_times = [{},{}]
    count = 0
    for data in [train_data, test_data]:
        for r, row in data[:][range(0, 10)+range(20,30)].iterrows():
            rowdic = {}
            for c, s in [[c, 'site' + str(c)] for c in range(1,10)]:
                if row[s] == 0:
                    continue
                if row[s] in rowdic:
                    rowdic[int(row[s])] += row["time_diff"+str(c)]
                else:
                    rowdic[int(row[s])] = row["time_diff"+str(c)]
            site_times[count][r] = {}
            for site, time in rowdic.items():
                if len(rowdic) == 1:
                    site_times[count][r][int(site)] = 1.0
                    continue
                if time > 0:
                    site_times[count][r][int(site)] = round(float(time)/row["session_timespan"],3)
        count+=1
    return site_times

def site_times_to_sparse(sitetimes):
    row = []
    col = []
    data = []
    rowcount = 0
    for sitetime in sitetimes:
        for r, sites in sitetime.items():
            for site, p in sites.items():
                col.append(site)
                row.append(rowcount)
                data.append(p)
            rowcount+=1
    site_times_sparse = csr_matrix((data, (row, col)), shape=(len(sitetimes[0])+len(sitetimes[1]), max(col)+1), \
                                                                                              dtype=float)[:,1:]
    return site_times_sparse



def combine_sites_features_sparse(sites_train_sparse, features_train_sparse, \
                                  sites_test_sparse, features_test_sparse, \
                                  train_duplicates_mask=None, test_duplicates_mask=None, \
                                  train_site_times_sparse = None, test_site_times_sparse = None, \
                                train_sites_sequence=None, test_sites_sequence=None):
    if train_site_times_sparse is not None and test_site_times_sparse is not None:
        X_train_sparse = hstack([sites_train_sparse, features_train_sparse, \
                                 train_site_times_sparse, train_sites_sequence], dtype=float).tocsr()
        X_test_sparse = hstack([sites_test_sparse, features_test_sparse, \
                                test_site_times_sparse, test_sites_sequence], dtype=float).tocsr()
    else:
        X_train_sparse = hstack([sites_train_sparse, features_train_sparse], dtype=float).tocsr()
        X_test_sparse = hstack([sites_test_sparse, features_test_sparse], dtype=float).tocsr()
        
    #X_train_sparse = hstack([X_train_sparse, train_duplicates_mask], dtype=float).tocsr()
    #X_test_sparse = hstack([X_test_sparse, test_duplicates_mask], dtype=float).tocsr() 
    return [X_train_sparse, X_test_sparse]


def sparse_matrix_to_vw(X_sparse, sites_columns_num, vocabulary, y=None, weights=None, mark_duplicates=False, mycolumns=[]):
    sessions = {}
    used = {}
    prediction = {}
    day_of_week = {}
    start_hour = {}
    daytime = {}
    unique_sites = {}
    top30_portion = {}
    fb_portion = {}
    youtube_portion = {}
    bot30_portion = {}
    site_longest_time = {}
    session_timespan = {}
    sitetimes = {}
    sequence = {}
    
    lables = {}
    lable_weights = {}
    
    #X_sparse = X_sparse_full[:,:-1]
    
    add_features = True

    for r, c in zip(X_sparse.nonzero()[0], X_sparse.nonzero()[1]):
        if tuple([r,c]) not in used:
            used[tuple([r, c])] = 1
            if add_features:
                if c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("prediction"):
                    prediction[r] = " |aprediction {}:{}".format(int(X_sparse[r,c]), 100)
                    #prediction[r] = " |prediction:100 {}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("day_of_week"):
                    day_of_week[r] = " |bday_of_week {}".format(int(X_sparse[r,c]))
                    #day_of_week[r] = " day_of_week:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("start_hour"):
                    start_hour[r] = " |chour_start {}".format(int(X_sparse[r,c]))
                    #start_hour[r] = " start_hour:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("daytime"):
                    daytime[r] = " |dtime_of_day {}".format(int(X_sparse[r,c]))
                    #daytime[r] = " daytime:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("session_timespan"):
                    session_timespan[r] = " |jsession_timespan time:{}".format(int(X_sparse[r,c]))
                    #session_timespan[r] = " session_timespan:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("#unique_sites"):
                    unique_sites[r] = " unique_sites:{}".format(int(X_sparse[r,c]))
                    #unique_sites[r] = " unique_sites:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("site_longest_time"):
                    site_longest_time[r] = " |hsite_longest_time {}:{}".format(int(X_sparse[r,c]), 3)
                    #site_longest_time[r] = " site_longest_time:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("top30_portion"):
                    top30_portion[r] = " top30:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("bot30_portion"):
                    bot30_portion[r] = " bot30:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("fb_portion"):
                    fb_portion[r] = " facebook:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("youtube_portion"):
                    youtube_portion[r] = " youtube:{}".format(X_sparse[r,c])
                    continue
                    
            if c < X_sparse.shape[1] - len(mycolumns): #sites_columns_num: #
                if r in sessions:
                    sessions[r] += " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                else:
                    if y is not None:
                        sessions[r] = ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                        lables[r] = str(y[r])
                        if weights is not None:
                            lable_weights[r] = str(weights[y[r]-1])
                    else:
                        sessions[r] = ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
            #elif c > X_sparse.shape[1] - sites_columns_num and c < X_sparse.shape[1] - 10:
                #if r in sitetimes:
                    #sitetimes[r] += " {}:{}".format(int(c - sites_columns_num - len(mycolumns)+1), float(X_sparse[r,c]))
                #else:
                    #sitetimes[r] = ' |isitetime' + " {}:{}".format(int(c - sites_columns_num - len(mycolumns)+1), float(X_sparse[r,c]))
        
    
    return {"sites": sessions, "lables": lables, "lable_weights": lable_weights, "prediction": prediction, "day_of_week": day_of_week, \
                      "start_hour": start_hour, "daytime": daytime, \
                     "unique_site": unique_sites, "top30_portion": top30_portion, \
                    "bot30_portion": bot30_portion, "fb_portion": fb_portion, \
                    "youtube_portion": youtube_portion, "site_longest_time": site_longest_time, \
                    "session_timespan": session_timespan, "sitetimes": sitetimes, "sequence": sequence}



def vw_to_file(sites, out_file, features={}, lables={}, lable_weights={},  quiet=True):   
    vw_writer = open(out_file, 'w')
    final_vw = {}
    gen_features = []
    
    if not quiet:
        print "Features:", features.keys()
        
    for r in sorted(sites.keys()):
        if r in lables:
            final_vw[r] = lables[r]
        else:
            final_vw[r] = ""
        if r in lable_weights:
            final_vw[r] += " {}".format(lable_weights[r])
        final_vw[r] += sites[r] #+ " |features"
        for fname, feature in features.items():
            if fname in ["youtube_portion", "fb_portion", "top30_portion", "bot30_portion", \
                                         "unique_sites"] and r in feature:
                gen_features.append(feature[r])
                continue
            if r in feature:
                final_vw[r] += feature[r]        
            
        if len(gen_features):
            final_vw[r] += " |features"
            for gf in gen_features:
                final_vw[r] += gf
        gen_features = []
        
        #if "prediction" in features and r in features["prediction"]:
            #final_vw[r] += features["prediction"][r]
        
        vw_writer.write(final_vw[r] + "\n")
        
    vw_writer.close()
    
    
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)



In [4]:
def calc_predictions(train_data, test_data):
    test_row_users = {}
    train_row_users = {}
    
    # Add predictions from the dataframe (based on uniquely visited site)
    for r, v in test_data[["prediction"]].iterrows():
        if int(v) != 0:
            test_row_users[r] = [int(v)]
    
    
    #Identify sessions with identical sites sequence
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)

    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]].index)
    
    train_user_dup_rows_dict = {}
    train_dup_row_users_dict = {}

    test_dup_rows_dict = {}
    

    sites_cols = ['site' + str(c) for c in range(1,10+1)]
    
    for r, row in train_data.iloc[train_index_dup][sites_cols+["target"]].iterrows():
        if row["target"] in train_user_dup_rows_dict:
            if tuple(row[sites_cols]) in train_user_dup_rows_dict[row["target"]]:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] += 1
            else:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] = 1 
        else:
            train_user_dup_rows_dict[row["target"]] = {tuple(row[sites_cols]): 1}

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            train_dup_row_users_dict[tuple(row[sites_cols])].add(row["target"])
        else:
            train_dup_row_users_dict[tuple(row[sites_cols])] = set([row["target"]])

    for r, row in test_data.iloc[test_index_dup][sites_cols].iterrows():  
        if tuple(row[sites_cols]) in test_dup_rows_dict:
            test_dup_rows_dict[tuple(row[sites_cols])] += 1
        else:
            test_dup_rows_dict[tuple(row[sites_cols])] = 1

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            if r in test_row_users:
                pass #don't overwright predictions from the dataframe
                #test_row_users[r] += train_dup_row_users_dict[tuple(row[sites_cols])]
            else:
                test_row_users[r] = train_dup_row_users_dict[tuple(row[sites_cols])]
        
    # Find users who visited 2 websites
    site_pairs = {}
    for r, row in train_data[sites_cols+["target"]].iterrows():
        unique_sites = Counter(row).keys()
        if 0 in unique_sites:
            del unique_sites[unique_sites.index(0)]
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        if len(unique_sites) > 3:
            for subset in itertools.permutations(Counter(row).keys(), 4):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
    
    
    # Add predictions to test data based on 2 visited websites
    for r, row in test_data[sites_cols].iterrows():
        unique_sites = Counter(row).keys()
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if subset in site_pairs:
                    if r in test_row_users:
                        pass
                    else:
                        test_row_users[r] = list(site_pairs[subset])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if subset in site_pairs:
                    if r in test_row_users:
                        pass
                    else:
                        test_row_users[r] = list(site_pairs[subset])
        if len(unique_sites) > 3:
            for subset in itertools.permutations(Counter(row).keys(), 4):
                if subset in site_pairs:
                    if r in test_row_users:
                        pass
                    else:
                        test_row_users[r] = list(site_pairs[subset])
        
    
    
    return test_row_users, site_pairs

In [5]:
def text_classifier(vectorizer, transformer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
            ("transformer", transformer),
            ("classifier", classifier)]
        )


# Let's Start

In [119]:
%%time
#1
#train_data = pd.read_csv('kaggle_data/full_train_w8.csv')
train_data = pd.read_csv("kaggle_data/full_train_wvar.csv")
test_data = pd.read_csv('kaggle_data/full_test.csv')

CPU times: user 4.99 s, sys: 976 ms, total: 5.96 s
Wall time: 11 s


In [100]:
from imblearn.under_sampling import     CondensedNearestNeighbour 

In [120]:
sorted(Counter(train_data.target).items(), key = lambda t: t[1])

[(344, 502),
 (304, 924),
 (2653, 928),
 (3113, 984),
 (3370, 987),
 (383, 989),
 (860, 1022),
 (1737, 1030),
 (1536, 1066),
 (2336, 1081),
 (2824, 1081),
 (823, 1086),
 (3253, 1097),
 (2401, 1098),
 (231, 1116),
 (2159, 1192),
 (2991, 1226),
 (106, 1239),
 (902, 1248),
 (2953, 1278),
 (1985, 1278),
 (1817, 1280),
 (2681, 1318),
 (1076, 1327),
 (625, 1344),
 (791, 1351),
 (2835, 1389),
 (400, 1391),
 (285, 1393),
 (1373, 1412),
 (3320, 1428),
 (1845, 1443),
 (471, 1444),
 (2150, 1458),
 (2366, 1460),
 (2971, 1471),
 (1626, 1472),
 (2775, 1497),
 (197, 1502),
 (2377, 1503),
 (3324, 1508),
 (475, 1511),
 (1826, 1512),
 (2258, 1514),
 (336, 1515),
 (3128, 1520),
 (2317, 1526),
 (1573, 1528),
 (704, 1532),
 (2790, 1548),
 (2549, 1551),
 (378, 1555),
 (2003, 1555),
 (1156, 1556),
 (3333, 1556),
 (2773, 1560),
 (1830, 1570),
 (754, 1581),
 (3342, 1583),
 (2434, 1585),
 (1113, 1585),
 (286, 1587),
 (1657, 1587),
 (2874, 1589),
 (1769, 1590),
 (2515, 1593),
 (1103, 1598),
 (1614, 1614),
 (2247

In [273]:
%%time
#2
#train_site_sequence = csr_matrix(train_data[['site' + str(c) for c in range(1,10+1)]].as_matrix(), dtype=int)
#test_site_sequence = csr_matrix(test_data[['site' + str(c) for c in range(1,10+1)]].as_matrix(), dtype=int)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 19.1 µs


In [274]:
%%time
#3
# Additionally, let's calculate the percentage of session time spent by every site in session
#site_times = calc_site_times_portions(train_data, test_data)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16 µs


In [275]:
%%time
#4
# Convert site times to sparse format
#site_times_sparse = site_times_to_sparse(site_times)
#train_site_times_sparse = site_times_sparse[:len(train_data)]
#test_site_times_sparse = site_times_sparse[len(train_data):]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [121]:
%%time
#5
train_test_df = pd.concat([train_data, test_data])

session_length = 10
#train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                       #[['site' + str(c) for c in range(1,10+1)]+["target"]].index)

#test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                       #[['site' + str(c) for c in range(1,10+1)]].index)
#train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
#test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

y = train_data["target"]

train_test_df_sites = train_test_df[['site' + str(c) for c in range(1,10+1)]].fillna(0).astype('int')
train_test_df_sites_array = [" ".join(["s_"+str(s) for s in train_test_df_sites.as_matrix()[i] if int(s) != 0]) \
                                                              for i in range(train_test_df_sites.shape[0])]

tfidf = TfidfVectorizer(analyzer=str.split, max_df=1.0, ngram_range=(1,3)).fit(train_test_df_sites_array) #TfidfVectorizer()
X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

X_train_sparse = X_train_test_sparse[:len(train_data)]
X_test_sparse = X_train_test_sparse[len(train_data):]

class_encoder = LabelEncoder().fit(y.astype('str'))
y_for_vw = class_encoder.transform(y.astype('str')) + 1

sites_columns_num = X_train_test_sparse.shape[1]
inv_vocabulary = {v: int(re.search("s_(\d+)$", k).group(1)) for k, v in tfidf.vocabulary_.iteritems()}
y_weights = [1.0] * 550

CPU times: user 36.9 s, sys: 220 ms, total: 37.1 s
Wall time: 37.3 s


In [122]:
%%time
#6
mycolumns = [label for label in test_data[range(20, test_data.shape[1])]]

train_features, test_features = features_to_sparse(train_data, test_data, mycolumns)

X_train_sparse, X_test_sparse = combine_sites_features_sparse(X_train_sparse, train_features, \
                                                             X_test_sparse, test_features)#, \
                                                              #train_duplicates_mask, test_duplicates_mask,
                                                              #train_site_times_sparse, test_site_times_sparse, \
                                                             #train_site_sequence, test_site_sequence)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3, stratify=y_for_vw)
y_train_weights = [1.0] * 550

CPU times: user 16.1 s, sys: 460 ms, total: 16.5 s
Wall time: 16.8 s


In [116]:
from imblearn.under_sampling import     RandomUnderSampler
from imblearn.over_sampling import     RandomOverSampler

In [118]:
cnn = RandomOverSampler(random_state=42) 
X_res, y_res = cnn.fit_sample(X_train.toarray(), y_train)

MemoryError: 

In [107]:
Counter(y_res)

Counter({1: 15,
         2: 15,
         3: 15,
         4: 15,
         5: 15,
         6: 15,
         7: 15,
         8: 15,
         9: 15,
         10: 15,
         11: 15,
         12: 15,
         13: 15,
         14: 15,
         15: 15,
         16: 15,
         17: 15,
         18: 15,
         19: 15,
         20: 15,
         21: 15,
         22: 15,
         23: 15,
         24: 15,
         25: 15,
         26: 15,
         27: 15,
         28: 15,
         29: 15,
         30: 15,
         31: 15,
         32: 15,
         33: 15,
         34: 15,
         35: 15,
         36: 15,
         37: 15,
         38: 15,
         39: 15,
         40: 15,
         41: 15,
         42: 15,
         43: 15,
         44: 15,
         45: 15,
         46: 15,
         47: 15,
         48: 15,
         49: 15,
         50: 15,
         51: 15,
         52: 15,
         53: 15,
         54: 15,
         55: 15,
         56: 15,
         57: 15,
         58: 15,
         59: 15,
      

In [124]:
%%time
#7
train_part_vw = sparse_matrix_to_vw(X_train, 0, inv_vocabulary, y_train, weights=y_train_weights, mycolumns=mycolumns)
valid_vw = sparse_matrix_to_vw(X_valid, 0, inv_vocabulary, y_valid, mycolumns=mycolumns)
train_vw = sparse_matrix_to_vw(X_train_sparse, 0, inv_vocabulary, y_for_vw, weights=y_weights, mycolumns=mycolumns)
test_vw = sparse_matrix_to_vw(X_test_sparse, 0, inv_vocabulary, mycolumns=mycolumns)

CPU times: user 30min 51s, sys: 3.1 s, total: 30min 54s
Wall time: 31min 1s


In [125]:
folder = 'kaggle_data/'
handler = '_idf_w8_wvar'

In [19]:
print "Saving vw files"
#with open(folder+'train_part'+handler+'.pkl', 'wb') as f:
    #pickle.dump(train_part_vw, f)
#with open(folder+'valid'+handler+'.pkl', 'wb') as f:
    #pickle.dump(valid_vw, f)
with open(folder+'train'+handler+'.pkl', 'wb') as f:
    pickle.dump(train_vw, f)
#with open(folder+'test'+handler+'.pkl', 'wb') as f:
    #pickle.dump(test_vw, f)
#with open(folder+'class_encoder'+handler+'.pkl', 'wb') as f:
    #pickle.dump(class_encoder, f)

#y.to_csv(folder+'y'+handler+'.csv', index=False, header=False)
#pd.DataFrame(y_train).to_csv(folder+'y_train'+handler+'.csv', index=False, header=False)
#pd.DataFrame(y_valid).to_csv(folder+'y_valid'+handler+'.csv', index=False, header=False)

Saving vw files


KeyboardInterrupt: 

In [126]:
%%time
#8

keys = ['day_of_week', 'daytime', 'prediction', 'start_hour', 'bot30_portion', 'top30_portion']

vw_to_file(train_part_vw["sites"], folder+'train_part'+handler+'.vw', \
           features={x:train_part_vw[x] for x in keys}, \
           lables=train_part_vw["lables"], lable_weights=train_part_vw["lable_weights"], quiet=True)
vw_to_file(valid_vw["sites"], folder+'valid'+handler+'.vw', features={x:valid_vw[x] for x in keys}, \
           lables=valid_vw["lables"], quiet=True)
vw_to_file(train_vw["sites"], folder+'train'+handler+'.vw', features={x:train_vw[x] for x in keys}, \
           lables=train_vw["lables"], lable_weights=train_vw["lable_weights"], quiet=True)
vw_to_file(test_vw["sites"], folder+'test'+handler+'.vw', features={x:test_vw[x] for x in keys}, quiet=True)

CPU times: user 8.1 s, sys: 212 ms, total: 8.31 s
Wall time: 8.43 s


In [46]:
class_encoder.inverse_transform(536)

'926'

In [129]:
%%time
#9
!vw --oaa=550 -d {folder}train_part{handler}.vw \
-f {folder}initial_model{handler}.model -b 27 -c -k \
--passes=5 -l 0.45 --decay_learning_rate=0.9 --l1=1e-6 --l2=1e-6 \
-q "sd" -q "sb" --cubic="sbc"
#--keep "s" --keep "b" --keep "c" --keep "d" --keep "a"

!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using l1 regularization = 1e-06
using l2 regularization = 1e-06
final_regressor = kaggle_data/initial_model_idf_w8_wvar.model
Num weight bits = 27
learning rate = 0.45
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_part_idf_w8_wvar.vw.cache
Reading datafile = kaggle_data/train_part_idf_w8_wvar.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0       77        1       33
1.000000 1.000000            2            2.0       29       77       39
1.000000 1.000000            4            4.0      435       67       34
1.000000 1.000000            8            8.0      499       77       25
1.000000 1.000000           16           16.0      365      404       29
1.000000 1.000000           32          

In [20]:
!vw --oaa 550 -d {folder}train_part{handler}.vw -f {folder}initial_model{handler}.model --loss_function="logistic" \
-b 27 -c -k -q "sd" -q "sb" --cubic "sbc" -l 0.54 --decay_learning_rate=0.9 --l1=1e-11 --l2=1e-11 \
--passes 5 --stage_poly --batch_sz 12752 --batch_sz_no_doubling

creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using l1 regularization = 1e-11
using l2 regularization = 1e-11
final_regressor = kaggle_data/initial_model_idf_w8_min.model
Num weight bits = 27
learning rate = 0.54
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_part_idf_w8_min.vw.cache
Reading datafile = kaggle_data/train_part_idf_w8_min.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0       75        1       29
1.000000 1.000000            2            2.0      140       75       29
1.000000 1.000000            4            4.0      357      140       39
1.000000 1.000000            8            8.0      426      386       34
1.000000 1.000000           16           16.0      145      140       30
1.000000 1.000000           32           32

In [15]:
!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid_idf_w8.vw \
-p {folder}vw_valid_pred{handler}.csv --quiet

vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
accuracy = accuracy_score(y_valid, vw_valid_pred.values)
print "Accuracy:", accuracy

ValueError: Found input variables with inconsistent numbers of samples: [31239, 32793]

## Confusion Matrix

In [70]:
countery = Counter(y_train)
confusion = {}

In [71]:
M = confusion_matrix(y_valid, vw_valid_pred)
M_normalized = M.astype('float') / M.sum(axis=1)[:, np.newaxis]
for (t,f), value in np.ndenumerate(M):
    if t != f and value > 0:
        confusion[tuple([t, f])] = value

In [72]:
[[[tf[0], countery[tf[0]+1]], [tf[1], countery[tf[1]+1]], [val]] for tf, val in sorted(confusion.items(), key=lambda t: t[1], reverse = True)]

[[[317, 784], [226, 783], [174]],
 [[226, 783], [317, 784], [145]],
 [[316, 447], [137, 442], [104]],
 [[351, 325], [235, 328], [102]],
 [[137, 442], [316, 447], [95]],
 [[248, 384], [329, 386], [92]],
 [[178, 382], [363, 383], [91]],
 [[235, 328], [351, 325], [84]],
 [[329, 386], [248, 384], [82]],
 [[363, 383], [178, 382], [80]],
 [[510, 272], [251, 273], [61]],
 [[140, 263], [266, 265], [61]],
 [[251, 273], [510, 272], [59]],
 [[405, 199], [112, 201], [58]],
 [[34, 267], [371, 269], [55]],
 [[371, 269], [34, 267], [53]],
 [[266, 265], [140, 263], [49]],
 [[539, 238], [245, 238], [46]],
 [[245, 238], [539, 238], [44]],
 [[112, 201], [405, 199], [43]],
 [[138, 163], [380, 163], [41]],
 [[116, 132], [228, 130], [35]],
 [[228, 130], [116, 132], [35]],
 [[240, 116], [82, 115], [34]],
 [[385, 2693], [468, 1586], [33]],
 [[380, 163], [138, 163], [32]],
 [[496, 99], [72, 99], [30]],
 [[154, 137], [548, 138], [30]],
 [[548, 138], [154, 137], [29]],
 [[385, 2693], [416, 465], [29]],
 [[82, 11

In [73]:
pairs = [[class_encoder.inverse_transform(tf[0]), class_encoder.inverse_transform(tf[1])] \
         for tf, val in sorted(confusion.items(), key=lambda t: t[1], reverse = True)]

In [74]:
pairs

[['2824', '2336'],
 ['2336', '2824'],
 ['2820', '1807'],
 ['2971', '2366'],
 ['1807', '2820'],
 ['2434', '2874'],
 ['2004', '3060'],
 ['2366', '2971'],
 ['2874', '2434'],
 ['3060', '2004'],
 ['762', '244'],
 ['1812', '2524'],
 ['244', '762'],
 ['3260', '1665'],
 ['1180', '3102'],
 ['3102', '1180'],
 ['2524', '1812'],
 ['940', '2414'],
 ['2414', '940'],
 ['1665', '3260'],
 ['1808', '313'],
 ['17', '2342'],
 ['2342', '17'],
 ['239', '1505'],
 ['3165', '537'],
 ['313', '1808'],
 ['692', '1413'],
 ['1877', '974'],
 ['974', '1877'],
 ['3165', '3328'],
 ['1505', '239'],
 ['280', '728'],
 ['728', '280'],
 ['2161', '2746'],
 ['3119', '178'],
 ['178', '3119'],
 ['2824', '557'],
 ['2681', '1737'],
 ['121', '538'],
 ['537', '3328'],
 ['2336', '557'],
 ['2746', '2161'],
 ['1273', '537'],
 ['3118', '412'],
 ['538', '121'],
 ['2954', '2642'],
 ['2549', '3333'],
 ['2642', '2954'],
 ['1413', '692'],
 ['704', '2681'],
 ['537', '3165'],
 ['2842', '2561'],
 ['197', '2773'],
 ['267', '1015'],
 ['1830', '3

In [42]:
class_encoder.inverse_transform(135), class_encoder.inverse_transform(299)

('1793', '2690')

In [213]:
class_encoder.transform([str(673)]) + 1

array([494])

In [51]:
train_data[train_data.target == 1793]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,site_longest_time,start_hour,day_of_week,daytime,fb_portion,youtube_portion,top30_portion,bot30_portion,prediction,target
90013,53,47,53,20,32,8,106,106,32,32,...,32,9,2,0,0.0,0.0,0.659574,0.0,0,1793
90014,106,32,32,32,0,0,0,0,0,0,...,32,9,2,0,0.0,0.0,1.0,0.0,0,1793
90015,47,22,77,38,8,3079,7281,3079,3079,3079,...,7281,8,0,0,0.0,0.0,0.101695,0.79661,0,1793
90016,6,16,3079,77,32,32,32,7292,7271,6551,...,32,8,0,0,0.0,0.0,0.851282,0.005128,0,1793
90017,7292,7271,6551,3471,85,1930,7271,88,7,7271,...,7,8,0,0,0.571429,0.0,0.571429,0.285714,0,1793
90018,7271,7,7271,7,3471,88,11,77,22,90,...,11,8,0,0,0.986622,0.0,0.996656,0.003344,0,1793
90019,77,22,90,77,32,22,523,521,521,521,...,22,8,0,0,0.0,0.0,0.918919,0.0,0,1793
90020,523,521,526,521,524,523,523,521,523,524,...,523,8,0,0,0.0,0.0,0.0,0.0,0,1793
90021,521,523,524,85,7269,7,38,7269,7272,7269,...,524,8,0,0,0.011111,0.0,0.011111,0.022222,0,1793
90022,7269,7272,7269,7272,340,11,335,7269,7272,67,...,7269,8,0,0,0.0,0.0,0.0,1.0,0,1793


In [44]:
train_data[train_data.target == 2690]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,site_longest_time,start_hour,day_of_week,daytime,fb_portion,youtube_portion,top30_portion,bot30_portion,prediction,target
104717,37,20,32,38,8,106,32,0,0,0,...,106,9,2,0,0.0,0.0,0.802632,0.0,0,2690
104718,17,16,53,37,22,32,20,8,38,7281,...,8,8,0,0,0.0,0.0,0.611111,0.0,0,2690
104719,8,38,7281,3079,6,16,77,32,32,14117,...,3079,8,0,0,0.0,0.0,0.229885,0.028736,2690,2690
104720,32,32,14117,32,14117,362,7292,7271,3471,1930,...,14117,8,0,0,0.0,0.0,0.185629,0.814371,2690,2690
104721,7271,3471,1930,88,11,7,7271,3471,88,90,...,88,8,0,0,0.990033,0.0,0.990033,0.006645,0,2690
104722,3471,88,90,22,32,22,32,32,521,521,...,88,8,0,0,0.899083,0.0,0.996942,0.0,0,2690
104723,32,521,521,523,526,524,526,523,523,524,...,523,8,0,0,0.0,0.0,0.0,0.0,0,2690
104724,526,523,524,85,340,7269,7,7272,7272,38,...,524,8,0,0,0.0,0.0,0.0,0.013333,0,2690
104725,7,7272,38,7269,7269,67,38,335,340,7269,...,7269,8,0,0,0.2,0.0,0.4,0.6,0,2690
104726,38,340,7269,38,7272,55,227,38,178,367,...,7269,8,0,0,0.0,0.0,0.333333,0.666667,0,2690


In [43]:
df= pd.read_csv('kaggle_data/train_sessions.csv')


In [45]:
df[df.user_id == 2631]

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
5491,5492,509,2014-03-29 11:06:24,858.0,2014-03-29 11:06:24,11.0,2014-03-29 11:06:24,65.0,2014-03-29 11:06:24,268.0,...,2014-03-29 11:06:25,82.0,2014-03-29 11:06:25,241.0,2014-03-29 11:06:31,265.0,2014-03-29 11:06:32,241.0,2014-03-29 11:06:32,2631
6571,6572,32,2014-08-02 10:04:39,32.0,2014-08-02 10:04:40,8.0,2014-08-02 10:04:41,8.0,2014-08-02 10:04:53,32.0,...,2014-08-02 10:04:55,3087.0,2014-08-02 10:14:37,521.0,2014-08-02 10:33:42,523.0,2014-08-02 10:33:42,32.0,2014-08-02 10:33:42,2631
8631,8632,65,2014-01-25 10:30:10,260.0,2014-01-25 10:36:59,412.0,2014-01-25 10:36:59,1361.0,2014-01-25 10:41:10,77.0,...,2014-01-25 10:41:10,1321.0,2014-01-25 10:41:11,1378.0,2014-01-25 10:41:11,1104.0,2014-01-25 10:41:12,2891.0,2014-01-25 10:41:12,2631
12124,12125,523,2014-08-02 10:33:43,524.0,2014-08-02 10:33:43,526.0,2014-08-02 10:33:44,32.0,2014-08-02 10:34:05,13832.0,...,2014-08-02 10:35:01,13836.0,2014-08-02 10:35:52,32.0,2014-08-02 10:35:52,13836.0,2014-08-02 10:35:53,342.0,2014-08-02 10:35:53,2631
14795,14796,32,2014-08-02 11:27:57,,,,,,,,...,,,,,,,,,,2631
15614,15615,32,2014-08-02 11:10:39,32.0,2014-08-02 11:10:42,13834.0,2014-08-02 11:10:43,11583.0,2014-08-02 11:10:44,13834.0,...,2014-08-02 11:10:45,11.0,2014-08-02 11:10:45,625.0,2014-08-02 11:10:45,184.0,2014-08-02 11:10:46,625.0,2014-08-02 11:10:46,2631
16642,16643,2889,2014-02-22 11:26:25,67.0,2014-02-22 11:27:57,7.0,2014-02-22 11:27:58,2889.0,2014-02-22 11:27:58,38.0,...,2014-02-22 11:27:58,88.0,2014-02-22 11:30:31,38.0,2014-02-22 11:30:31,7.0,2014-02-22 11:30:31,2889.0,2014-02-22 11:30:31,2631
16952,16953,544,2014-05-13 08:05:30,525.0,2014-05-13 08:05:52,15.0,2014-05-13 08:05:57,479.0,2014-05-13 08:10:54,5.0,...,2014-05-13 08:12:55,525.0,2014-05-13 08:14:13,544.0,2014-05-13 08:14:13,525.0,2014-05-13 08:14:22,14.0,2014-05-13 08:14:54,2631
20707,20708,8,2014-01-25 10:55:50,32.0,2014-01-25 10:57:02,32.0,2014-01-25 11:01:14,8.0,2014-01-25 11:01:15,32.0,...,2014-01-25 11:01:33,65.0,2014-01-25 11:01:34,13828.0,2014-01-25 11:01:34,38.0,2014-01-25 11:01:35,85.0,2014-01-25 11:01:35,2631
21597,21598,13840,2014-01-25 11:01:35,13828.0,2014-01-25 11:01:35,3430.0,2014-01-25 11:01:35,13833.0,2014-01-25 11:01:35,11.0,...,2014-01-25 11:01:36,13828.0,2014-01-25 11:01:36,8.0,2014-01-25 11:01:37,38.0,2014-01-25 11:01:37,55.0,2014-01-25 11:01:37,2631


In [46]:
df[df.user_id == 786]

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
2502,2503,1955,2014-08-02 11:06:25,38.0,2014-08-02 11:06:26,38.0,2014-08-02 11:06:27,55.0,2014-08-02 11:06:27,1955.0,...,2014-08-02 11:06:29,1955.0,2014-08-02 11:06:31,1955.0,2014-08-02 11:06:37,32.0,2014-08-02 11:07:45,1946.0,2014-08-02 11:07:46,786
3384,3385,184,2014-01-17 14:31:24,53.0,2014-01-17 14:31:27,47.0,2014-01-17 14:31:27,17.0,2014-01-17 14:31:27,8.0,...,2014-01-17 14:32:43,2141.0,2014-01-17 14:36:02,32.0,2014-01-17 14:38:34,32.0,2014-01-17 14:38:48,,,786
4684,4685,106,2014-03-29 10:03:15,106.0,2014-03-29 10:03:30,106.0,2014-03-29 10:03:31,32.0,2014-03-29 10:03:31,106.0,...,2014-03-29 10:03:34,13839.0,2014-03-29 10:07:08,13839.0,2014-03-29 10:07:10,3087.0,2014-03-29 10:23:30,,,786
5462,5463,32,2014-05-13 15:04:22,32.0,2014-05-13 15:04:23,32.0,2014-05-13 15:04:26,106.0,2014-05-13 15:04:31,32.0,...,2014-05-13 15:05:00,322.0,2014-05-13 15:05:01,322.0,2014-05-13 15:08:05,305.0,2014-05-13 15:08:08,322.0,2014-05-13 15:08:37,786
5553,5554,65,2014-01-25 11:01:34,13840.0,2014-01-25 11:01:35,13828.0,2014-01-25 11:01:35,3430.0,2014-01-25 11:01:35,88.0,...,2014-01-25 11:01:36,8.0,2014-01-25 11:01:36,55.0,2014-01-25 11:01:37,38.0,2014-01-25 11:01:37,13828.0,2014-01-25 11:01:38,786
7182,7183,32,2014-08-02 10:04:39,32.0,2014-08-02 10:04:40,8.0,2014-08-02 10:04:41,32.0,2014-08-02 10:04:54,32.0,...,2014-08-02 10:14:37,32.0,2014-08-02 10:33:29,32.0,2014-08-02 10:33:42,523.0,2014-08-02 10:33:42,521.0,2014-08-02 10:33:42,786
9003,9004,82,2014-03-29 11:06:25,11.0,2014-03-29 11:06:25,268.0,2014-03-29 11:06:30,268.0,2014-03-29 11:06:31,265.0,...,2014-03-29 11:06:31,265.0,2014-03-29 11:06:32,49.0,2014-03-29 11:06:32,8.0,2014-03-29 11:06:32,11.0,2014-03-29 11:06:33,786
10851,10852,32,2014-08-02 11:10:42,11583.0,2014-08-02 11:10:44,13834.0,2014-08-02 11:10:45,11.0,2014-08-02 11:10:45,625.0,...,2014-08-02 11:10:45,184.0,2014-08-02 11:10:46,11.0,2014-08-02 11:10:46,13834.0,2014-08-02 11:10:46,625.0,2014-08-02 11:10:46,786
13176,13177,322,2014-05-13 15:09:17,322.0,2014-05-13 15:29:13,322.0,2014-05-13 15:29:19,,,,...,,,,,,,,,,786
14755,14756,32,2014-08-02 11:07:46,1980.0,2014-08-02 11:07:47,111.0,2014-08-02 11:07:47,7.0,2014-08-02 11:07:48,55.0,...,2014-08-02 11:07:48,38.0,2014-08-02 11:07:48,88.0,2014-08-02 11:07:48,111.0,2014-08-02 11:07:48,78.0,2014-08-02 11:07:49,786


In [76]:
test_data[test_data.duplicated(subset=['time' + str(c) for c in range(1,session_length+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+['time' + str(c) for c in range(1,10+1)]]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
868,38,0,0,0,0,0,0,0,0,0,2013-11-28 13:36:53,0,0,0,0,0,0,0,0,0
2694,27,0,0,0,0,0,0,0,0,0,2014-02-17 17:01:28,0,0,0,0,0,0,0,0,0
3840,869,0,0,0,0,0,0,0,0,0,2014-03-18 08:18:39,0,0,0,0,0,0,0,0,0
6829,307,0,0,0,0,0,0,0,0,0,2014-08-04 16:15:38,0,0,0,0,0,0,0,0,0
7492,27,9,0,0,0,0,0,0,0,0,2013-11-21 14:08:05,2013-11-21 14:08:05,0,0,0,0,0,0,0,0
7985,869,0,0,0,0,0,0,0,0,0,2014-03-16 18:48:02,0,0,0,0,0,0,0,0,0
8388,58,25,0,0,0,0,0,0,0,0,2013-12-18 10:18:28,2013-12-18 10:18:28,0,0,0,0,0,0,0,0
9065,1945,0,0,0,0,0,0,0,0,0,2014-02-21 15:17:45,0,0,0,0,0,0,0,0,0
10144,9,0,0,0,0,0,0,0,0,0,2014-03-25 08:34:39,0,0,0,0,0,0,0,0,0
10833,27,9,0,0,0,0,0,0,0,0,2013-11-21 14:08:05,2013-11-21 14:08:05,0,0,0,0,0,0,0,0


In [287]:
feature_names = ['site' + str(i) for i in range(1,11)] + ['time' + str(i) for i in range(1,11)] + \
                ['time_diff' + str(j) for j in range(1,10)] + \
                ['session_timespan', '#unique_sites', 'site_longest_time', 'start_hour', 'day_of_week', 'daytime', 'fb_portion',\
                 'youtube_portion', 'top30_portion', 'bot30_portion', 'prediction', 'target']

In [288]:
# Find duplicates
def delete_dups_max(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    files = glob(csv_files_mask)
    
    used = {}
    
    for pair in pairs:
        pair_data = pd.DataFrame()
        counter = 1
        
        if pair[0] not in used or pair[1] not in used:
            files = ['kaggle_data/train/user'+pair[0]+'.csv', 'kaggle_data/train/user'+pair[1]+'.csv']
            counter = 1

            for userfile in files:
                user_id = re.search('user(\d+)\.csv', userfile).group(1)
                data = pd.read_csv(userfile, \
                                   parse_dates=[0], infer_datetime_format=True, names=["timestamp"+str(counter), "site"+str(counter)], header=0)
                data.sort_values("timestamp"+str(counter), inplace=True)
                data.reset_index(drop=True, inplace=True)
                pair_data = pd.concat([pair_data, data], axis=1)
                counter +=1
            
            pair_data = pair_data[(pair_data.timestamp1 != pair_data.timestamp2) & \
                                  (pair_data.site1 != pair_data.site2)]

            data1 = pair_data[["timestamp1", "site1"]].dropna().copy()
            data2 = pair_data[["timestamp2", "site2"]].dropna().copy()

            if pair[0] not in used:
                data1.to_csv('kaggle_data/new_train/user'+pair[0]+'.csv', index=False, header=["timestamp", "site"])
                used[pair[0]] = 0
            if pair[1] not in used:
                data2.to_csv('kaggle_data/new_train/user'+pair[1]+'.csv', index=False, header=["timestamp", "site"])
                used[pair[1]] = 0
    return None

In [289]:
delete_dups_max('kaggle_data/train/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

In [218]:
# Find duplicates
def delete_dups_medium(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    files = glob(csv_files_mask)
    
    used = {}
    
    for pair in pairs:
        pair_data = pd.DataFrame()
        counter = 1
        
        if pair[0] not in used or pair[1] not in used:
            files = ['kaggle_data/train/user'+pair[0]+'.csv', 'kaggle_data/train/user'+pair[1]+'.csv']
            counter = 1

            for userfile in files:
                user_id = re.search('user(\d+)\.csv', userfile).group(1)
                data = pd.read_csv(userfile, \
                                   parse_dates=[0], infer_datetime_format=True, names=["timestamp"+str(counter), "site"+str(counter)], header=0)
                data.sort_values("timestamp"+str(counter), inplace=True)
                data.reset_index(drop=True, inplace=True)

                data["weekday"+str(counter)] = data["timestamp"+str(counter)].apply(lambda x: int(x.isoweekday()))
                data["hour"+str(counter)] = data["timestamp"+str(counter)].apply(lambda x: int(x.hour))
                pair_data = pd.concat([pair_data, data], axis=1)
                counter +=1
            
            pair_data = pair_data[(pair_data.weekday1 != pair_data.weekday2) & \
                                  (pair_data.site1 != pair_data.site2) & (pair_data.hour1 != pair_data.hour2)]

            data1 = pair_data[["timestamp1", "site1"]].dropna().copy()
            data2 = pair_data[["timestamp2", "site2"]].dropna().copy()

            if pair[0] not in used:
                data1.to_csv('kaggle_data/new_train/user'+pair[0]+'.csv', index=False, header=["timestamp", "site"])
                used[pair[0]] = 0
            if pair[1] not in used:
                data2.to_csv('kaggle_data/new_train/user'+pair[1]+'.csv', index=False, header=["timestamp", "site"])
                used[pair[1]] = 0
    return None

In [217]:
delete_dups_medium('kaggle_data/train/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

In [None]:
# Find duplicates
def delete_dups_minimal(csv_files_mask, feature_names, site_freq_path="", site_index_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30, num_users_for_prediction = 1, sort_in_session=False):
    files = glob(csv_files_mask)
    pair_data = pd.DataFrame()
    
    counter = 1
    
    used = {}
    
    for pair in pairs:
        print pair
        
        if pair[0] not in used or pair[1] not in used:
            files = ['kaggle_data/train_toy/user'+pair[0]+'.csv', 'kaggle_data/train_toy/user'+pair[1]+'.csv']
            counter = 1

            for userfile in files:
                user_id = re.search('user(\d+)\.csv', userfile).group(1)
                data = pd.read_csv(userfile, \
                                   parse_dates=[0], infer_datetime_format=True)
                data.sort_values("timestamp", inplace=True)
                data.reset_index(drop=True, inplace=True)
                data["weekday"] = data.timestamp.apply(lambda x: int(x.isoweekday()))
                data["hour"] = data.timestamp.apply(lambda x: int(x.hour))
                data["target"] = int(user_id)
                print user_id, len(data)
                pair_data = pd.concat([pair_data, data], axis=0)
                counter +=1

            #pair_data = pair_data[(pair_data.timestamp1 != pair_data.timestamp2) & (pair_data.site1 != pair_data.site2)]
            pair_data.drop(pair_data[pair_data.duplicated(subset=["site", "weekday", "hour"], keep=False)].index, inplace=True)
            print len(pair_data)

            data1 = pair_data[pair_data.target == int(pair[0])][["timestamp", "site"]].copy()
            data2 = pair_data[pair_data.target == int(pair[1])][["timestamp", "site"]].copy()

            if pair[0] not in used:
                print 'im here'
                data1.to_csv('kaggle_data/new_train/user'+pair[0]+'.csv', index=False)
                used[pair[0]] = 0
            if pair[1] not in used:
                data2.to_csv('kaggle_data/new_train/user'+pair[1]+'.csv', index=False)
                used[pair[1]] = 0                          

            print used

            break
    return None

In [190]:
delete_dups_minimal('kaggle_data/train_toy/*',
                                   feature_names=feature_names, 
                                            site_freq_path="kaggle_data/site_freq.pkl", session_length=10, sort_in_session=True)

['2631', '786']
2631 357
786 357
4
im here
{'786': 0, '2631': 0}


In [151]:
dataf[(dataf.timestamp1 != dataf.timestamp2) & (dataf.site1 != dataf.site2)]

Unnamed: 0,timestamp1,site1,timestamp2,site2
9,2014-01-25 10:29:31,fpdownload2.macromedia.com,2014-01-25 10:29:38,www.iegallery.com
10,2014-01-25 10:29:33,go.microsoft.com,2014-01-25 10:29:39,www.iegallery.com
12,2014-01-25 10:29:43,www.iegallery.com,2014-01-25 10:29:47,www.google.fr
13,2014-01-25 10:29:47,www.google.fr,2014-01-25 10:29:48,www.google.com
15,2014-01-25 10:29:48,www.google.fr,2014-01-25 10:30:05,www.google.com
17,2014-01-25 10:30:05,www.google.com,2014-01-25 10:30:10,www.google.fr
19,2014-01-25 10:30:10,ajax.googleapis.com,2014-01-25 10:36:59,office14client.microsoft.com
21,2014-01-25 10:36:59,office14client.microsoft.com,2014-01-25 10:41:10,rr.office.microsoft.com
22,2014-01-25 10:41:10,api.bing.com,2014-01-25 10:41:11,integrate.factiva.com
23,2014-01-25 10:41:10,rr.office.microsoft.com,2014-01-25 10:41:12,www.microsofttranslator.com


In [142]:
dataf

Unnamed: 0,timestamp1,site1,timestamp2,site2
0,2014-01-17 14:31:24,fpdownload2.macromedia.com,2014-01-17 14:31:24,fpdownload2.macromedia.com
1,2014-01-17 14:31:27,js.microsoft.com,2014-01-17 14:31:27,windows.microsoft.com
2,2014-01-17 14:31:27,go.microsoft.com,2014-01-17 14:31:27,res2.windows.microsoft.com
3,2014-01-17 14:31:27,ajax.microsoft.com,2014-01-17 14:31:27,js.microsoft.com
4,2014-01-17 14:31:31,www.google.com,2014-01-17 14:31:31,www.google.com
5,2014-01-17 14:32:43,ieonline.microsoft.com,2014-01-17 14:32:43,ieonline.microsoft.com
6,2014-01-17 14:36:02,dl.javafx.com,2014-01-17 14:36:02,dl.javafx.com
7,2014-01-17 14:38:34,www.google.fr,2014-01-17 14:38:34,www.google.fr
8,2014-01-17 14:38:48,www.google.fr,2014-01-17 14:38:48,www.google.fr
9,2014-01-25 10:29:31,fpdownload2.macromedia.com,2014-01-25 10:29:38,www.iegallery.com


In [21]:
handler

'_idf_w8_min'

# Submission

In [130]:
!mv {folder}train{handler}.vw {folder}train{handler}.vw.temp
!shuf {folder}train{handler}.vw.temp -o {folder}train{handler}.vw

In [136]:
%%time
!vw --oaa=550 -d {folder}train{handler}.vw -f {folder}initial_model{handler}.model -b 27 -c -k --passes=30 \
--decay_learning_rate 0.9 --initial_t 0.002337045080352835 -l 0.5416950450219994 \
--power_t 0.5 --loss_function='logistic' --l1=1e-9 --l2=1e-8 -q "sd" -q "sb" --cubic="sbc" \
--stage_poly --batch_sz {int(X_train_sparse.shape[0]/6)} --batch_sz_no_doubling

creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using l1 regularization = 1e-09
using l2 regularization = 1e-08
final_regressor = kaggle_data/initial_model_idf_w8_wvar.model
Num weight bits = 27
learning rate = 0.541695
initial_t = 0.00233705
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_idf_w8_wvar.vw.cache
Reading datafile = kaggle_data/train_idf_w8_wvar.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      510        1       37
1.000000 1.000000            2            2.0      325      510       29
1.000000 1.000000            4            4.0      296      325       46
1.000000 1.000000            8            8.0      507       26       17
1.000000 1.000000           16           16.0      497      325       33
1.000000 1.000000           32       

In [137]:
!vw -i {folder}initial_model{handler}.model  -t -d {folder}test{handler}.vw \
-p {folder}vw_test_pred{handler}.csv --quiet

vw_test_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
t_submission = pd.DataFrame(vw_test_pred.astype(int)-1)
vw_subm = class_encoder.inverse_transform(t_submission)
write_to_submission_file(vw_subm,
             'kaggle_data/38vw_submission_wvar.csv')
print "Finished creating submission.\n"

Finished creating submission.



In [138]:
!gsutil cp kaggle_data/38vw_submission_wvar.csv gs://smartandnimble/identifyme

Copying file://kaggle_data/38vw_submission_wvar.csv [Content-Type=text/csv]...
-
Operation completed over 1 objects/418.4 KiB.                                    


In [29]:
%%time
#9
!vw --oaa=550 -d {folder}train{handler}.vw \
-f {folder}initial_model{handler}.model -b 28 -c -k \
--passes=5 -l 0.45 --decay_learning_rate=0.9 --l1=4e-8 --l2=4e-8 \
-q "sd" -q "sb" --cubic="sbc" --holdout_period 5 --early_terminate 2
#--keep "s" --keep "b" --keep "c" --keep "d" --keep "a"

!vw -i {folder}initial_model{handler}.model  -t -d {folder}test{handler}.vw \
-p {folder}vw_test_pred{handler}.csv --quiet

vw_test_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
t_submission = pd.DataFrame(vw_test_pred.astype(int)-1)
vw_subm = class_encoder.inverse_transform(t_submission)


creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using l1 regularization = 4e-08
using l2 regularization = 4e-08
final_regressor = kaggle_data/initial_model_idf_w8_bal.model
Num weight bits = 28
learning rate = 0.45
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_idf_w8_bal.vw.cache
Reading datafile = kaggle_data/train_idf_w8_bal.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      330        1       33
1.000000 1.000000            2            2.0      180      330       12
1.000000 1.000000            4            4.0      349       54       23
1.000000 1.000000            8            8.0      198      330       25
1.000000 1.000000           16           16.0      548      433       33
1.000000 1.000000           32           32.0      51

In [58]:
vw_pred = pd.read_csv(folder+'vw_test_pred'+handler+'.csv', header=None)
vw_subm = class_encoder.inverse_transform(vw_pred-1)
vw_subm = np.copy(vw_subm.astype(float).astype(int).astype(str).astype(object))

In [26]:
write_to_submission_file(vw_subm,
             'kaggle_data/32vw_submission_exp.csv')
print "Finished creating submission.\n"

Finished creating submission.



In [25]:
vw_subm

array([['75'],
       ['783'],
       ['1645'],
       ..., 
       ['3165'],
       ['3118'],
       ['1835']], dtype=object)