In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

import re
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, make_scorer
from vowpalwabbit.sklearn_vw import VWClassifier, VW
import itertools
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse import csr_matrix, hstack

In [4]:
def sparsematrix(X):
    row = []
    col = []
    data = []
    for r in range(X.shape[0]):
        row_counter = Counter(X[r])
        for site, num in row_counter.items():
            row.append(r)
            col.append(site)
            data.append(num)
    print "Sparse Matrix - rows:", X.shape[0], "columns:", len(set(col))
    return csr_matrix((data, (row, col)), shape=(X.shape[0], len(set(col))))[:,1:]


def sites_to_sparse_tfidf(train_data, test_data, target_col, session_length, label_encoder=False):
    train_test_df = pd.concat([train_data, test_data])
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)
    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]].index)
    train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
    test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

    y = train_data[target_col]

    train_test_df_sites = train_test_df[['site' + str(c) for c in range(1,10+1)]].fillna(0).astype('int')
    train_test_df_sites_array = [" ".join(["s_"+str(s) for s in train_test_df_sites.as_matrix()[i] if int(s) != 0]) \
                                                                  for i in range(train_test_df_sites.shape[0])]

    tfidf = TfidfVectorizer(max_df=0.9).fit(train_test_df_sites_array) #TfidfVectorizer()
    X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

    X_train_sparse = X_train_test_sparse[:len(train_data)]
    X_test_sparse = X_train_test_sparse[len(train_data):]
    
    sites_columns_num = X_train_test_sparse.shape[1]
    
    y_for_vw = None
    class_encoder = None
    if label_encoder:
        class_encoder = LabelEncoder().fit(y.astype('str'))
        y_for_vw = class_encoder.transform(y.astype('str')) + 1
    
    return [X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num, class_encoder, tfidf, \
             train_duplicates_mask, test_duplicates_mask]


def features_to_sparse(train_data, test_data, feature_cols):
    features_matrix = []
    for df in [train_data, test_data]:
        num_cols = 0
        data = []
        rows = []
        cols = []
        for label in feature_cols:
            if label in ["day_of_week", "daytime"]:
                coldata = list(df[[label]].values.T[0].astype('float') + 1)
            else:
                coldata = list(df[[label]].values.T[0].astype('float'))
            if len(data):
                data += coldata
            else:
                data = list(coldata)
            if len(cols):
                cols += [num_cols] * len(coldata)
            else:
                cols = [num_cols] * len(coldata)
            num_cols += 1
        rows = [r for r in range(df.shape[0])] * num_cols
        features = csr_matrix((data, (rows, cols)), shape=(df.shape[0], num_cols), dtype=float)
        features_matrix.append(features)
    return features_matrix


def calc_site_times_portions(train_data, test_data):
    site_times = [{},{}]
    count = 0
    for data in [train_data, test_data]:
        for r, row in data[:][range(0, 10)+range(20,30)].iterrows():
            rowdic = {}
            for c, s in [[c, 'site' + str(c)] for c in range(1,10)]:
                if row[s] == 0:
                    continue
                if row[s] in rowdic:
                    rowdic[int(row[s])] += row["time_diff"+str(c)]
                else:
                    rowdic[int(row[s])] = row["time_diff"+str(c)]
            site_times[count][r] = {}
            for site, time in rowdic.items():
                if len(rowdic) == 1:
                    site_times[count][r][int(site)] = 1.0
                    continue
                if time > 0:
                    site_times[count][r][int(site)] = round(float(time)/row["session_timespan"],3)
        count+=1
    return site_times

def site_times_to_sparse(sitetimes):
    row = []
    col = []
    data = []
    rowcount = 0
    for sitetime in sitetimes:
        for r, sites in sitetime.items():
            for site, p in sites.items():
                col.append(site)
                row.append(rowcount)
                data.append(p)
            rowcount+=1
    site_times_sparse = csr_matrix((data, (row, col)), shape=(len(sitetimes[0])+len(sitetimes[1]), max(col)+1), \
                                                                                              dtype=float)[:,1:]
    return site_times_sparse



def combine_sites_features_sparse(sites_train_sparse, features_train_sparse, \
                                  sites_test_sparse, features_test_sparse, \
                                  train_duplicates_mask = None, test_duplicates_mask = None, \
                                  train_site_times_sparse = None, test_site_times_sparse = None, \
                                train_sites_sequence=None, test_sites_sequence=None):
    if train_site_times_sparse is not None and test_site_times_sparse is not None:
        X_train_sparse = hstack([sites_train_sparse, features_train_sparse, \
                                 train_site_times_sparse, train_sites_sequence], dtype=float).tocsr()
        X_test_sparse = hstack([sites_test_sparse, features_test_sparse, \
                                test_site_times_sparse, test_sites_sequence], dtype=float).tocsr()
    else:
        X_train_sparse = hstack([sites_train_sparse, features_train_sparse], dtype=float).tocsr()
        X_test_sparse = hstack([sites_test_sparse, features_test_sparse], dtype=float).tocsr()
        
    #X_train_sparse = hstack([X_train_sparse, train_duplicates_mask], dtype=float).tocsr()
    #X_test_sparse = hstack([X_test_sparse, test_duplicates_mask], dtype=float).tocsr() 
    return [X_train_sparse, X_test_sparse]


def sparse_matrix_to_vw(X_sparse, sites_columns_num, vocabulary, y=None, weights=None, mark_duplicates=False, mycolumns=[]):
    sessions = {}
    used = {}
    prediction = {}
    day_of_week = {}
    start_hour = {}
    daytime = {}
    unique_sites = {}
    top30_portion = {}
    fb_portion = {}
    youtube_portion = {}
    bot30_portion = {}
    site_longest_time = {}
    session_timespan = {}
    sitetimes = {}
    sequence = {}
    
    lables = {}
    lable_weights = {}
    
    #X_sparse = X_sparse_full[:,:-1]
    
    add_features = True

    for r, c in zip(X_sparse.nonzero()[0], X_sparse.nonzero()[1]):
        if tuple([r,c]) not in used:
            used[tuple([r, c])] = 1
            if add_features:
                if c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("prediction"): #- 10:
                    prediction[r] = " |aprediction {}:{}".format(int(X_sparse[r,c]), 100)
                    #prediction[r] = " |prediction:100 {}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("day_of_week"): #- 10:
                    day_of_week[r] = " |bday_of_week {}".format(int(X_sparse[r,c]))
                    #day_of_week[r] = " day_of_week:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("start_hour"): #- 10:
                    start_hour[r] = " |chour_start {}".format(int(X_sparse[r,c]))
                    #start_hour[r] = " start_hour:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("daytime"): #- 10:
                    daytime[r] = " |dtime_of_day {}".format(int(X_sparse[r,c]))
                    #daytime[r] = " daytime:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("session_timespan"): #- 10:
                    session_timespan[r] = " |jsession_timespan time:{}".format(int(X_sparse[r,c]))
                    #session_timespan[r] = " session_timespan:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("#unique_sites"): #- 10:
                    unique_sites[r] = " unique_sites:{}".format(int(X_sparse[r,c]))
                    #unique_sites[r] = " unique_sites:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("site_longest_time"): #- 10:
                    site_longest_time[r] = " |hsite_longest_time {}:{}".format(int(X_sparse[r,c]), 3)
                    #site_longest_time[r] = " site_longest_time:{}".format(int(X_sparse[r,c]))
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("top30_portion"): #- 10:
                    top30_portion[r] = " top30:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("bot30_portion"): #- 10:
                    bot30_portion[r] = " bot30:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("fb_portion"): #- 10:
                    fb_portion[r] = " facebook:{}".format(X_sparse[r,c])
                    continue
                elif c == X_sparse.shape[1] - len(mycolumns) - sites_columns_num + mycolumns.index("youtube_portion"): #- 10:
                    youtube_portion[r] = " youtube:{}".format(X_sparse[r,c])
                    continue
                #elif c >= X_sparse.shape[1] #- 10:
                    #if r not in sequence:
                        #sequence[r] = " |ksequence " + \
                            #' '.join(filter(lambda a: a != "0", X_sparse[r,-10:].todense().astype(int).astype(str).tolist()[0]))
                    #continue
                    
            if c < X_sparse.shape[1] - len(mycolumns): #sites_columns_num: #
                if r in sessions:
                    sessions[r] += " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                else:
                    if y is not None:
                        if mark_duplicates and int(X_sparse_full[r, -1]): # duplicate row indicator
                            sessions[r] = ' 0.3' + ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                        else:
                            sessions[r] = ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
                        lables[r] = str(y[r])
                        if weights is not None:
                            lable_weights[r] = str(weights[y[r]-1])
                    else:
                        sessions[r] = ' |site' + " {}:{}".format(int(vocabulary[c]), X_sparse[r,c])
            #elif c > X_sparse.shape[1] - sites_columns_num and c < X_sparse.shape[1] - 10:
                #if r in sitetimes:
                    #sitetimes[r] += " {}:{}".format(int(c - sites_columns_num - len(mycolumns)+1), float(X_sparse[r,c]))
                #else:
                    #sitetimes[r] = ' |isitetime' + " {}:{}".format(int(c - sites_columns_num - len(mycolumns)+1), float(X_sparse[r,c]))
        
    
    return {"sites": sessions, "lables": lables, "lable_weights": lable_weights, "prediction": prediction, "day_of_week": day_of_week, \
                      "start_hour": start_hour, "daytime": daytime, \
                     "unique_site": unique_sites, "top30_portion": top30_portion, \
                    "bot30_portion": bot30_portion, "fb_portion": fb_portion, \
                    "youtube_portion": youtube_portion, "site_longest_time": site_longest_time, \
                    "session_timespan": session_timespan, "sitetimes": sitetimes, "sequence": sequence}



def vw_to_file(sites, out_file, features={}, lables={}, lable_weights={},  quiet=True):   
    vw_writer = open(out_file, 'w')
    final_vw = {}
    gen_features = []
    
    if not quiet:
        print "Features:", features.keys()
        
    for r in sorted(sites.keys()):
        if r in lables:
            final_vw[r] = lables[r]
        else:
            final_vw[r] = ""
        if r in lable_weights:
            final_vw[r] += " {}".format(lable_weights[r])
        final_vw[r] += sites[r] #+ " |features"
        for fname, feature in features.items():
            if fname in ["youtube_portion", "fb_portion", "top30_portion", "bot30_portion", \
                                         "unique_sites"] and r in feature:
                gen_features.append(feature[r])
                continue
            if r in feature:
                final_vw[r] += feature[r]        
            
        if len(gen_features):
            final_vw[r] += " |features"
            for gf in gen_features:
                final_vw[r] += gf
        gen_features = []
        
        #if "prediction" in features and r in features["prediction"]:
            #final_vw[r] += features["prediction"][r]
        
        vw_writer.write(final_vw[r] + "\n")
        
    vw_writer.close()
    
    
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)



In [5]:
def calc_predictions(train_data, test_data, site_dic, user_dic, min_users, max_users, permutations=False):
    train_row_users = {}
    test_row_users = {}
    
    sites_cols = ['site' + str(c) for c in range(1,10+1)]
    
    # Add predictions from the dataframe (based on uniquely visited site)
    for r, v in train_data[["prediction"]].iterrows():
        if int(v) != 0:
            train_row_users[r] = {int(v): 1}  
    
    for r, v in test_data[["prediction"]].iterrows():
        if int(v) != 0:
            test_row_users[r] = {int(v): 1}
    
    # Add predictions if a website in session was visited by less than num_users_for_prediction
    for r, row in train_data[sites_cols+["target"]].iterrows():
        if r in train_row_users:
            continue
        session_predictions = {}
        for site in row:
            predictions = set([])
            if site in site_dic and site in user_dic[int(row["target"])] \
                          and len(site_dic[site]) in range(min_users, max_users):
                predictions = set(site_dic[site])
            if len(predictions):
                for puser in predictions:
                    if puser in session_predictions:
                        session_predictions[puser] +=1
                    else:
                        session_predictions[puser] = 1
                #session_predictions |= predictions
        if len(session_predictions):
            train_row_users[r] = session_predictions
    
    
    for r, row in test_data[sites_cols].iterrows():
        if r in test_row_users:
            continue
        session_predictions = {}
        for site in row:
            predictions = set([])
            if site in site_dic and len(site_dic[site]) in range(min_users, max_users):
                predictions = set(site_dic[site])
            if len(predictions):
                for puser in predictions:
                    if puser in session_predictions:
                        session_predictions[puser] +=1
                    else:
                        session_predictions[puser] = 1
                #session_predictions |= predictions
        if len(session_predictions):
            test_row_users[r] = session_predictions
    
    if not permutations:
        return train_row_users, test_row_users
    
    #Identify sessions with identical sites sequence
    train_index_full = list(train_data.index)
    train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)

    test_index_full = list(test_data.index)
    test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)], keep=False)]\
                           [['site' + str(c) for c in range(1,10+1)]].index)
    
    train_user_dup_rows_dict = {}
    train_dup_row_users_dict = {}

    #test_dup_rows_dict = {} 

    
    
    for r, row in train_data.ix[train_index_dup][sites_cols+["target"]].iterrows():
        if row["target"] in train_user_dup_rows_dict:
            if tuple(row[sites_cols]) in train_user_dup_rows_dict[row["target"]]:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] += 1
            else:
                train_user_dup_rows_dict[row["target"]][tuple(row[sites_cols])] = 1 
        else:
            train_user_dup_rows_dict[row["target"]] = {tuple(row[sites_cols]): 1}

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            train_dup_row_users_dict[tuple(row[sites_cols])].add(row["target"])
        else:
            train_dup_row_users_dict[tuple(row[sites_cols])] = set([row["target"]])
    
    # Make predictions based on duplicate sessions
    for r, row in train_data.ix[train_index_dup][sites_cols].iterrows():        
        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            if r in train_row_users:
                pass #don't overwright predictions from the dataframe
                #train_row_users[r] += train_dup_row_users_dict[tuple(row[sites_cols])]
            else:
                train_row_users[r] = train_dup_row_users_dict[tuple(row[sites_cols])]
    
    for r, row in test_data.ix[test_index_dup][sites_cols].iterrows():  
        #if tuple(row[sites_cols]) in test_dup_rows_dict:
            #test_dup_rows_dict[tuple(row[sites_cols])] += 1
        #else:
            #test_dup_rows_dict[tuple(row[sites_cols])] = 1

        if tuple(row[sites_cols]) in train_dup_row_users_dict:
            if r in test_row_users:
                pass #don't overwright predictions from the dataframe
                #test_row_users[r] += train_dup_row_users_dict[tuple(row[sites_cols])]
            else:
                test_row_users[r] = train_dup_row_users_dict[tuple(row[sites_cols])]
    

    
    
    # Find users who visited 2, 3, 4 websites
    site_pairs = {}
    for r, row in train_data[sites_cols+["target"]].iterrows():
        unique_sites = Counter(row).keys()
        if 0 in unique_sites:
            del unique_sites[unique_sites.index(0)]
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if tuple(subset) in site_pairs:
                    site_pairs[tuple(subset)].add(row["target"])
                else:
                    site_pairs[tuple(subset)] = set([row["target"]])
        #if len(unique_sites) > 3:
            #for subset in itertools.permutations(Counter(row).keys(), 4):
                #if tuple(subset) in site_pairs:
                    #site_pairs[tuple(subset)].add(row["target"])
                #else:
                    #site_pairs[tuple(subset)] = set([row["target"]])
    
    # Add predictions to train data based on 2 visited websites
    for r, row in train_data[sites_cols+["target"]].iterrows():
        if r in train_row_users:
            continue
        unique_sites = Counter(row).keys()
        if 0 in unique_sites:
            del unique_sites[unique_sites.index(0)]
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if tuple(subset) in site_pairs:
                    if r in train_row_users:
                        train_row_users[r] |= site_pairs[subset]
                    else:
                        train_row_users[r] = set(site_pairs[subset])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if tuple(subset) in site_pairs:
                    if r in test_row_users:
                        train_row_users[r] |= site_pairs[subset]
                    else:
                        train_row_users[r] = set(site_pairs[subset])
        #if len(unique_sites) > 3:
            #for subset in itertools.permutations(Counter(row).keys(), 4):
                #if tuple(subset) in site_pairs:
                    #if r in test_row_users:
                        #train_row_users[r].add(site_pairs[subset])
                    #else:
                        #train_row_users[r] = set(site_pairs[subset])
    
    # Add predictions to test data based on 2 visited websites
    for r, row in test_data[sites_cols].iterrows():
        if r in test_row_users:
            continue
        unique_sites = Counter(row).keys()
        if len(unique_sites) > 1:
            for subset in itertools.permutations(Counter(row).keys(), 2):
                if subset in site_pairs:
                    if r in test_row_users:
                        test_row_users[r] |= site_pairs[subset]
                    else:
                        test_row_users[r] = set(site_pairs[subset])
        if len(unique_sites) > 2:
            for subset in itertools.permutations(Counter(row).keys(), 3):
                if subset in site_pairs:
                    if r in test_row_users:
                        test_row_users[r] |= site_pairs[subset]
                    else:
                        test_row_users[r] = set(site_pairs[subset])
        #if len(unique_sites) > 3:
            #for subset in itertools.permutations(Counter(row).keys(), 4):
                #if subset in site_pairs:
                    #if r in test_row_users:
                        #test_row_users[r].add(site_pairs[subset])
                    #else:
                        #test_row_users[r] = set(site_pairs[subset])
        
    
    
    return train_row_users, test_row_users

In [6]:
def predictions_to_vw(predictions):
    new_pred = {}
    
    for row, pred in [[k, v.items()] for k, v in predictions.items() if len(v) ==2]:
        if pred[0][1] != pred[1][1]:
            print "Predictions probabilities are not equal! Breaking!", pred
            break
        new_pred[row] = " |aprediction " + str(pred[0][0]) + ":50" + " " + str(pred[1][0]) + ":50"
    
    ###################
    for row, pred in [[k, v.items()] for k, v in predictions.items() if len(v) ==3]:
        a = pred[0][1]
        b = pred[1][1]
        c = pred[2][1]

        if a == b and b==c:
            new_pred[row] = " |aprediction " + str(pred[0][0]) + ":33" + " " + str(pred[1][0]) + ":33" + \
                                                                            " " + str(pred[2][0]) + ":33"
        else:
            sorted_preds = sorted(pred, key= lambda t: t[1], reverse=True)
            a = sorted_preds[0][1]
            b = sorted_preds[1][1]
            if a == b:
                new_pred[row] = " |aprediction " + str(sorted_preds[0][0]) + ":50" + " " + \
                                                        str(sorted_preds[1][0]) + ":50"
            else:
                new_pred[row] = " |aprediction " + str(sorted_preds[0][0]) + ":100"      
    
    
    #####################
    for row, pred in [[k, v.items()] for k, v in predictions.items() if len(v) ==4]:
        a = pred[0][1]
        b = pred[1][1]
        c = pred[2][1]
        d = pred[3][1]

        if a == b and b==c and c==d:
            new_pred[row] = " |aprediction " + str(pred[0][0]) + ":25" + " " + str(pred[1][0]) + ":25" + \
                                       " " + str(pred[2][0]) + ":25" + " " + str(pred[3][0]) + ":25"
        else:
            sorted_preds = sorted(pred, key= lambda t: t[1], reverse=True)
            a = sorted_preds[0][1]
            b = sorted_preds[1][1]
            c = sorted_preds[2][1]
            if a == b and b==c:
                new_pred[row] = " |aprediction " + str(sorted_preds[0][0]) + ":33" + " " + \
                                           str(sorted_preds[1][0]) + ":33" + " " + str(sorted_preds[2][0]) + ":33"
            else:
                sorted_preds2 = sorted(sorted_preds, key= lambda t: t[1], reverse=True)
                a = sorted_preds2[0][1]
                b = sorted_preds2[1][1]
                if a == b:
                    new_pred[row] = " |aprediction " + str(sorted_preds2[0][0]) + ":50" + " " + \
                                                        str(sorted_preds2[1][0]) + ":50"
                else:
                    new_pred[row] = " |aprediction " + str(sorted_preds2[0][0]) + ":100"
    
    return new_pred

In [7]:
def create_user_site_dic(train_data, site_freq_pkl):
    user_dic = {}
    site_dic = {}

    pkl_file = open(site_freq_pkl, 'rb')
    site_freq = pickle.load(pkl_file)
    top_sites = [v[1] for k, v in sorted(site_freq.items(), key=lambda t: t[1][1], reverse=True)[:0]]

    for i, v in train_data.iterrows():
        if v.target not in user_dic:
            user_dic[v.target] = {}
        for site in ['site' + str(i) for i in range(1,11)]:
            if v[site] != 0 and v[site] not in top_sites:
                if v[site] in user_dic[v.target]:
                    user_dic[v.target][v[site]] +=1
                else:
                    user_dic[v.target][v[site]] = 1

            if v[site] in site_dic:
                site_dic[v[site]].add(v.target)
            else:
                site_dic[v[site]] = set([v.target])
    
    return user_dic, site_dic

In [8]:
def text_classifier(vectorizer, transformer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
            ("transformer", transformer),
            ("classifier", classifier)]
        )

# Let's Start

In [66]:
def experiment(existing=False, submission=False, weights=[]):

    accuracy = 0
    best_accuracy = 0
    n_best = 0
    experiment_counter = 0
    first_pass = True
    experiment_weights = []
    
    folder = 'kaggle_data/'
    handler = '_idf_w8_exp'

    while accuracy < 0.7:
        if first_pass:
            if len(weights):
                y_train_weights = weights
                y_weights = weights
            else:
                y_train_weights = [1.0] * 550
                y_weights = [1.0] * 550
            if existing:
                print "Loading existing data"
                with open(folder+'train_part'+handler+'.pkl', 'rb') as f:
                    train_part_vw = pickle.load(f)
                with open(folder+'valid'+handler+'.pkl', 'rb') as f:
                    valid_vw = pickle.load(f)
                with open(folder+'train'+handler+'.pkl', 'rb') as f:
                    train_vw = pickle.load(f)
                with open(folder+'test'+handler+'.pkl', 'rb') as f:
                    test_vw = pickle.load(f)
                with open(folder+'class_encoder'+handler+'.pkl', 'rb') as f:
                    class_encoder = pickle.load(f)
                y=pd.read_csv(folder+'y'+handler+'.csv', header=None, squeeze=True)
                y_train=pd.read_csv(folder+'y_train'+handler+'.csv', header=None, squeeze=True)
                y_valid=pd.read_csv(folder+'y_valid'+handler+'.csv', header=None, squeeze=True)
            else:
                print "Loading data from CSV files"
                train_data = pd.read_csv('kaggle_data/full_train_w8.csv')
                #train_data = pd.read_csv('kaggle_data/full_train_w8_balanced.csv')
                test_data = pd.read_csv('kaggle_data/full_test.csv')

                #train_site_sequence = csr_matrix(train_data[['site' + str(c) for c in range(1,10+1)]].as_matrix(), dtype=int)
                #test_site_sequence = csr_matrix(test_data[['site' + str(c) for c in range(1,10+1)]].as_matrix(), dtype=int)
                
                #print "Calculating session time"
                # Additionally, let's calculate the percentage of session time spent by every site in session
                #site_times = calc_site_times_portions(train_data, test_data)

                # Convert site times to sparse format
                #site_times_sparse = site_times_to_sparse(site_times)
                #train_site_times_sparse = site_times_sparse[:len(train_data)]
                #test_site_times_sparse = site_times_sparse[len(train_data):]
                
                if submission:
                    print "Calculating predictions"
                    user_dic, site_dic = create_user_site_dic(train_data, "kaggle_data/site_freq.pkl")
                    train_predictions, test_predictions = calc_predictions(train_data, test_data, \
                                                       site_dic, user_dic, 2, 4)
                    train_add_predictions = predictions_to_vw(train_predictions)
                    test_add_predictions = predictions_to_vw(test_predictions)
                
                print "Creating sparse matrices"
                ######################
                train_test_df = pd.concat([train_data, test_data])
 
                session_length = 10
                
                #train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                                       #[['site' + str(c) for c in range(1,10+1)]+["target"]].index)
                #test_index_full = list(test_data.index)
                #test_index_dup = list(test_data[test_data.duplicated(subset=['site' + str(c) for c in range(1,session_length+1)], keep=False)]\
                                       #[['site' + str(c) for c in range(1,10+1)]].index)
                #train_duplicates_mask = np.transpose([np.in1d(train_index_full, train_index_dup).astype(int)])
                #test_duplicates_mask = np.transpose([np.in1d(test_index_full, test_index_dup).astype(int)])

                y = train_data["target"]

                train_test_df_sites = train_test_df[['site' + str(c) for c in range(1,10+1)]].fillna(0).astype('int')               
                train_test_df_sites_array = [" ".join(["s_"+str(s) for s in train_test_df_sites.as_matrix()[i] if int(s) != 0]) \
                                                                              for i in range(train_test_df_sites.shape[0])]

                tfidf = TfidfVectorizer(analyzer=str.split, max_df=1.0, ngram_range=(1,3)).fit(train_test_df_sites_array) #TfidfVectorizer()
                X_train_test_sparse = tfidf.transform(train_test_df_sites_array)

                X_train_sparse = X_train_test_sparse[:len(train_data)]
                X_test_sparse = X_train_test_sparse[len(train_data):]

                class_encoder = LabelEncoder().fit(y.astype('str'))
                y_for_vw = class_encoder.transform(y.astype('str')) + 1

                sites_columns_num = X_train_test_sparse.shape[1]
                inv_vocabulary = {v: int(re.search("s_(\d+)$", k).group(1)) for k, v in tfidf.vocabulary_.iteritems()}

                
                #####################

                mycolumns = [label for label in test_data[range(20, test_data.shape[1])]]

                train_features, test_features = features_to_sparse(train_data, test_data, mycolumns)

                #X_train_sparse, X_test_sparse = combine_sites_features_sparse(X_train_sparse, train_features, \
                                                                             #X_test_sparse, test_features, \
                                                                              #train_duplicates_mask, test_duplicates_mask,
                                                                              #train_site_times_sparse, test_site_times_sparse, \
                                                                             #train_site_sequence, test_site_sequence)
                
                X_train_sparse, X_test_sparse = combine_sites_features_sparse(X_train_sparse, train_features, \
                                                                             X_test_sparse, test_features)
                

                X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3, stratify=y_for_vw)

                
                print "Converting sparse matrices to vw-format"
                train_part_vw = sparse_matrix_to_vw(X_train, 0, inv_vocabulary, y_train, weights=y_train_weights, mycolumns = mycolumns)
                valid_vw = sparse_matrix_to_vw(X_valid, 0, inv_vocabulary, y_valid, mycolumns = mycolumns)
                train_vw = sparse_matrix_to_vw(X_train_sparse, 0, inv_vocabulary, y_for_vw, weights=y_weights, mycolumns = mycolumns)
                test_vw = sparse_matrix_to_vw(X_test_sparse, 0, inv_vocabulary, mycolumns = mycolumns)
                
                if submission:
                    for k in train_add_predictions.keys():
                        if k not in train_vw["prediction"]:
                            train_vw["prediction"][k] = train_add_predictions[k]
                        else:
                            print "ERROR! Same key!"

                    for k in test_add_predictions.keys():
                        if k not in test_vw["prediction"]:
                            test_vw["prediction"][k] = test_add_predictions[k]
                        else:
                            print "ERROR! Same key!"
                
                print "Saving vw files"
                with open(folder+'train_part'+handler+'.pkl', 'wb') as f:
                    pickle.dump(train_part_vw, f)
                with open(folder+'valid'+handler+'.pkl', 'wb') as f:
                    pickle.dump(valid_vw, f)
                with open(folder+'train'+handler+'.pkl', 'wb') as f:
                    pickle.dump(train_vw, f)
                with open(folder+'test'+handler+'.pkl', 'wb') as f:
                    pickle.dump(test_vw, f)
                with open(folder+'class_encoder'+handler+'.pkl', 'wb') as f:
                    pickle.dump(class_encoder, f)

                y.to_csv(folder+'y'+handler+'.csv', index=False, header=False)
                pd.DataFrame(y_train).to_csv(folder+'y_train'+handler+'.csv', index=False, header=False)
                pd.DataFrame(y_valid).to_csv(folder+'y_valid'+handler+'.csv', index=False, header=False)

            first_pass = False

            ########################
    
        
        
        keys = ['day_of_week', 'daytime', 'prediction', 'start_hour', 'bot30_portion', 'top30_portion']
        #, 'youtube_portion', 'fb_portion', 'sitetimes', 'sequence']

        vw_to_file(train_part_vw["sites"], folder+'train_part'+handler+'.vw', \
                   features={x:train_part_vw[x] for x in keys}, \
                   lables=train_part_vw["lables"], lable_weights=train_part_vw["lable_weights"], quiet=True)
        vw_to_file(valid_vw["sites"], folder+'valid'+handler+'.vw', features={x:valid_vw[x] for x in keys}, \
                   lables=valid_vw["lables"], quiet=True)
        vw_to_file(train_vw["sites"], folder+'train'+handler+'.vw', features={x:train_vw[x] for x in keys}, \
                   lables=train_vw["lables"], lable_weights=train_vw["lable_weights"], quiet=True)
        vw_to_file(test_vw["sites"], folder+'test'+handler+'.vw', features={x:test_vw[x] for x in keys}, quiet=True)
        
        f = open(folder+'train_part'+handler+'.vw')
        train_part_file = f.readlines()
        f.close()

        f = open(folder+'train'+handler+'.vw')
        train_file = f.readlines()
        f.close()

        f = open(folder+'valid'+handler+'.vw')
        valid_file = f.readlines()
        f.close()

        f = open(folder+'test'+handler+'.vw')
        test_file = f.readlines()
        f.close()
        
        model = VW(oaa=550, passes=5, b=27, convert_to_vw=False, \
                              cubic="sbc", q="sd", random_seed=7)
        
        if submission:
            model.fit(train_file)
            predictions = model.predict(test_file)
            t_submission = pd.DataFrame(predictions.astype(int)-1)
            vw_subm = class_encoder.inverse_transform(t_submission)
            write_to_submission_file(vw_subm,
                         'kaggle_data/29vw_submission_exp.csv')
            print "Finished creating submission.\n"
            return None
        else:
            #model.fit(train_part_file)
            #predictions = model.predict(valid_file)
            #accuracy = accuracy_score(y_valid, predictions)
            
            !vw --oaa=550 -d {folder}train_part{handler}.vw -f {folder}initial_model{handler}.model -b 27 -c -k --passes=30 \
            --decay_learning_rate 0.9 --initial_t 0.002337045080352835 -l 0.5416950450219994 \
            --power_t 0.5 --loss_function='logistic' --l1=1e-10 --l2=1e-10 -q "sd" -q "sb" --cubic="sbc" \
            --stage_poly --batch_sz 12148 --batch_sz_no_doubling
            
            !vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
            -p {folder}vw_valid_pred{handler}.csv --quiet

            vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
            accuracy = accuracy_score(y_valid, vw_valid_pred.values)
            print "Accuracy:", accuracy
            return None

            #!vw --oaa=550 -d {folder}train_part{handler}.vw \
            #-f {folder}initial_model{handler}.model -b 28 -c -k \
            #--passes=5 \
            #-q "sd" -q "sb" --cubic="sbc"  \
            #--keep "s" --keep "b" --keep "c" --keep "d" --keep "a" --quiet

            #!vw -i {folder}initial_model{handler}.model  -t -d {folder}valid{handler}.vw \
            #-p {folder}vw_valid_pred{handler}.csv --quiet

            #vw_valid_pred = pd.read_csv(folder+'vw_valid_pred'+handler+'.csv', header=None)
            #accuracy = accuracy_score(y_valid, vw_valid_pred.values)
            
            print "Experiment #", experiment_counter, "Accuracy:", accuracy
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                print "BEST Accuracy! #", n_best, "\n"
                multiplier = 0.001
                global_experiment_weights.append(y_train_weights)
                n_best +=1
                
                #M = confusion_matrix(y_valid, vw_valid_pred.values)
                #M = confusion_matrix(y_valid, predictions)
                #M_normalized = M.astype('float') / M.sum(axis=1)[:, np.newaxis]

            else:
                #y_train_weights = global_experiment_weights[-1]
                multiplier = multiplier - 0.00001
                if multiplier == 0:
                    print "Can't optimize further"
                    return None
            
            countery = Counter(y_train)
            confusion = {}
            M = confusion_matrix(y_valid, predictions)
            M_normalized = M.astype('float') / M.sum(axis=1)[:, np.newaxis]
            for (t,f), value in np.ndenumerate(M):
                if t != f and value > 0:
                    confusion[tuple([t, f])] = value
            
            one_confusion = {}
            for k, v in confusion.items():
                if tuple([k[1], k[0]]) not in confusion:
                    one_confusion[k] = v
            
            chances = [[[tf[0], countery[tf[0]+1]], [tf[1], countery[tf[1]+1]], [val]] for tf, val in sorted(one_confusion.items(), \
                key=lambda t: t[1], reverse = True)]
            
            #M = confusion_matrix(y_valid, predictions)
            ##M_normalized = M.astype('float') / M.sum(axis=1)[:, np.newaxis]
            #max_value = 0
            #maxtf = []
            #scores = {}
            #for (t,f), value in np.ndenumerate(M):
                #if t != f and value > 0:
                    #if value > max_value:
                        #max_value = value
                        #maxtf = [t, f]
                    #scores[tuple([t, f])] = value
            print "Total:", len(chances)
            usedt = {}
            for t, f, val in chances:
                if t[0] in usedt:
                    continue
                #print t, f, val
                #print "old weight", y_train_weights[t[0]]
                y_train_weights[t[0]] += 0.01
                usedt[t[0]] = 1
                #print "new weight", y_train_weights[t[0]]
                #break
            
            #print "Confusion", max_value, maxtf
            #print "current weight", y_train_weights[maxtf[0]], y_train_weights[maxtf[1]]
            #y_train_weights[maxtf[0]] += y_train_weights[maxtf[0]] * 0.05
            #y_train_weights[maxtf[1]] -= y_train_weights[maxtf[1]] * max_value
            #print "new weight", y_train_weights[maxtf[0]], y_train_weights[maxtf[1]], "\n"

            for r, y in train_part_vw["lables"].items():
                if train_part_vw["lable_weights"][r] != str(y_train_weights[int(y)-1]):
                    train_part_vw["lable_weights"][r] = str(y_train_weights[int(y)-1])

            experiment_counter +=1

In [62]:
global_experiment_weights = []

In [67]:
len(global_experiment_weights)

3

In [68]:
%%time
experiment(existing=True, weights=global_experiment_weights[1])

Loading existing data
creating quadratic features for pairs: sd sb 
creating cubic features for triples: sbc 
using l1 regularization = 1e-10
using l2 regularization = 1e-10
final_regressor = kaggle_data/initial_model_idf_w8_exp.model
Num weight bits = 27
learning rate = 0.541695
initial_t = 0.00233705
power_t = 0.5
decay_learning_rate = 0.9
creating cache_file = kaggle_data/train_part_idf_w8_exp.vw.cache
Reading datafile = kaggle_data/train_part_idf_w8_exp.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      396        1       29
1.000000 1.000000            2            2.0      155      396       13
1.000000 1.000000            4            4.0      223      396       13
1.000000 1.000000            8            8.0      279      396       37
1.000000 1.000000           16           16.0      282      396       37
1.000000

Exception AttributeError: "'example' object has no attribute 'finished'" in <bound method example.__del__ of <vowpalwabbit.pyvw.example object at 0x7fb210c17a48>> ignored
Exception KeyboardInterrupt in <bound method VW.__del__ of {'b': 27, 'random_seed': 7, 'oaa': 550, 'cubic': 'sbc', 'quiet': True, 'q': 'sd'}> ignored


ValueError: No columns to parse from file

- window 8, sbcda + bot30 + top30
Experiment # 0 Accuracy: 0.592839935352
Experiment # 0 Accuracy: 0.591223736773
Experiment # 0 Accuracy: 0.591925105968
Experiment # 0 Accuracy: 0.59140670265
Experiment # 0 Accuracy: 0.593876741988
Experiment # 0 Accuracy: 0.592016588906
Experiment # 0 Accuracy: 0.592352026347
Experiment # 0 Accuracy: 0.592656969475
Experiment # 0 Accuracy: 0.592230049096
Experiment # 0 Accuracy: 0.591864117342

- window 8, sbcda + bot30
Experiment # 0 Accuracy: 0.592321532034
Experiment # 0 Accuracy: 0.591376208337
Experiment # 0 Accuracy: 0.590827310707
Experiment # 0 Accuracy: 0.591955600281
Experiment # 0 Accuracy: 0.592809441039

- window 8, sbcdak
Experiment # 0 Accuracy: 0.58131308511
Experiment # 1 Accuracy: 0.579422437715
Experiment # 2 Accuracy: 0.580672704541
Experiment # 3 Accuracy: 0.58012380691
Experiment # 4 Accuracy: 0.577653767572
Experiment # 5 Accuracy: 0.578660079895
Experiment # 6 Accuracy: 0.579300460464

- window 8, sbcdai
Experiment # 0 Accuracy: 0.577836733449
Experiment # 1 Accuracy: 0.577135364255
Experiment # 2 Accuracy: 0.577348824444
Experiment # 3 Accuracy: 0.578538102644
Experiment # 4 Accuracy: 0.577165858567
Experiment # 5 Accuracy: 0.577226847193
Experiment # 6 Accuracy: 0.577531790321
Experiment # 7 Accuracy: 0.576159546245
Experiment # 8 Accuracy: 0.577379318757
Experiment # 9 Accuracy: 0.577318330131
Experiment # 10 Accuracy: 0.576677949562
Experiment # 11 Accuracy: 0.578294148141
Experiment # 12 Accuracy: 0.577653767572
Experiment # 13 Accuracy: 0.578629585582

- window 8, sbcda
Experiment # 0 Accuracy: 0.587107004544
Experiment # 1 Accuracy: 0.587259476108
Experiment # 2 Accuracy: 0.587594913549
Experiment # 3 Accuracy: 0.587899856677
Experiment # 4 Accuracy: 0.589577043881
Experiment # 5 Accuracy: 0.588875674687
Experiment # 6 Accuracy: 0.58793035099
Experiment # 7 Accuracy: 0.587411947672
Experiment # 8 Accuracy: 0.588052328241
Experiment # 9 Accuracy: 0.588143811179
Experiment # 10 Accuracy: 0.586558106913
Experiment # 11 Accuracy: 0.587655902174
Experiment # 12 Accuracy: 0.588357271369
Experiment # 13 Accuracy: 0.587747385113
Experiment # 14 Accuracy: 0.587228981795
Experiment # 15 Accuracy: 0.58674107279
Experiment # 16 Accuracy: 0.589211112128

In [15]:
train_data = pd.read_csv('kaggle_data/full_train_w8.csv')
test_data = pd.read_csv('kaggle_data/full_test.csv')

#### Dups

In [16]:
train_index_dup = list(train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)])]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]].index)

In [19]:
train_data[train_data.duplicated(subset=['site' + str(c) for c in range(1,10+1)])]\
                           [['site' + str(c) for c in range(1,10+1)]+["target"]+["prediction"]]

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target,prediction


In [17]:
train_data.drop(train_index_dup, inplace=True)

In [18]:
train_data.reset_index(inplace=True, drop=True)

In [107]:
train_df_sites = train_data[['site1', 'site2', 'site3', 
                                     'site4','site5', 
                                     'site6','site7', 'site8', 
                                     'site9', 'site10']].fillna(0).astype('int')
X_train_sparse = sparsematrix(train_df_sites.as_matrix())

Sparse Matrix - rows: 104888 columns: 21533


In [111]:
X_train_sparse.getrow(0).nonzero()[1]

array([  15,   16,   36,   46,   52, 3536, 5758], dtype=int32)

#### Dups end

In [20]:
max_number = np.max(Counter(train_data.target).values())
user_counter = Counter(train_data.target)

In [21]:
%%time
train_dataf = pd.DataFrame(columns=train_data.columns)
for user, num in user_counter.items():
    rep = int(max_number / float(num))
    train_dataf = train_dataf.append([train_data[train_data.target == user]]*rep)

CPU times: user 2min 14s, sys: 5min 51s, total: 8min 6s
Wall time: 8min 6s


In [31]:
train_dataf.reset_index(drop=True, inplace=True)

In [33]:
train_dataf.to_csv("kaggle_data/full_train_w8_balanced.csv", index=False)

In [40]:
np.max(Counter(train_dataf.target).values())

3813