In [1]:
import numpy as np
import pandas as pd
import pickle
from glob import glob
import re

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
folder = "new_kaggle_data/"

In [4]:
# Progress tracker from https://github.com/alexanderkuk/log-progress
def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [5]:
def get_site_index(csv_file):
    site_index = pd.read_csv(csv_file, header=None, names=['siteid', 'site'], index_col=1)
    site_dic = site_index.to_dict()
    return site_dic["siteid"]

In [6]:
test_df = pd.read_csv(folder+'test_sessions.csv', \
                      parse_dates=range(10,20), \
                      infer_datetime_format=True)

In [7]:
len(test_df)

128877

In [9]:
site_index = get_site_index(folder+"site_indexes.txt")
print len(site_index)

57654


In [10]:
id_site_dic = {}
for k, v in site_index.items():
    id_site_dic[v] = k

In [11]:
data = []
for session in test_df.values:
    for i in range (0, 10):
        if int(session[i]) != 0:
            row = [session[10 + i], id_site_dic[int(session[i])]]
            data.append(row)
data = pd.DataFrame(data, columns=["timestamp", "site"])

data.to_csv(folder+"full_test_temp.csv", index=False)

In [5]:
def sitefreq(sites, site_freq={}, site_index={}):
    if not len(site_index):
        if len(site_freq):
            site_id = max(site_freq.items(), key=lambda t: t[1][0])[1][0] + 1
        else:
            site_id = 1
    for site in sites:
        if len(site_index):
            site_id = site_index[site]
                
        if site not in site_freq:
            site_freq[site] = [site_id, 1]
            site_id += 1
        else:
            site_freq[site][1] += 1
            
    return site_freq

In [6]:
def prepare_train_set_with_fe(csv_files_mask, feature_names, site_index={}, site_freq_path="", dataframe_csv="",
                                    session_length=10, window_size=10, session_time = 30,\
                                sort_in_session=False,
                             freq_only=False, prediction=False):
    
    global_window_size = window_size
    train_data = np.array([np.zeros(len(feature_names))])
    site_freq = {}
    session_num = 0
    temp_session_length = session_length
    user_count = 1
    
    if site_freq_path != "":
        pkl_file = open(site_freq_path, 'rb')
        site_freq = pickle.load(pkl_file)
    
    if len(site_freq):
        regex=re.compile(".*(facebook).*")
        facebook_ix = [site_freq[i][0] for i in [m.group(0) for l in site_freq.keys() for m in [regex.search(l)] if m]]
        regex=re.compile(".*(youtube).*")
        youtube_ix = [site_freq[i][0] for i in [m.group(0) for l in site_freq.keys() for m in [regex.search(l)] if m]]

        num_more_100 = len([[k, v] for k, v in sorted(site_freq.items(), key=lambda t: t[1][1], reverse=False) if v[1] > 1000])
        top30_ix = [v[0] for k, v in sorted(site_freq.items(), key=lambda t: t[1][1], reverse=True)[:num_more_100]]
        num_less_100 = len([[k, v] for k, v in sorted(site_freq.items(), key=lambda t: t[1][1], reverse=False) if v[1] < 50])
        bot30_ix = [v[0] for k, v in sorted(site_freq.items(), key=lambda t: t[1][1], reverse=False)[:num_less_100]]
    else:
        freq_only = True
        print "Building site freq."
    
    data = None
    if dataframe_csv != "":
        data = pd.read_csv(dataframe_csv, parse_dates=[0], infer_datetime_format=True)
        files = [0]
    else:
        files = glob(csv_files_mask)
        
    for userfile in log_progress(files, every=1):
        if dataframe_csv == "":
            window_size = global_window_size
            user_id = int(re.search('user(\d+)\.csv', userfile).group(1))
            data = pd.read_csv(userfile, \
                               parse_dates=[0], infer_datetime_format=True)

            if not len(data):
                continue
            data.sort_values("timestamp", inplace=True)
            data.reset_index(drop=True, inplace=True)
            
            user_count += 1
        session_hash = {}    
        
        if freq_only: site_freq = sitefreq(data.site, site_freq, site_index)
            
        if not freq_only:
        
            session = []
            timestamps = []
            next_session = []
            next_t_session = []
            next_s_session = []
            next_timestamps = []

            for i in range (0, len(data), window_size):  
                session += list(data[i:i+session_length].site.apply(lambda x: site_index[x]))
                timestamps += list(data[i:i+session_length].timestamp)

                if window_size < session_length and sort_in_session:
                    ses_ts_zip = sorted(set(zip(session, timestamps)), key = lambda t: t[1])
                    session, timestamps = zip(*ses_ts_zip)
                    session = list(session)
                    timestamps = list(timestamps)

                while ((len(session) >= session_length) or (not len(data[i+window_size:]) and len(session))):
                    time_diff = [(timestamps[n+1] - timestamps[n]).total_seconds() for n in range(0, len(session)-1)]
                    session_timespan = (max(timestamps) - min(timestamps)).total_seconds()
                    next_session = []
                    next_timestamps = []
                    
                    while session_timespan > session_time*60 or len(session) > session_length:
                        next_session.insert(0, session.pop())
                        next_timestamps.insert(0, timestamps.pop())
                        time_diff = [(timestamps[n+1] - timestamps[n]).total_seconds() for n in range(0, len(session)-1)]
                        session_timespan = (max(timestamps) - min(timestamps)).total_seconds()

                    session = tuple(session)

                    if session not in session_hash or dataframe_csv != "":                      
                        session_hash[session] = 1
                        session = list(session)                       
                                                
                        if dataframe_csv != "" and sort_in_session: #need to sort sites in sessions by timestamp in test data
                            ses_ts_zip = sorted(set(zip(session, timestamps)), key = lambda t: t[1])
                            session, timestamps = zip(*ses_ts_zip)
                            session = list(session)
                            timestamps = list(timestamps)
                            time_diff = [(timestamps[n+1] - timestamps[n]).total_seconds() for n in range(0, len(session)-1)]
                            session_timespan = (max(timestamps) - min(timestamps)).total_seconds()
                        
                        num_unique_sites = len(set(session))
                        start_hour = min(timestamps).hour
                        day_of_week = min(timestamps).weekday()
                        
                        #сайт, на котором пользователь находился дольше всего в сессии
                        if len(session) == 1:
                            site_longest_time = session[0]
                        else:
                            site_longest_time = session[time_diff.index(max(time_diff))]
                        
                        #доля facebook в сессии по времени
                        facebook_in_session = np.where(np.in1d(session, facebook_ix) == True)[0]
                        facebook_times = [time_diff[t] for t in facebook_in_session if t < len(time_diff)]
                        fb_portion = sum(facebook_times)/session_timespan if len(facebook_times) and session_timespan else 0
                        #print(facebook_in_session)

                        #доля youtube в сессии по времени
                        youtube_in_session = np.where(np.in1d(session, youtube_ix) == True)[0]
                        youtube_times = [time_diff[t] for t in youtube_in_session if t < len(time_diff)]
                        youtube_portion = sum(youtube_times)/session_timespan if len(youtube_times) and session_timespan else 0

                        #доля топ30 сайтов в сессии по времени
                        top30_in_session = np.where(np.in1d(session, top30_ix) == True)[0]
                        top30_times = [time_diff[t] for t in top30_in_session if t < len(time_diff)]
                        top30_portion = sum(top30_times)/session_timespan if len(top30_times) and session_timespan else 0
                        
                        #доля бот30 сайтов в сессии по времени
                        bot30_in_session = np.where(np.in1d(session, bot30_ix) == True)[0]
                        bot30_times = [time_diff[t] for t in bot30_in_session if t < len(time_diff)]
                        bot30_portion = sum(bot30_times)/session_timespan if len(bot30_times) and session_timespan else 0

                        #время суток начала сессии: 5-12 утро, 12-17 - день, 18-23 вечер, 0-5 ночь
                        if start_hour in range(5,12):
                            daytime = 0
                        elif start_hour in range(12, 18):
                            daytime = 1
                        elif start_hour > 18:
                            daytime = 2
                        elif start_hour < 5:
                            daytime = 3
                            
                        if dataframe_csv != "":
                            session_length = temp_session_length
                            

                        session_prediction = 0
                        if prediction:
                            for site in session:
                                if site in site_user_dic and len(site_user_dic[site]) == 1:
                                    session_prediction = list(site_user_dic[site])[0]
                                    break   

                        session.extend([0] * (session_length - len(session)) + \
                                       timestamps + [0] * (session_length - len(timestamps)) + \
                                       time_diff + \
                                       [0]*(session_length - len(time_diff) - 1) + \
                                       [session_timespan, num_unique_sites, site_longest_time, start_hour, day_of_week, daytime, fb_portion,\
                                youtube_portion, top30_portion, bot30_portion, session_prediction])
                        if dataframe_csv == "":
                            session.extend([user_id])
                        

                        train_data = np.concatenate((train_data, np.array([session])))
                        session_num += 1

                    session = next_session
                    timestamps = next_timestamps
            
            if len(next_session):
                print("ERROR! next_session left!")
                print(session)
                return None

    if freq_only:
        with open(folder+'site_freq.pkl', 'wb') as site_freq_pkl:
            pickle.dump(site_freq, site_freq_pkl)
        return site_freq
    
    train_data = np.delete(train_data, 0, 0)
    train_data = pd.DataFrame(train_data, columns=feature_names)
    return train_data

In [7]:
feature_names = ['site' + str(i) for i in range(1,11)] + ['time' + str(i) for i in range(1,11)] + \
                ['time_diff' + str(j) for j in range(1,10)] + \
                ['session_timespan', '#unique_sites', 'site_longest_time', 'start_hour', 'day_of_week', 'daytime', 'fb_portion',\
                 'youtube_portion', 'top30_portion', 'bot30_portion', 'prediction', 'target']

In [15]:
%%time
site_freq  = prepare_train_set_with_fe(folder+'train/*',
                                   feature_names=feature_names, 
                                            freq_only=True, site_index = site_index, session_length=10)

Building site freq.
CPU times: user 19.9 s, sys: 1.75 s, total: 21.6 s
Wall time: 19.7 s


In [17]:
freqs = [v[1] for v in site_freq.values()]
print min(freqs), max(freqs), np.mean(freqs), np.median(freqs), len(freqs)

1 133905 58.2483637928 2.0 44157


In [18]:
train_data_initial = pd.read_csv(folder+"train_sessions.csv").fillna(0)

In [19]:
%%time
user_site_dic = {}
site_user_dic = {}

pkl_file = open(folder+"site_freq.pkl", 'rb')
site_freq = pickle.load(pkl_file)
print "Rows:", len(train_data_initial)
for i, v in train_data_initial.iterrows():
    userid = int(v.user_id)
    if userid not in user_site_dic:
        user_site_dic[userid] = {}
    for site in ['site' + str(i) for i in range(1,11)]:
        ssite = int(v[site])
        if ssite != 0:
            if ssite in user_site_dic[userid]:
                user_site_dic[userid][ssite] +=1
            else:
                user_site_dic[userid][ssite] = 1
        
        if ssite in site_user_dic:
            site_user_dic[ssite].add(userid)
        else:
            site_user_dic[ssite] = set([userid])

Rows: 257014
CPU times: user 50.2 s, sys: 324 ms, total: 50.5 s
Wall time: 50.1 s


In [22]:
%%time
train_data  = prepare_train_set_with_fe(folder+'train/*',
                                feature_names=feature_names, 
                                site_freq_path=folder+"site_freq.pkl", \
                                session_length=10, prediction=True, site_index = site_index)

CPU times: user 4h 9s, sys: 18min 22s, total: 4h 18min 31s
Wall time: 4h 18min 30s


In [23]:
%%time
test_feature_names = feature_names[:-1]
test_data  = prepare_train_set_with_fe(folder+'train/*', dataframe_csv=folder+"full_test_temp.csv",
                                feature_names=test_feature_names, site_index = site_index, \
                                site_freq_path=folder+"site_freq.pkl", session_length=10, \
                                prediction=True, sort_in_session=True)

CPU times: user 1h 13min 25s, sys: 1min 17s, total: 1h 14min 42s
Wall time: 1h 14min 41s


In [24]:
test_data.to_csv(folder+"full_test.csv", index=False)
train_data.to_csv(folder+"full_train.csv", index=False)

# Vowpal Wabbit