In [1]:
import numpy as np
import pandas as pd
import time
import pickle
import os

In [2]:
def decodeTime(data):
    """Add ``day_of_week`` and ``time_of_day`` columns derived from ``data['time']``.

    The ``time`` column is treated as a count of minutes: it is split into
    whole days (reduced modulo 7) and the remaining minutes within the day.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or as an expression.
    """
    minutes_per_day = 24 * 60

    elapsed_days = data['time'] // minutes_per_day

    data['day_of_week'] = elapsed_days % 7
    data['time_of_day'] = data['time'] % minutes_per_day

    return data

# Load the check-in records and attach the decoded time columns.
# To run on the complete training set, read '../input/train.csv' instead.
data = pd.read_csv('../input/train09_100.csv').pipe(decodeTime)

In [3]:
# Histogram specifications as (lowest value, highest value, number of bins).
# Series.min()/.max() are used instead of the builtins so the scan is
# vectorized (no per-element Python iteration) and NaN entries are skipped.
bins_day_of_week = (data['day_of_week'].min(), data['day_of_week'].max(), 7)   # one bin per day
bins_time_of_day = (data['time_of_day'].min(), data['time_of_day'].max(), 48)  # nominally half an hour each
bins_accuracy = (data['accuracy'].min(), data['accuracy'].max(), 10)           # an unknown characteristic

In [4]:
def write_meta(bins, f):
    """Write a histogram specification to the open text file ``f``.

    The first three entries of ``bins`` are written one per line, in order,
    with no trailing newline after the last one.
    """
    fields = (bins[0], bins[1], bins[2])
    f.write('\n'.join(map(str, fields)))

# Make sure the output directory exists so a fresh checkout does not fail
# with FileNotFoundError on the first write.
os.makedirs('generated', exist_ok=True)

# One .meta file per histogram, each holding its (lo, hi, n) specification.
for meta_path, meta_bins in (
    ('generated/hist_day_of_week.meta', bins_day_of_week),
    ('generated/hist_time_of_day.meta', bins_time_of_day),
    ('generated/hist_accuracy.meta', bins_accuracy),
):
    with open(meta_path, 'w') as f:
        write_meta(meta_bins, f)

In [5]:
# Distinct place identifiers, in order of first appearance in `data`.
place_ids = pd.unique(data['place_id'])

In [6]:
def put_in_bin(value, bins, out_x):
    """Increment the counter in ``out_x`` for the bin that ``value`` falls into.

    Parameters
    ----------
    value : number
        Observation to place.
    bins : sequence
        ``(lo, hi, n)``: lower bound, upper bound and number of bins.
    out_x : indexable of counters
        Updated in place; the selected entry is increased by one.

    Raises
    ------
    IndexError
        If ``value`` maps to a negative bin (it lies below ``lo``) or past the
        end of ``out_x``.
    """
    lo, hi, n = bins[0], bins[1], bins[2]

    if hi == lo:
        # Degenerate range (every observation identical): there is nothing to
        # scale, and dividing by (hi - lo) would fail, so use the first bin.
        pos = 0
    else:
        # Linear map sending lo to bin 0 and hi to bin n - 1; int() truncates.
        pos = int((value - lo) / (hi - lo) * (n - 1))

    if pos < 0:
        # A negative index would silently wrap around and bump a bin at the
        # far end of the array; report it like any other out-of-range value.
        raise IndexError('value %r is below the histogram range starting at %r' % (value, lo))

    out_x[pos] += 1


def hist(data, bins):
    """Return the normalized histogram of ``data`` as a float array of length ``n``.

    Parameters
    ----------
    data : pandas.Series (or other one-dimensional array-like with ``.shape``)
        Observations to bin.
    bins : sequence
        ``(lo, hi, n)``: lower bound, upper bound and number of bins.

    Returns
    -------
    numpy.ndarray
        Fraction of observations per bin. All zeros when ``data`` is empty.

    Raises
    ------
    IndexError
        If any observation maps outside bins ``0 .. n - 1``.
    """
    lo, hi, n = bins[0], bins[1], bins[2]

    total = data.shape[0]
    if total == 0:
        # No observations: avoid the 0 / 0 division and report empty bins.
        return np.zeros(n)

    values = np.asarray(data)
    if hi == lo:
        # Degenerate range: everything lands in the first bin.
        pos = np.zeros(total, dtype=np.intp)
    else:
        # Same linear map as the scalar binning (lo -> 0, hi -> n - 1,
        # truncated toward zero), applied to the whole array at once.
        pos = ((values - lo) / (hi - lo) * (n - 1)).astype(np.intp)

    if pos.min() < 0 or pos.max() >= n:
        # np.bincount would reject negatives and silently grow the result for
        # large indices, so out-of-range observations are rejected up front.
        raise IndexError('value outside the histogram range [%s, %s]' % (lo, hi))

    return np.bincount(pos, minlength=n) / total

In [7]:
# Per-place statistics; every list gets one entry per place, in the order in
# which the places first appear in `data`.
list_of_probs = []
list_of_time = []
list_of_days = []
list_of_accuracies = []

GAP = 10000  # report progress every GAP places
start = 0
finish = 0

total_rows = data.shape[0]

# Split the frame once instead of re-filtering the whole table for every
# place (which costs rows x places comparisons). sort=False keeps the groups
# in first-appearance order, matching data['place_id'].unique().
places = data.groupby('place_id', sort=False)
n_places = places.ngroups

for i, (place_id, subdata) in enumerate(places):
    if i % GAP == 0:
        finish = time.time()
        if i > 0:
            velocity = GAP / (finish - start)
            print(i, round(velocity, 1), 'id/sec',  round((n_places - i) / velocity / 60 / 60, 1), 'hours left')
        start = time.time()

    # Prior probability: share of all rows that belong to this place
    list_of_probs.append({'id' : place_id, 'prob' : subdata.shape[0] / total_rows})

    # Histogram for days of a week
    list_of_days.append(hist(subdata['day_of_week'], bins_day_of_week))

    # Histogram for time of a day
    list_of_time.append(hist(subdata['time_of_day'], bins_time_of_day))

    # Histogram for accuracies
    list_of_accuracies.append(hist(subdata['accuracy'], bins_accuracy))

In [8]:
# Create the output directory if needed so the results of the long loop above
# are not lost to a FileNotFoundError at write time.
os.makedirs('generated', exist_ok=True)

# One row per place in every file; rows are in the order the places were processed.
pd.DataFrame(list_of_probs).to_csv('generated/prior_probabilities.csv')
pd.DataFrame(list_of_time).to_csv('generated/hist_time_of_day.csv')
pd.DataFrame(list_of_days).to_csv('generated/hist_day_of_week.csv')
pd.DataFrame(list_of_accuracies).to_csv('generated/hist_accuracy.csv')