In [1]:
import csv
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import PandasHelper as pdh

In [2]:
# Directory prefix (with trailing slash) prepended to every file path in this notebook.
DATA_PATH = 'data/'
# Main input CSV; read by the preview cell and by chunk_and_retrieve_indexes().
MAIN_FILE = DATA_PATH+'nielsen.csv'



In [3]:
# Load only the first 1000 rows of the main file as a quick preview.
df =pd.read_csv(MAIN_FILE,nrows=1000)

In [4]:
# Display the first two rows of the preview frame.
df.head(2)

Unnamed: 0,site_name,first_timeframe,dwell_time_s,device_id,visitor
0,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:00:30+00:00,15,bd5d8c2890622782d681c82f4dd84db4,True
1,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:00:40+00:00,1080,428fa91d6d741e1466b4bcd917dff4c2,True


# 1 - Create index tables

In [5]:
def chunk_and_retrieve_indexes():
    """Write the distinct device ids and site names found in each chunk of MAIN_FILE.

    MAIN_FILE is read 1,000,000 rows at a time. For chunk number ``i`` two
    one-column CSV files are written under DATA_PATH:

    * ``indexes/devices/devices_ix_<i>.csv`` -- unique ``device_id`` values
    * ``indexes/sites/sites_ix_<i>.csv``     -- unique ``site_name`` values

    Values are only de-duplicated within a chunk; the same value can appear in
    several per-chunk files. Returns nothing.
    """
    chunks = pd.read_csv(MAIN_FILE, chunksize=1000000)

    for i, chunk in enumerate(chunks):
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print(chunk.shape)
        devices_ix = pd.DataFrame(chunk.device_id.unique())
        sites_ix = pd.DataFrame(chunk.site_name.unique())
        devices_ix.to_csv(DATA_PATH + "indexes/devices/devices_ix_" + str(i) + ".csv")
        # The site file name used to be built as "sites_ix_.csv<i>.csv"; it now
        # follows the same "<prefix>_<i>.csv" pattern as the device files.
        sites_ix.to_csv(DATA_PATH + "indexes/sites/sites_ix_" + str(i) + ".csv")

def concatenate_df(df,path):
    """Append the CSV stored at ``path`` to ``df`` and drop duplicated rows.

    The first column of the CSV is read as its index. The rows of ``df`` come
    first, followed by the rows of the file; exact duplicate rows are removed
    (first occurrence kept) and the result is renumbered from 0.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    loaded = pd.read_csv(path, index_col=0)
    stacked = pd.concat([df, loaded])
    unique_rows = stacked.drop_duplicates()
    return unique_rows.reset_index(drop=True)

def _build_index_table(directory, column_names, output_path, verbose=False):
    """Merge every per-chunk index file in ``directory`` into one id table and save it.

    ``column_names`` is ``[id_column, value_column]``. The id is the row number
    of each distinct value in the merged, de-duplicated table.
    """
    merged = pd.DataFrame()
    # pdh.get_files is assumed to return bare file names inside ``directory``
    # (they are appended to the directory path below) -- TODO confirm.
    for file_name in pdh.get_files(directory):
        if verbose:
            print(file_name)
        merged = concatenate_df(merged, directory + file_name)

    if merged.shape[1] == 0:
        raise ValueError("no index files found in " + directory)

    # concatenate_df() returns a single value column with a fresh 0..n-1 index.
    # Turn that index into a real column so the two-name assignment below is
    # valid; without this step the assignment fails with a length mismatch.
    merged = merged.reset_index()
    merged.columns = column_names
    merged.to_csv(output_path)


def concatenate_index_files():
    """Combine the per-chunk index files into one table per entity.

    Reads every file under ``indexes/devices/`` and ``indexes/sites/`` (inside
    DATA_PATH), removes duplicate values across files and writes:

    * ``indexes/devices_ix.csv`` with columns ``id`` and ``device_mac``
    * ``indexes/sites_ix.csv``   with columns ``id`` and ``site_name``

    Device file names are printed as they are processed. Raises ValueError if
    one of the two directories yields no index data. Returns nothing.
    """
    _build_index_table(DATA_PATH + "indexes/devices/",
                       ['id', 'device_mac'],
                       DATA_PATH + "indexes/devices_ix.csv",
                       verbose=True)
    _build_index_table(DATA_PATH + "indexes/sites/",
                       ['id', 'site_name'],
                       DATA_PATH + "indexes/sites_ix.csv")

# 2 - Replace device and site names with ids


In [6]:
def merge_agg_with_device_and_sites_index():
    """Rewrite the main file as pairs of integer (device id, site id).

    The main CSV is read 1,000,000 rows at a time. Each chunk is merged with
    ``indexes/devices_ix.csv`` (``device_id`` == ``device_mac``) and then with
    ``indexes/sites_ix.csv`` (on ``site_name``); only the two resulting id
    columns, ``id_devices`` and ``id_sites``, are kept and cast to int32.
    Both merges use pandas' default join, so rows without a match in an index
    table are dropped.

    The combined result is written to ``nielsen_indexes.csv`` under DATA_PATH.
    Returns nothing.
    """
    chunks = pd.read_csv(MAIN_FILE, chunksize=1000000)
    df_devices_idx = pd.read_csv(DATA_PATH + "indexes/devices_ix.csv")
    df_sites_index = pd.read_csv(DATA_PATH + "indexes/sites_ix.csv")

    # Collect the reduced chunks and concatenate once at the end. Concatenating
    # inside the loop re-copies everything accumulated so far on every chunk.
    parts = []

    for i, chunk in enumerate(chunks):
        # "%"-formatting keeps the output identical on Python 2 and 3.
        print("%s %s" % (chunk.shape, i))
        chunk = pd.merge(chunk, df_devices_idx, left_on='device_id', right_on='device_mac',
                         suffixes=('_agg', '_devicesidx'))
        chunk = chunk[['id', 'site_name', 'first_timeframe']]
        # Both sides now carry an "id" column; the suffixes turn them into
        # id_devices (left) and id_sites (right).
        chunk = pd.merge(chunk, df_sites_index, on='site_name', suffixes=('_devices', '_sites'))
        chunk = chunk[['id_devices', 'id_sites']]
        parts.append(chunk.astype('int32'))

    # An input without any chunk still produces an (empty) output file.
    df = pd.concat(parts, axis=0, ignore_index=True) if parts else pd.DataFrame()
    print("merge over,starting saving to disk...")
    df.to_csv(DATA_PATH + '/nielsen_indexes.csv')
    print("saved to disk OK.")

# 3 - Aggregate per device and site

In [7]:
# Save the first 5,000,000 rows of the id-encoded file as a smaller "light" copy.
df_test = pd.read_csv(DATA_PATH+'/nielsen_indexes.csv',nrows=5000000,index_col=0)
df_test.to_csv(DATA_PATH+'/nielsen_indexes_light.csv')

In [8]:
def aggregate(chunk):
    """Return, for every device in ``chunk``, the list of distinct sites it appears with.

    Parameters
    ----------
    chunk : DataFrame with at least the columns ``id_devices`` and ``id_sites``.
        It is not modified.

    Returns
    -------
    DataFrame indexed by ``id_devices`` with a single column ``id_sites`` whose
    cells are Python lists of the distinct site ids, in ascending order.
    """
    # size() yields one row per distinct (device, site) pair, sorted by both
    # keys, without adding a helper column to the caller's frame.
    pair_counts = chunk.groupby(['id_devices', 'id_sites']).size().reset_index(name='count')

    sites_per_device = pair_counts.groupby('id_devices')['id_sites'].apply(lambda x: x.tolist())
    return pd.DataFrame(sites_per_device)

def aggregate_per_devices_and_sites():
    """Build the per-device site table for the whole id-encoded file.

    ``nielsen_indexes.csv`` is read 40,000,000 rows at a time. Each chunk is
    reduced with ``aggregate`` and the result is attached as an additional
    column, aligned on the device id; devices missing from a chunk get 0 in
    that chunk's column. The running shape and the chunk number are printed
    after every chunk, and the final table is written to
    ``nielsen_indexes_pivot.csv``. Returns nothing.
    """
    reader = pd.read_csv(DATA_PATH+'/nielsen_indexes.csv', chunksize=40000000, index_col=0)
    pivot = pd.DataFrame()

    for chunk_no, rows in enumerate(reader):
        per_device = aggregate(rows)
        # Column-wise concat: one new column per chunk, rows aligned on device id.
        pivot = pd.concat([pivot, per_device], axis=1, ignore_index=False)
        pivot = pivot.fillna(0)

        print("%s %s" % (pivot.shape, chunk_no))
    pivot.to_csv(DATA_PATH+'/nielsen_indexes_pivot.csv')

# 4 - Clean data


In [14]:
def combine_sites_id(df):
    """Collapse the per-chunk site columns of ``df`` into one array per row.

    Every cell equal to ``'0'``, ``0`` or NaN is blanked out, the remaining
    cells of each row are concatenated as text, and that text is converted
    with ``pdh.string_to_np_array`` (assumed to return a NumPy array, since
    its ``.size`` is read below -- TODO confirm against PandasHelper).

    Returns a new DataFrame with the same index as ``df`` and two columns:
    ``sites_id_array`` (the converted array) and ``sites_count`` (its size).
    """
    cols = df.columns.tolist()

    # Normalise every "no site" marker to an empty string so that it adds
    # nothing to the row-wise text concatenation below.
    df = df.replace('0', 0).replace(0, "").fillna("")

    # Summing string cells across a row concatenates them.
    df['sites_id'] = df[cols].astype(str).sum(axis=1)
    df['sites_id_array'] = df['sites_id'].apply(pdh.string_to_np_array)
    df['sites_count'] = df['sites_id_array'].apply(lambda x: x.size)

    # The former bare ``df.reset_index()`` call was dropped: its result was
    # discarded, so it only copied the frame. The index is returned unchanged.
    return df[['sites_id_array', 'sites_count']]
    
    
def clean_data():
    """Turn the pivot file into one site array and one site count per row.

    ``nielsen_indexes_pivot.csv`` is read 1,000,000 rows at a time (first
    column as index); each chunk is reduced with ``combine_sites_id``. After
    every chunk the cumulative shape and the chunk number are printed. The
    stacked result is written to ``nielsen_indexes_sites_per_devices.csv``.
    Returns nothing.
    """
    chunks = pd.read_csv(DATA_PATH+'nielsen_indexes_pivot.csv', chunksize=1000000, index_col=0)

    # Collect the reduced chunks and concatenate once at the end. Concatenating
    # inside the loop re-copies all rows gathered so far on every iteration.
    parts = []
    total_rows = 0

    for i, chunk in enumerate(chunks):
        part = combine_sites_id(chunk)
        parts.append(part)
        total_rows += part.shape[0]
        # Same progress line as before: cumulative (rows, columns) and chunk number.
        print("%s %s" % ((total_rows, part.shape[1]), i))

    # An input without any chunk still produces an (empty) output file.
    df = pd.concat(parts, axis=0) if parts else pd.DataFrame()
    print("Combining done, saving to file...")
    df.to_csv(DATA_PATH+"nielsen_indexes_sites_per_devices.csv")
    
    

In [None]:
# Run step 4 on the full pivot file; one progress line is printed per chunk.
clean_data()

(1000000, 2) 0
(2000000, 2) 1
(3000000, 2) 2
(4000000, 2) 3
(5000000, 2) 4
(6000000, 2) 5
(7000000, 2) 6
(8000000, 2) 7
(9000000, 2) 8
(10000000, 2) 9
(11000000, 2) 10
(12000000, 2) 11
(13000000, 2) 12
(14000000, 2) 13
(15000000, 2) 14
(16000000, 2) 15
(17000000, 2) 16
(18000000, 2) 17
(19000000, 2) 18


  exec(code_obj, self.user_global_ns, self.user_ns)


(20000000, 2) 19


  exec(code_obj, self.user_global_ns, self.user_ns)


(21000000, 2) 20
(21010205, 2) 21
