In [3]:
import csv
import pandas as pd
from os import listdir
from os.path import isfile, join

In [4]:
# Raw input CSV; it is streamed in chunks by chunk_and_aggregate() because of its size.
MAIN_FILE = 'D:/nielsen/data/nielsen.csv' # total: 167,944,230 rows
# Folder holding every intermediate and final CSV produced by this notebook.
# NOTE(review): absolute local path - adjust before running on another machine.
DATA_PATH = 'D:/nielsen/data'

In [5]:
# Load only the first 1000 rows of the raw file to inspect its structure.
df =pd.read_csv(MAIN_FILE,nrows=1000)

In [6]:
# Preview the first rows of the sample loaded above.
df.head()

Unnamed: 0,site_name,first_timeframe,dwell_time_s,device_id,visitor
0,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:00:30+00:00,15,bd5d8c2890622782d681c82f4dd84db4,True
1,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:00:40+00:00,1080,428fa91d6d741e1466b4bcd917dff4c2,True
2,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:00:55+00:00,15,e23bf2f0a5890f45e2494e1f5f3b92bb,False
3,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:00:55+00:00,15,b83023972532c891861cb8b2723a2339,True
4,BF Karlsruhe Kaiserstr (1122),2014-12-31 23:01:20+00:00,15,c08478aa5c8b9c16737881c976548107,True


## 1 - group data per device and site

In [7]:
# Step 1 - Aggregate data per device_id and site name

def chunk_and_aggregate():
    """Stream MAIN_FILE in chunks of 2,000,000 rows and aggregate each chunk.

    Every chunk is handed to aggregate_per_device_and_site() together with
    its zero-based position in the file.
    """
    reader = pd.read_csv(MAIN_FILE, chunksize=2000000)
    chunk_no = 0
    for part in reader:
        aggregate_per_device_and_site(part, chunk_no)
        chunk_no += 1
    

def aggregate_per_device_and_site(df,i):
    """Count rows per (device_id, site_name) in one chunk and save them as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        One chunk of the raw file; must contain the columns ``device_id``,
        ``site_name`` and ``first_timeframe``.
    i : int
        Chunk number, used as suffix of the output file name.

    The result is written, without a header row, to
    ``DATA_PATH/agg_per_device_and_site/agg_per_device_and_site<i>.csv``.
    That sub-folder is the one prepare_aggregation_file() later passes to
    concatenate_aggregation_files(), which reads the files with
    ``header=None``.
    """
    import os  # local import: the notebook only imports selected names from os

    out_dir = DATA_PATH + "/agg_per_device_and_site"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    agg = df.groupby(['device_id', 'site_name'])['first_timeframe'].count()
    # header=False is passed explicitly because the default of Series.to_csv
    # differs between pandas versions and the reader expects no header row.
    agg.to_csv(out_dir + "/agg_per_device_and_site" + str(i) + ".csv", header=False)
    

def concatenate_aggregation_files(folder_name, target_name,
                                  column_names=('device_id', 'site_name', 'shows')):
    """Stack every header-less CSV of a folder into one file and return it.

    Parameters
    ----------
    folder_name : str
        Folder appended verbatim to DATA_PATH, so it must start with "/".
    target_name : str
        File name (inside DATA_PATH) of the concatenated CSV.
    column_names : sequence of str or None, optional
        Names given to the columns of the header-less input files. The
        default matches the names that create_index_file() and
        merge_agg_with_device_and_sites_index() read back from the result.
        Pass None to keep pandas' positional column labels.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, re-indexed from 0.
    """
    path = DATA_PATH+folder_name
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
    names = list(column_names) if column_names is not None else None

    # Collect the pieces and concatenate once; growing a frame with concat
    # inside the loop copies all previous rows on every iteration.
    frames = []
    for filo in onlyfiles:
        df1 = pd.read_csv(path+"/"+filo, header=None, names=names)
        print(df1.shape)
        frames.append(df1)

    if frames:
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        # pd.concat refuses an empty list; keep producing an empty file.
        df = pd.DataFrame(columns=names)
    df.to_csv(DATA_PATH+"/"+target_name)
    return df

In [8]:
def prepare_aggregation_file():
    """Run step 1 end to end: aggregate every chunk, then merge the chunk files."""
    chunk_and_aggregate()
    source_folder = "/agg_per_device_and_site"
    target_file = "final_aggregation_per_device_and_site.csv"
    concatenate_aggregation_files(source_folder, target_file)

## 2 - Create Index files for device and sites

In [9]:
def create_index_file():
    """Write lookup files listing the distinct devices and the distinct sites.

    Reads ``final_aggregation_per_device_and_site.csv`` from DATA_PATH and
    writes two CSVs next to it:

    * ``index_devices.csv`` - one row per distinct ``device_id``, stored in a
      column named ``device_mac``;
    * ``index_sites.csv`` - one row per distinct ``site_name``.

    The row index pandas writes alongside each value is what the merge step
    reads back as the numeric identifier.
    """
    df = pd.read_csv(DATA_PATH+"/final_aggregation_per_device_and_site.csv")

    index_device = pd.DataFrame(df.device_id.unique(), columns=['device_mac'])
    index_sites = pd.DataFrame(df.site_name.unique(), columns=['site_name'])

    index_device.to_csv(DATA_PATH+"/index_devices.csv")
    index_sites.to_csv(DATA_PATH+"/index_sites.csv")
    

## 3 - Merge aggregated data with index files to simplify data.

In [10]:
def merge_agg_with_device_and_sites_index():
    """Replace device ids and site names in the aggregation by numeric indexes.

    The aggregated file is streamed in chunks of 2,000,000 rows. Each chunk
    is joined with ``index_devices.csv`` (on ``device_id`` == ``device_mac``)
    and with ``index_sites.csv`` (on ``site_name``), keeping only the columns
    ``index_devices``, ``index_sites`` and ``shows``. The stacked result is
    written to ``aggregation_only_indexes.csv`` in DATA_PATH.

    Both joins are pandas' default inner joins, so rows without a match in an
    index file are dropped.
    """
    chunks=pd.read_csv(DATA_PATH+"/final_aggregation_per_device_and_site.csv",chunksize=2000000,index_col=0)

    # reset_index() turns the stored row number into a regular column named
    # 'index', which becomes the numeric identifier after the merge.
    df_devices_idx = pd.read_csv(DATA_PATH+"/index_devices.csv",index_col=0).reset_index()
    df_sites_index = pd.read_csv(DATA_PATH+"/index_sites.csv",index_col=0).reset_index()

    # Collect the merged chunks and concatenate once instead of re-copying
    # the growing result on every iteration.
    merged_chunks = []
    for chunk in chunks:
        chunk = pd.merge(chunk, df_devices_idx, left_on='device_id', right_on='device_mac',suffixes=('_agg', '_devicesidx'))
        chunk = chunk[['index','site_name','shows']]
        # Both sides now carry an 'index' column; the suffixes rename them
        # to 'index_devices' and 'index_sites'.
        chunk = pd.merge(chunk, df_sites_index, on='site_name', suffixes=('_devices','_sites'))
        merged_chunks.append(chunk[['index_devices', 'index_sites','shows']])

    if merged_chunks:
        df = pd.concat(merged_chunks, axis=0, ignore_index=True)
    else:
        # pd.concat refuses an empty list; keep producing an empty file.
        df = pd.DataFrame()
    df.to_csv(DATA_PATH+'/aggregation_only_indexes.csv')
        

## 4 - Pivot table and concatenate

In [11]:
def chunk_and_pivot():
    """Pivot the index-only aggregation chunk by chunk.

    ``aggregation_only_indexes.csv`` is read 1,000,000 rows at a time. Each
    chunk is cast to int32 and pivoted (rows: ``index_devices``, columns:
    ``index_sites``, values: ``shows``, pivot_table's default aggregation),
    then saved as ``pivot/pivot_indexes<n>.csv`` under DATA_PATH.
    """
    source_csv = DATA_PATH+'/aggregation_only_indexes.csv'
    reader = pd.read_csv(source_csv, chunksize=1000000, index_col=0)

    for part_no, part in enumerate(reader):
        pivoted = part.astype("int32").pivot_table(
            values='shows', index='index_devices', columns='index_sites')
        target_csv = DATA_PATH+"/pivot/pivot_indexes"+str(part_no)+".csv"
        pivoted.to_csv(target_csv)
       

In [16]:
def test_if_all_columns_are_stored(path):
    """Return the sorted distinct column labels found in a folder of CSVs.

    ``path`` is a folder name relative to DATA_PATH. Only the first 10 rows
    of each file are read, since just the header is needed. Labels are
    converted to int, so a non-numeric column name raises ValueError.
    """
    folder = DATA_PATH+"/"+path+"/"
    file_names = [entry for entry in listdir(folder) if isfile(join(folder, entry))]

    labels = []
    for name in file_names:
        sample = pd.read_csv(folder+name, index_col=0, nrows=10)
        labels.extend(sample.columns.tolist())

    return sorted(set(int(label) for label in labels))

In [18]:
# List every site-index column present across the pivot chunk files.
lista = test_if_all_columns_are_stored("pivot")
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(lista)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52]


In [14]:
def aggregate_pivot(file_list, i,source_folder, target_folder):
    """Stack several pivot CSVs into one int32 file.

    Parameters
    ----------
    file_list : list of str
        File names inside ``DATA_PATH/<source_folder>`` to combine.
    i : int
        Number used as suffix of the output file name.
    source_folder, target_folder : str
        Folder names relative to DATA_PATH.

    Columns missing from some of the files are filled with 0. The result is
    written to ``DATA_PATH/<target_folder>/pivot_indexes<i>.csv``.
    """
    frames = []
    for file_name in file_list:
        df1 = pd.read_csv(DATA_PATH+"/"+source_folder+"/"+file_name,index_col=0)
        # Progress log: name and shape of the file just read.
        print("%s %s" % (file_name, df1.shape))
        frames.append(df1)

    if frames:
        # One concat instead of re-copying the accumulated rows per file.
        # ignore_index=True replaces the index read from the files with a
        # fresh 0..n-1 range.
        # NOTE(review): if that index holds the device ids they are lost
        # here - confirm this is intended.
        df = pd.concat(frames, axis=0, ignore_index=True)
        # A single fill/cast after the concat covers both the gaps inside
        # each file and the columns absent from some files.
        df = df.fillna(0).astype('int32')
    else:
        # pd.concat refuses an empty list; keep producing an empty file.
        df = pd.DataFrame()
    df.to_csv(DATA_PATH+"/"+target_folder+"/pivot_indexes"+str(i)+".csv")
    
def split_list(alist, wanted_parts):
    """Cut ``alist`` into ``wanted_parts`` consecutive slices of near-equal size.

    Slice boundaries are computed with integer division, so when the length
    is not divisible the later slices receive the extra elements. Returns a
    list holding exactly ``wanted_parts`` slices (some possibly empty).
    """
    total = len(alist)
    pieces = []
    for part in range(wanted_parts):
        start = part * total // wanted_parts
        stop = (part + 1) * total // wanted_parts
        pieces.append(alist[start:stop])
    return pieces

    
def load_pivot_and_concate(group_size,source_folder,target_folder):
    """Combine the pivot files of a folder into a smaller number of files.

    The files of ``DATA_PATH/<source_folder>`` are divided by split_list()
    into ``group_size`` groups (the value is the number of groups, not the
    number of files per group) and each group is merged by aggregate_pivot()
    into ``DATA_PATH/<target_folder>``.
    """
    folder = DATA_PATH+"/"+source_folder
    file_names = []
    for entry in listdir(folder):
        if isfile(join(folder, entry)):
            file_names.append(entry)

    batches = split_list(file_names, group_size)
    for batch_no in range(len(batches)):
        aggregate_pivot(batches[batch_no], batch_no, source_folder, target_folder)


In [34]:
def clean_final_pivot_all(folder_name):
    """Keep, for every pivot file of a folder, the rows seen at several sites.

    For each CSV in ``DATA_PATH/<folder_name>``:

    1. missing values become 0 and everything is cast to int32;
    2. ``index_devices`` is used as row index when it is present as a column;
    3. column labels are converted to int and the columns are reordered so
       labels and data stay aligned;
    4. ``total`` is set to the number of columns with a non-zero value;
    5. rows with ``total`` > 1 are written to ``DATA_PATH/ubiquity<file name>``.

    One output file is produced per input file.
    """
    path = DATA_PATH+"/"+folder_name
    all_files = [f for f in listdir(path) if isfile(join(path, f))]

    for filo in all_files:
        df1 = pd.read_csv(path+"/"+filo, index_col=0)
        df1 = df1.fillna(0).astype('int32')

        # Some pivot files carry no 'index_devices' column (the row index was
        # reset when they were written); only re-index when it is there.
        if 'index_devices' in df1.columns:
            df1 = df1.set_index('index_devices')

        # Convert the labels first and sort the columns afterwards: assigning
        # an already-sorted label list would relabel the data incorrectly.
        df1.columns = [int(c) for c in df1.columns]
        df1 = df1.sort_index(axis=1)

        # count() would include the zeros introduced by fillna, so count the
        # non-zero cells instead.
        df1['total'] = (df1 != 0).sum(axis=1)
        df_ubiquity = df1[df1.total > 1]
        df_ubiquity.to_csv(DATA_PATH+"/ubiquity"+filo)

    

In [38]:
#clean_final_pivot_all("pivot3")
# Load one combined pivot file from the pivot3 folder for manual inspection.
df1 = pd.read_csv(DATA_PATH+"/pivot3/pivot_indexes1.csv", index_col=0)

In [39]:
# Preview the first rows of the pivot file loaded above.
df1.head()

Unnamed: 0,1,14,20,24,25,26,27,28,29,30,...,35,36,37,38,39,4,40,6,7,8
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load the first 1,000,000 rows of the final pivot and count, per device,
# the number of sites with at least one recorded show.
df = pd.read_csv(DATA_PATH+"/final_pivot_all.csv",nrows=1000000,index_col=0)

df = df.fillna(0)
df = df.astype('int32')
df.set_index('index_devices',inplace=True)
# Convert the labels to int, then reorder the columns together with their
# data; assigning a pre-sorted label list would mislabel the columns.
df.columns = [int(c) for c in df.columns]
df = df.sort_index(axis=1)
cols = df.columns.tolist()
print(cols)
# astype(bool) turns every non-zero count into True, so the row sum is the
# number of distinct sites for the device.
df['total']=df.astype(bool).sum(axis=1)
df.tail()


In [None]:
# Load the first pivot chunk file to inspect which columns it contains.
df = pd.read_csv(DATA_PATH+"/pivot/pivot_indexes0.csv",index_col=0)      

In [None]:
# Column labels come back from read_csv as strings, so this sort is lexicographic.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(sorted(df.columns.tolist()))