In [21]:
import pandas as pd
import sqlite3
from datetime import datetime
from itertools import product
import os

In [22]:
data = pd.read_csv('events.csv')

In [23]:
data.head()

Unnamed: 0,content_ID,device_ID,event_type,event_time
0,16,1,start,2016-01-01 00:00:01
1,27,2,start,2016-01-01 00:00:01
2,78,3,start,2016-01-01 00:00:01
3,78,3,end,2016-01-01 00:00:28
4,54,3,start,2016-01-01 00:00:28


In [24]:
data['event_time']=pd.to_datetime(data['event_time'])

In [25]:
df_start = data[data['event_type'] == 'start'].sort_values(by=['content_ID','device_ID','event_time'])
df_end = data[data['event_type'] == 'end'].sort_values(by=['content_ID','device_ID','event_time'])

In [26]:
df_start.head()

Unnamed: 0,content_ID,device_ID,event_type,event_time
564,1,1,start,2016-01-01 00:47:25
886,1,1,start,2016-01-01 01:14:34
1500,1,1,start,2016-01-01 02:06:22
4242,1,1,start,2016-01-01 05:55:26
4378,1,1,start,2016-01-01 06:06:54


In [27]:
df_end.head()

Unnamed: 0,content_ID,device_ID,event_type,event_time
573,1,1,end,2016-01-01 00:48:05
895,1,1,end,2016-01-01 01:15:14
1505,1,1,end,2016-01-01 02:07:02
4249,1,1,end,2016-01-01 05:56:06
4383,1,1,end,2016-01-01 06:07:34


In [28]:
len(df_start.index)

259182

In [29]:
len(df_end.index)

259182

In [31]:
df_start = df_start.reset_index(drop=True)
df_end = df_end.reset_index(drop=True)
df_start['event_id'] = df_start.index
df_start['start_time'] = df_start['event_time']
df_end['end_time']=df_end['event_time']
df_start = df_start.drop(["event_type","event_time"], axis=1)
df_start.head()

Unnamed: 0,content_ID,device_ID,event_id,start_time
0,1,1,0,2016-01-01 00:47:25
1,1,1,1,2016-01-01 01:14:34
2,1,1,2,2016-01-01 02:06:22
3,1,1,3,2016-01-01 05:55:26
4,1,1,4,2016-01-01 06:06:54


In [32]:
# The data is consistent.
out = (df_start[['device_ID','content_ID']] == df_end[['device_ID','content_ID']]).all(axis=1)
len(df_start[out].index)


259182

In [33]:
len(df_start)

259182

In [34]:
df_end = df_end.drop(["event_type","event_time","content_ID","device_ID"],axis=1)
merged = pd.concat([df_start,df_end],axis =1)

In [38]:
# Validation
len(merged[merged['start_time']< merged['end_time']])
merged.head()

Unnamed: 0,content_ID,device_ID,event_id,start_time,end_time
0,1,1,0,2016-01-01 00:47:25,2016-01-01 00:48:05
1,1,1,1,2016-01-01 01:14:34,2016-01-01 01:15:14
2,1,1,2,2016-01-01 02:06:22,2016-01-01 02:07:02
3,1,1,3,2016-01-01 05:55:26,2016-01-01 05:56:06
4,1,1,4,2016-01-01 06:06:54,2016-01-01 06:07:34


In [41]:
conn = sqlite3.connect("advertima.db")
os.chmod('advertima.db',0o755)# Opens file if exists, else creates file
cur = conn.cursor()
merged.to_sql(name="event", con=conn, if_exists="append", index=False)

Writing Persons CSV to database

In [44]:
person = pd.read_csv('persons.csv')
person['id']=person.index
person['appears'] = pd.to_datetime(person['appears'])
person['disappears'] = pd.to_datetime(person['disappears'])
person = person.sort_values(by=['device_id','appears'])
person.head()
person.to_sql(name="person", con=conn, if_exists="append", index=False)


Checking that the tables are created properly

In [45]:
sql = 'select * from event limit 5;'
sql2 = 'select * from person limit 5;'
cur.execute(sql) 
cur.fetchall()
print("="*100)
cur.execute(sql2)
cur.fetchall()



[(1, '2016-01-01 00:00:31', '2016-01-01 00:01:14', 44, 'male', 1),
 (1, '2016-01-01 00:01:03', '2016-01-01 00:01:24', 23, 'male', 2),
 (1, '2016-01-01 00:01:23', '2016-01-01 00:01:30', 30, 'male', 4),
 (1, '2016-01-01 00:01:25', '2016-01-01 00:02:48', 70, 'female', 16),
 (1, '2016-01-01 00:01:33', '2016-01-01 00:02:02', 63, 'male', 6)]

Creating index on tables to reduce latency during retrival.

In [19]:
sql = "CREATE INDEX content_idx ON event (content_ID);"
cur.execute(sql) 

OperationalError: index content_idx already exists

In [20]:
sql = "CREATE INDEX pdevice_idx ON person (device_id);"
cur.execute(sql) 

OperationalError: index pdevice_idx already exists

In [32]:
conn = sqlite3.connect("advertima.db")
cur = conn.cursor()
args = {'device_id' :1, 'content_id':100, 'start_time':"2016-01-01 00:47:25", 'end_time':"2016-01-10 06:50:03"}
sql = "select start_time,end_time from event\
       where device_id = {device_id} and content_id={content_id} \
       and max(start_time, '{start_time}') < min(end_time, '{end_time}')".format(**args)
content_output = pd.read_sql(sql,conn).reset_index(drop=True)
content_output['start_time'] = pd.to_datetime(content_output['start_time'],format='%Y-%m-%d %H:%M:%S')
content_output['end_time'] = pd.to_datetime(content_output['end_time'],format='%Y-%m-%d %H:%M:%S')
args['end_time'] = str(max(content_output['end_time']))
print(args['end_time'])
len(content_output.index)

2016-01-10 02:03:08


596

In [33]:
#args = {'device_id' :1, 'content_id':1, 'start_time':"2016-01-01 00:47:25", 'end_time':"2016-01-01 06:50:31"}
sql = "select * from person where device_ID = {device_id} and appears between\
'{start_time}' and '{end_time}';".format(**args)
cur.execute(sql) 
person_output = pd.read_sql(sql,conn).reset_index(drop=True)
type(person_output['appears'][0])
person_output['appears'] = pd.to_datetime(person_output['appears'],format='%Y-%m-%d %H:%M:%S')
person_output['disappears'] = pd.to_datetime(person_output['disappears'],format='%Y-%m-%d %H:%M:%S')
len(person_output.index)

97396

In [None]:
import time
t0 = time.time()
from itertools import permutations
a = list(range(len(person_output.index)))
b = list(range(len(content_output.index)))
#len(list(product(a,b)))
count = 0; age = 0; male =0; female =0;avg_age=1;gender_distribution=0;
for i,j in product(a,b):
    #print(i,j)
    if max(person_output['appears'][i],content_output['start_time'][j]) <= \
    min(person_output['disappears'][i],content_output['end_time'][j]):
            count+=1
            if avg_age:
                age += person_output['age'][i]
            if gender_distribution:
                if person_output['gender'][i] == 'male':
                    male +=1 
                elif person_output['gender'][i] == 'female':
                    female +=1 
                else:
                    pass
t1 = time.time()
print(t1-t0)
if age > 0:
    print('Avg age:',age/count)
if 0 < male or female > 0:
    print('male {} female {}'.format(male/count,female/count))
print('Count:',count)


3004.696813106537
Avg age: 50.691141639
Count: 8444


In [None]:
import time
t0 = time.time()
count = 0; age = 0; male =0; female =0;avg_age=1;gender_distribution=0;
for j in b:
    for i in a:
        if max(person_output['appears'][i],content_output['start_time'][j]) <= \
        min(person_output['disappears'][i],content_output['end_time'][j]):
            count+=1
            if avg_age:
                age += person_output['age'][i]
            if gender_distribution:
                if person_output['gender'][i] == 'male':
                    male +=1 
                elif person_output['gender'][i] == 'female':
                    female +=1 
                else:
                    pass
t1 = time.time()
print(t1-t0)
if age > 0:
    print('Avg age:',age/count)
if 0 < male or female > 0:
    print('male {} female {}'.format(male/count,female/count))
print('Count:',count)

In [None]:
import multiprocessing
import time
t0 = time.time()
count = 0; age = 0; male =0; female =0;avg_age=1;gender_distribution=0;
for j in b:
    for i in a:
        if max(person_output['appears'][i],content_output['start_time'][j]) <= \
        min(person_output['disappears'][i],content_output['end_time'][j]):
            count+=1
            if avg_age:
                age += person_output['age'][i]
            if gender_distribution:
                if person_output['gender'][i] == 'male':
                    male +=1 
                elif person_output['gender'][i] == 'female':
                    female +=1 
                else:
                    pass
t1 = time.time()
print(t1-t0)
if age > 0:
    print('Avg age:',age/count)
if 0 < male or female > 0:
    print('male {} female {}'.format(male/count,female/count))
print('Count:',count)

In [18]:
df_start['id']=df_start.index

In [19]:
df_start['id']

0              0
1              1
2              2
3              3
4              4
5              5
6              6
7              7
8              8
9              9
10            10
11            11
12            12
13            13
14            14
15            15
16            16
17            17
18            18
19            19
20            20
21            21
22            22
23            23
24            24
25            25
26            26
27            27
28            28
29            29
           ...  
259152    259152
259153    259153
259154    259154
259155    259155
259156    259156
259157    259157
259158    259158
259159    259159
259160    259160
259161    259161
259162    259162
259163    259163
259164    259164
259165    259165
259166    259166
259167    259167
259168    259168
259169    259169
259170    259170
259171    259171
259172    259172
259173    259173
259174    259174
259175    259175
259176    259176
259177    259177
259178    259178
259179    2591