In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp

%matplotlib inline

---
## Auctions

In [2]:
# dtypes chosen to reduce memory usage when loading the auctions table
auction_dtypes = dict(
    device_id=np.int64,
    ref_type_id=np.int8,
    source_id=np.int8,
)

# Read the gzip-compressed CSV, parsing the 'date' column as datetimes.
auctions = pd.read_csv(
    '../data/auctions.csv.gzip',
    parse_dates=['date'],
    dtype=auction_dtypes,
    compression='gzip',
)

In [3]:
# Per-column dtypes and total memory of the auctions frame; memory_usage='deep'
# asks pandas to measure the real footprint instead of estimating it.
auctions.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47409528 entries, 0 to 47409527
Data columns (total 4 columns):
date           datetime64[ns]
device_id      int64
ref_type_id    int8
source_id      int8
dtypes: datetime64[ns](1), int64(1), int8(2)
memory usage: 813.8 MB


In [4]:
# Summary statistics for every column (include='all'), not only the numeric ones.
auctions.describe(include='all')

Unnamed: 0,date,device_id,ref_type_id,source_id
count,47409528,47409530.0,47409530.0,47409530.0
unique,47407803,,,
top,2019-04-23 21:46:17.200578,,,
freq,2,,,
first,2019-04-18 00:00:00.015050,,,
last,2019-04-26 23:59:59.969518,,,
mean,,4.614897e+18,1.895869,1.564526
std,,2.663068e+18,2.138371,1.823315
min,,40621410000000.0,1.0,0.0
25%,,2.296914e+18,1.0,1.0


In [5]:
# Preview the first rows of the auctions table.
auctions.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0


In [None]:
# Does it make sense to use ref_type? -> Yes, it does.
# Frequency of each ref_type_id value in the auctions table.
auctions['ref_type_id'].value_counts()

In [None]:
# Count the auction rows with ref_type_id == 7 whose device_id also appears in
# a row with ref_type_id == 1.
#
# The ref_type_id-indexed device_id series is built ONCE: every set_index call
# copies the whole frame, and doing it twice is wasteful on a table this size.
device_by_ref_type = auctions.set_index('ref_type_id')['device_id']

# device_id values of the ref_type_id == 7 rows (index is ref_type_id).
auctions_7 = device_by_ref_type.loc[7]

shared_device_rows = auctions_7.loc[
    auctions_7.isin(device_by_ref_type.loc[1])
].count()

# Drop the full-size intermediate so it does not linger in kernel memory.
del device_by_ref_type

shared_device_rows

---
## Installs

In [2]:
# Column -> dtype map for the installs CSV. The keys also decide which columns
# are read (see install_cols), so a commented-out entry is a column that is
# not loaded at all.
installs_dtypes = {
    'application_id': np.int32,
    'ref_type': np.int64,
    'ref_hash': np.int64,
    'click_hash': 'category',
    'attributed': 'category',
    'implicit': 'category',
    #'device_countrycode': 'object',
    'device_brand': 'category',
    'device_model': 'category',
    'session_user_agent': 'category',
    'user_agent': 'category',
    'event_uuid': 'object',
    'kind': 'category',
    'wifi': 'category',
    'trans_id': 'object',
    #'ip_address':'object',
    'device_language': 'category',
}

# Every typed column plus the timestamp column, which is parsed separately.
install_cols = [*installs_dtypes, 'created']

installs = pd.read_csv(
    '../data/installs.csv.gzip',
    compression='gzip',
    usecols=install_cols,
    dtype=installs_dtypes,
    parse_dates=['created'],
)

In [7]:
# Summary statistics for every column (include='all'), not only the numeric ones.
installs.describe(include='all')

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language
count,481511,481511.0,481511.0,481511.0,1142,481511,481511,276443.0,454619.0,466672,330768,103168,103168,294829,8933.0,453934.0
unique,480962,,,,1142,2,2,1084.0,6472.0,4537,13727,103083,183,2,4053.0,223.0
top,2019-04-20 16:34:38.892000,,,,zHI_3kjRHcwd3IaU7mqg_xSQDN5lFrI,False,False,3.083058605577787e+17,6.794880020077885e+18,http-kit/2.0,Grability/17420 CFNetwork/978.0.7 Darwin/18.5.0,094d7046-fdb9-4e56-bfae-f6863fc8bc11,Open,true,0.0,6.977049253562486e+18
freq,4,,,,1,480369,378343,83551.0,46187.0,335311,10809,2,37257,235130,3145.0,233012.0
first,2019-04-18 00:00:01.560000,,,,,,,,,,,,,,,
last,2019-04-26 23:59:58.788000,,,,,,,,,,,,,,,
mean,,114.62761,1.823409e+18,4.60827e+18,,,,,,,,,,,,
std,,79.540714,1.496644e+17,2.664215e+18,,,,,,,,,,,,
min,,1.0,1.494519e+18,40621410000000.0,,,,,,,,,,,,
25%,,36.0,1.891515e+18,2.302034e+18,,,,,,,,,,,,


In [8]:
# Per-column dtypes, non-null counts and total memory of the installs frame;
# memory_usage='deep' asks pandas to measure the real footprint.
installs.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481511 entries, 0 to 481510
Data columns (total 16 columns):
created               481511 non-null datetime64[ns]
application_id        481511 non-null int32
ref_type              481511 non-null int64
ref_hash              481511 non-null int64
click_hash            1142 non-null category
attributed            481511 non-null category
implicit              481511 non-null category
device_brand          276443 non-null category
device_model          454619 non-null category
session_user_agent    466672 non-null category
user_agent            330768 non-null category
event_uuid            103168 non-null object
kind                  103168 non-null category
wifi                  294829 non-null category
trans_id              8933 non-null object
device_language       453934 non-null category
dtypes: category(10), datetime64[ns](1), int32(1), int64(2), object(2)
memory usage: 61.3 MB


In [9]:
# Preview the first rows of the installs table.
installs.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,device_language
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,3.3013777759777e+18
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,,7.80553892759877e+18,adjust.com,,,,,,3.3013777759777e+18
2,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,,8.355495513718673e+18,adjust.com,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,,4.060929664968129e+18
3,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,,2.3557720913769155e+18,adjust.com,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,,3.3013777759777e+18
4,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,,6.156971151807135e+18,adjust.com,,,,,,3.3013777759777e+18


---
## Clicks

In [10]:
# Intended column -> dtype map for the clicks CSV.
#
# NOTE(review): this map is NOT passed to read_csv below, so the frame keeps
# the dtypes pandas infers. Wiring it in as written would raise, because
# action_id, carrier_id and brand are declared as plain integers but contain
# missing values (see the non-null counts printed by clicks.info()). Decide on
# float / nullable-integer dtypes for those columns before adding
# dtype=clicks_dtypes.
clicks_dtypes = {
    'advertiser_id': np.int8,
    'action_id': np.int32,
    'source_id': np.int8,
    'latitude': np.float64,
    'longitude': np.float64,
    # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed
    # in 1.24, where merely evaluating it raises AttributeError.
    'wifi_connection': bool,
    'carrier_id': np.int16,
    'trans_id': 'object',
    'os_minor': 'category',
    'agent_device': 'category',
    'os_major': 'category',
    'specs_brand': 'category',
    'brand': np.int8,
    'timeToClick': np.float64,
    #'touchX': np.float64,
    #'touchY': np.float64,
    'ref_type': np.int64,
    'ref_hash': np.int64
}

# Read the gzip-compressed CSV in a single pass (low_memory=False) and parse
# the 'created' column as datetimes.
clicks = pd.read_csv('../data/clicks.csv.gzip',
                     compression='gzip',
                     low_memory=False,
                     parse_dates=['created'])

# Coerce the touch coordinates to float64 with one vectorized cast instead of
# calling np.float64 on each element through a Python-level lambda.
clicks['touchX'] = clicks['touchX'].astype(np.float64)
clicks['touchY'] = clicks['touchY'].astype(np.float64)

In [11]:
# Per-column dtypes, non-null counts and total memory of the clicks frame;
# memory_usage='deep' asks pandas to measure the real footprint.
clicks.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64296 entries, 0 to 64295
Data columns (total 20 columns):
advertiser_id      64296 non-null int64
action_id          7 non-null float64
source_id          64296 non-null int64
created            64296 non-null datetime64[ns, UTC]
country_code       64296 non-null int64
latitude           64296 non-null float64
longitude          64296 non-null float64
wifi_connection    64296 non-null bool
carrier_id         63097 non-null float64
trans_id           64296 non-null object
os_minor           64261 non-null float64
agent_device       8920 non-null float64
os_major           64261 non-null float64
specs_brand        64296 non-null int64
brand              15035 non-null float64
timeToClick        38178 non-null float64
touchX             43678 non-null float64
touchY             43678 non-null float64
ref_type           64296 non-null int64
ref_hash           64296 non-null int64
dtypes: bool(1), datetime64[ns, UTC](1), float64(11), int64(

In [12]:
# Summary statistics for every column (include='all'), not only the numeric ones.
clicks.describe(include='all')

Unnamed: 0,advertiser_id,action_id,source_id,created,country_code,latitude,longitude,wifi_connection,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_type,ref_hash
count,64296.0,7.0,64296.0,64296,64296.0,64296.0,64296.0,64296,63097.0,64296,64261.0,8920.0,64261.0,64296.0,15035.0,38178.0,43678.0,43678.0,64296.0,64296.0
unique,,,,64275,,,,2,,64279,,,,,,,,,,
top,,,,2019-04-23 15:15:05.754000+00:00,,,,True,,1wmAqiuxSnawN406a6xfWZ_xUNnS6ns,,,,,,,,,,
freq,,,,2,,,,40254,,2,,,,,,,,,,
first,,,,2019-04-12 00:00:01.981000+00:00,,,,,,,,,,,,,,,,
last,,,,2019-04-26 23:59:22.065000+00:00,,,,,,,,,,,,,,,,
mean,1.560595,122478.285714,1.251166,,6.287817e+18,1.74019,0.867035,,10.431558,,3.57675e+18,5.58575e+18,5.030899e+18,1.635554e+18,1.400798,206.954231,inf,inf,1.868027e+18,4.561908e+18
std,0.518691,8510.626039,1.407715,,1024.008,0.045593,0.017515,,14.161776,,2.320198e+18,2.432666e+18,1.151353e+18,1.743098e+18,1.798718,921.497533,,,9.36644e+16,2.662462e+18
min,0.0,103178.0,0.0,,6.287817e+18,1.660487,0.810223,,0.0,,5.106671e+17,2.812032e+16,6.900651e+17,7.191384e+16,0.0,0.013,0.0,0.0,1.494519e+18,693609700000000.0
25%,1.0,125695.0,0.0,,6.287817e+18,1.714512,0.863469,,1.0,,1.517644e+18,3.861844e+18,3.90839e+18,7.191384e+16,0.0,2.03025,0.361,0.105,1.891515e+18,2.236736e+18


In [13]:
# Preview the first rows of the clicks table.
clicks.head()

Unnamed: 0,advertiser_id,action_id,source_id,created,country_code,latitude,longitude,wifi_connection,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_type,ref_hash
0,1,,2,2019-04-18 05:27:42.197000+00:00,6287817205707153877,1.714547,0.871535,False,3.0,9JMAfrb-b9cSEVCJb0P9JfihGthaS7E,1.517644e+18,,5.131616e+18,71913840936116953,0.0,2.317,0.968,0.503,1891515180541284343,1293710398598742392
1,1,,1,2019-04-18 05:27:03.164000+00:00,6287817205707153877,1.714512,0.871062,True,2.0,r3xtTRv2lInfiXG8JI3NQsNcBo8GyFQ,1.288578e+18,,3.90839e+18,3576558787748411622,1.0,7.653,0.712,1.689,1891515180541284343,1663930990551616564
2,1,,1,2019-04-18 05:42:07.926000+00:00,6287817205707153877,1.714547,0.871535,True,4.0,WOnHFqQtY48z_ygKZ-030U_g0TMGVMw,2.238736e+18,,3.581233e+18,3576558787748411622,,464.796,0.227,0.251,1891515180541284343,8488038938665586188
3,1,,1,2019-04-18 05:26:04.446000+00:00,6287817205707153877,1.708041,0.870772,True,1.0,wQMLLmYqiFhSuha9p9B13PMtcyBW_vM,2.41164e+18,,3.90839e+18,3576558787748411622,,225.311,0.696,6.587,1891515180541284343,6488361690105189959
4,1,,1,2019-04-18 05:23:37.764000+00:00,6287817205707153877,1.715514,0.870772,True,2.0,GeFoyBzMA7taylMxxjzlNPTU-n4FXFs,1.517644e+18,,5.131616e+18,3576558787748411622,0.0,84.736,0.059,0.142,1891515180541284343,1348993302102753419


---
## Events

In [14]:
# Column -> dtype map for the events CSV. The keys also decide which columns
# are read (see events_cols), so a commented-out entry is a column that is not
# loaded at all.
#
# Boolean columns use the builtin bool: the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24, where evaluating it raises AttributeError.
# On older NumPy np.bool *was* the builtin bool, so the loaded dtypes are
# unchanged.
events_dtypes = {
    'event_id': np.int64,
    'ref_type': np.int64,
    'ref_hash': np.int64,
    'application_id': np.int64,
    'attributed': bool,
    'device_os_version': 'category',
    'device_brand': 'category',
    'device_model': 'category',
    'device_city': 'category',
    'session_user_agent': 'category',
    'trans_id': 'category',
    'user_agent': 'category',
    'event_uuid': 'object',
    'carrier': 'category',
    'kind': 'category',
    'device_os': 'category',
    'wifi': bool,
    'connection_type': 'category',
    #'ip_address': np.int64,
    #'device_language': 'category'
}

# Every typed column plus the timestamp column, which is parsed separately.
events_cols = list(events_dtypes.keys()) + ['date']

# Read only the selected columns from the gzip-compressed CSV, parsing 'date'
# as datetimes.
events = pd.read_csv('../data/events.csv.gzip',
                     compression='gzip',
                     dtype=events_dtypes,
                     usecols=events_cols,
                     parse_dates=['date'])

In [15]:
# Per-column dtypes and total memory of the events frame; memory_usage='deep'
# asks pandas to measure the real footprint instead of estimating it.
events.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7744581 entries, 0 to 7744580
Data columns (total 19 columns):
date                  datetime64[ns]
event_id              int64
ref_type              int64
ref_hash              int64
application_id        int64
attributed            bool
device_os_version     category
device_brand          category
device_model          category
device_city           category
session_user_agent    category
trans_id              category
user_agent            category
event_uuid            object
carrier               category
kind                  category
device_os             category
wifi                  bool
connection_type       category
dtypes: bool(2), category(11), datetime64[ns](1), int64(4), object(1)
memory usage: 1.1 GB


In [16]:
# Summary statistics for every column (include='all'), not only the numeric ones.
events.describe(include='all')

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,device_os_version,device_brand,device_model,device_city,session_user_agent,trans_id,user_agent,event_uuid,carrier,kind,device_os,wifi,connection_type
count,7744581,7744581.0,7744581.0,7744581.0,7744581.0,7744581,2332975.0,2553424.0,5668092.0,1894935.0,7702301.0,37642.0,3341483.0,7714809,1925901.0,7714809.0,1870190.0,7744581,1809296
unique,7693028,,,,,2,96.0,506.0,4720.0,753.0,2297.0,11042.0,14645.0,7714809,259.0,513.0,2.0,2,4
top,2019-04-24 14:41:30.558000,,,,,False,7.391843585977942e+18,3.083058605577787e+17,6.794880020077885e+18,4.007928953771406e+18,3.819516403548394e+18,0.0,7.683618815288324e+18,dc0781ce-3b79-4538-90d6-9748aca3b223,8.758762753139908e+18,4.017674184041173e+18,7.531669329342818e+18,False,Cable/DSL
freq,4,,,,,7714809,522543.0,739979.0,545900.0,621018.0,5365876.0,7446.0,91393.0,1,726738.0,1281105.0,1722255.0,5478103,1291512
first,2019-04-18 00:00:00.027000,,,,,,,,,,,,,,,,,,
last,2019-04-26 23:59:59.881000,,,,,,,,,,,,,,,,,,
mean,,59.31369,1.823697e+18,4.581569e+18,141.3932,,,,,,,,,,,,,,
std,,87.38097,1.49413e+17,2.658818e+18,76.09497,,,,,,,,,,,,,,
min,,0.0,1.494519e+18,40621410000000.0,1.0,,,,,,,,,,,,,,
25%,,2.0,1.891515e+18,2.275713e+18,68.0,,,,,,,,,,,,,,


In [17]:
# Preview the first rows of the events table.
events.head()

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,device_os_version,device_brand,device_model,device_city,session_user_agent,trans_id,user_agent,event_uuid,carrier,kind,device_os,wifi,connection_type
0,2019-04-20 01:42:49.120,0,1891515180541284343,5857744372586891366,210,False,,,4.318294190479584e+18,,3.819516403548394e+18,,5.046185273150854e+18,5b506964-5f47-4b28-a8c2-8a92d6c23379,,5.882882097123621e+18,,False,
1,2019-04-20 01:42:49.340,1,1891515180541284343,7642521036780133571,210,False,,,,,3.819516403548394e+18,,,f1fb9d15-1a7b-4116-8d3b-c4c403e197e2,,4.017674184041173e+18,,False,
2,2019-04-20 01:42:49.365,1,1891515180541284343,2548841562898283198,210,False,,,,,3.819516403548394e+18,,,c85a0b15-a5d7-472e-8116-6bfa3db19687,,4.017674184041173e+18,,False,
3,2019-04-20 01:42:51.438,2,1891515180541284343,609402887625919085,210,False,,,,,3.819516403548394e+18,,,f4aa0a97-2de6-4f22-95c6-1b3150112cb9,,6.168308581888314e+18,,False,
4,2019-04-20 01:42:51.838,1,1891515180541284343,9114651763556439823,210,False,,,,,3.819516403548394e+18,,,08e2f7f7-875f-4aa0-b337-b9b87b0d83ea,,4.017674184041173e+18,,False,
