In [35]:
import pandas as pd
from pandas.io import gbq

from sklearn.cluster import DBSCAN
from sklearn import metrics

In [5]:
user_data = gbq.read_gbq('SELECT * FROM `umg-comm-tech-dev.adhoc.disapora_user_stats2`', 
                         project_id='umg-comm-tech-dev', dialect='standard')

In [6]:
user_data.describe(include='all')

Unnamed: 0,f0_,user_dma_number,user_dma_name,user_region_code,user_postal_code,user_gender,user_age_group,stream_source,device_type,os_name,partner_product,partner_access_type,partner_user_type,revenue_model,consumer_group,consumer_group_detail,engagement_style
count,532883.0,532883.0,532883,532883.0,532883.0,532883,532883,532883,532883,532883,532883.0,532883,532883,532883,532883,532883,532883
unique,,425.0,211,1.0,925.0,3,9,9,8,7,14.0,3,5,2,3,8,2
top,,803.0,Los Angeles,,,male,18-24,others_playlist,cell phone,iOS,,premium,paid,Premium,Paid,Standard,Lean Forward
freq,,49758.0,49758,532883.0,86950.0,282362,206854,160897,304675,235866,275613.0,416124,397865,416230,402701,164743,268671
mean,18.959303,,,,,,,,,,,,,,,,
std,154.688909,,,,,,,,,,,,,,,,
min,1.0,,,,,,,,,,,,,,,,
25%,1.0,,,,,,,,,,,,,,,,
50%,2.0,,,,,,,,,,,,,,,,
75%,7.0,,,,,,,,,,,,,,,,


In [7]:
user_data = user_data.rename(columns={'f0_':'user_count'})

In [14]:
user_data.consumer_group_detail.unique()

array(['Family', 'Standard', 'Discounted', 'Ad-Funded', 'Introductory',
       'Trial - 30 Days - Credit Card', 'Trial - 7 Days - No Credit Card',
       'Bundle'], dtype=object)

In [8]:
user_data.columns

Index(['user_count', 'user_dma_number', 'user_dma_name', 'user_region_code',
       'user_postal_code', 'user_gender', 'user_age_group', 'stream_source',
       'device_type', 'os_name', 'partner_product', 'partner_access_type',
       'partner_user_type', 'revenue_model', 'consumer_group',
       'consumer_group_detail', 'engagement_style'],
      dtype='object')

In [28]:
cols = ['user_count', 'user_dma_number', 'user_dma_name',
       'user_postal_code', 'user_gender', 'user_age_group', 'device_type', 'os_name']

In [29]:
user_data_agg = user_data[cols]

In [30]:
user_data_gb = user_data_agg.groupby(by = ['user_dma_number', 'user_dma_name',
       'user_postal_code', 'user_gender', 'user_age_group', 'device_type', 'os_name'])

In [31]:
user_data_final = user_data_gb.user_count.sum().reset_index().sort_values(by='user_count', ascending=False)

In [49]:
user_data_final.to_csv('user_data_summary.csv', index=False)

In [39]:
user_data_final.head()

Unnamed: 0,user_dma_number,user_dma_name,user_postal_code,user_gender,user_age_group,device_type,os_name,user_count
807,501,New York,,female,25-34,cell phone,iOS,66299
131279,803,Los Angeles,,female,25-34,cell phone,iOS,60423
141307,807,San Francisco-Oak-San Jose,,female,25-34,cell phone,iOS,56535
887,501,New York,,male,25-34,cell phone,iOS,54783
141393,807,San Francisco-Oak-San Jose,,male,25-34,cell phone,iOS,54372


In [61]:
user_data_post = user_data_final.groupby(by=['user_postal_code'])

In [62]:
by_post = user_data_post.user_count.sum().reset_index().sort_values(by='user_count', ascending=False)

In [63]:
by_post.to_csv('by_postcode_summary.csv')

In [46]:
all_users = by_post.user_count.sum()

In [48]:
by_post['user_perc'] = by_post.user_count/all_users
by_post.head()

Unnamed: 0,user_postal_code,user_gender,user_age_group,device_type,os_name,user_count,user_perc
7571,123,female,25-34,cell phone,iOS,65844,0.011432
7582,123,female,35-44,cell phone,iOS,51186,0.008887
7559,123,female,18-24,cell phone,iOS,49742,0.008636
5451,100,female,25-34,cell phone,iOS,41712,0.007242
7657,123,male,25-34,cell phone,iOS,37448,0.006502


In [50]:
by_post.shape

(50995, 7)

In [52]:
by_dma_post = user_data_agg.groupby(by=['user_dma_number', 'user_dma_name','user_postal_code']).user_count.sum().reset_index().sort_values(by='user_count', ascending=False)

In [55]:
by_dma_post['user_%'] = by_dma_post.user_count/all_users*100
by_dma_post

Unnamed: 0,user_dma_number,user_dma_name,user_postal_code,user_count,user_%
23379,803,Los Angeles,,883501,15.339004
24143,807,San Francisco-Oak-San Jose,,745386,12.941104
162,501,New York,,683728,11.870622
27965,Unknown,,,370687,6.435725
24029,803,Los Angeles,917,174799,3.034793
24780,807,San Francisco-Oak-San Jose,941,150623,2.615058
264,501,New York,112,149242,2.591082
25028,819,Seattle-Tacoma,,144293,2.505159
252,501,New York,100,142969,2.482173
1787,506,Boston (Manchester),,131195,2.277757


In [56]:
by_dma_post.to_csv('dma_postcode_summary.csv')

In [57]:
by_age_gender = user_data_agg.groupby(by=['user_gender', 'user_age_group']).user_count.sum().reset_index().sort_values(by='user_count', ascending=False)

In [58]:
by_age_gender['user_%'] = by_age_gender.user_count/all_users*100
by_age_gender.to_csv('age_gender_summary.csv')

In [59]:
by_device_os = user_data_agg.groupby(by=['device_type', 'os_name']).user_count.sum().reset_index().sort_values(by='user_count', ascending=False)

In [60]:
by_device_os['user_%'] = by_device_os.user_count/all_users*100
by_device_os.to_csv('device_os_summary.csv')