In [1]:
import pandas as pd
from pandas.io import gbq

from sklearn.preprocessing import StandardScaler

In [2]:
# Render every float in DataFrame output with exactly two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [42]:
# Standard-SQL query: sums user_count and streams per (user_dma_number, user_dma_name)
# from the india_all_rep_streams table.
get_data_sql = """
SELECT
  SUM(user_count) AS user_count,
  SUM(streams) AS streams,
  user_dma_number,
  user_dma_name 
FROM
  `umg-comm-tech-dev.diaspora.india_all_rep_streams`
GROUP BY
  user_dma_number,
  user_dma_name
"""

In [43]:
# Run the per-DMA aggregation query in BigQuery and load the result into a DataFrame.
data = gbq.read_gbq(
    get_data_sql,
    project_id='umg-comm-tech-dev',
    dialect='standard',
)

In [44]:
# Preview the first rows of the query result.
data.head()

Unnamed: 0,user_count,streams,user_dma_number,user_dma_name
0,2636,3831,639,"Jackson, TN"
1,44064,73041,658,Green Bay-Appleton
2,28082,46606,588,South Bend-Elkhart
3,13951,19671,692,Beaumont-Port Arthur
4,72875,107089,592,Gainesville


In [45]:
# Key columns identifying a DMA; reused as the groupby key for both frames and as
# the join key when the two frames are merged.
by_columns = ['user_dma_number','user_dma_name']

In [46]:
# Group the query result by the DMA key columns.
data_by_geo = data.groupby(by_columns)

In [47]:
# Sum users and streams per DMA in one aggregation, then turn the group keys
# back into ordinary columns.
df_by_geo = data_by_geo[['user_count', 'streams']].sum().reset_index()
df_by_geo.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count,streams
0,,,38000206,60013139
1,500.0,Portland-Auburn,39786,71372
2,501.0,New York,1578756,2683043
3,502.0,Binghamton,15320,23865
4,503.0,Macon,9512,14355


In [20]:
# Load the whole benchmark table from BigQuery; it is aggregated per DMA later
# in pandas rather than in SQL.
get_bench_sql = """
SELECT * FROM `umg-comm-tech-dev.diaspora.benchmark`"""

benchmark = gbq.read_gbq(
    get_bench_sql,
    project_id='umg-comm-tech-dev',
    dialect='standard',
)

In [21]:
# Preview the first rows of the raw benchmark table.
benchmark.head()

Unnamed: 0,user_count,streams,user_dma_number,user_dma_name,user_country_code,user_country_name,stream_country_code,stream_country_name,user_region_code,user_postal_code,user_gender,user_age_group
0,1,23,,,SV,El Salvador,SV,El Salvador,SV-LI,99,female,25-34
1,7631,665437,,,CR,Costa Rica,CR,Costa Rica,CR-SJ,12,male,35-44
2,3,147,,,VN,Vietnam,VN,Vietnam,VN-43,79,male,18-24
3,3,25,,,CR,Costa Rica,CR,Costa Rica,CR-H,61,male,35-44
4,4,109,,,BG,Bulgaria,BG,Bulgaria,BG-04,11,male,25-34


In [48]:
# Group the benchmark rows by the same DMA key columns used for the Indian data.
benchmark_by_geo = benchmark.groupby(by_columns)

In [49]:
# Sum benchmark users and streams per DMA in one aggregation, then turn the
# group keys back into ordinary columns.
df_benchmark = benchmark_by_geo[['user_count', 'streams']].sum().reset_index()
df_benchmark.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count,streams
0,,,329751883,19932797911
1,500.0,Portland-Auburn,181295,17482670
2,501.0,New York,6288746,663939045
3,502.0,Binghamton,53955,5074013
4,503.0,Macon,71332,4812658


In [50]:
# Summary statistics for the per-DMA benchmark totals.
df_benchmark.describe()

Unnamed: 0,user_count,streams
count,212.0,212.0
mean,2098605.28,139023577.15
std,22648798.08,1370217029.41
min,803.0,67564.0
25%,55539.0,4748639.25
50%,168413.5,12006749.5
75%,447941.5,29008791.0
max,329751883.0,19932797911.0


In [51]:
# Grand totals over all benchmark rows: users first, then streams, one per line.
print(
    df_benchmark['user_count'].sum(),
    df_benchmark['streams'].sum(),
    sep='\n',
)

444904319
29472998355


In [52]:
# Right join on the DMA keys: every benchmark DMA is kept, and the overlapping
# user_count/streams columns get the _indian / _global suffixes.
all_data = df_by_geo.merge(
    df_benchmark,
    how='right',
    on=by_columns,
    suffixes=('_indian', '_global'),
)
all_data.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global
0,,,38000206,60013139,329751883,19932797911
1,500.0,Portland-Auburn,39786,71372,181295,17482670
2,501.0,New York,1578756,2683043,6288746,663939045
3,502.0,Binghamton,15320,23865,53955,5074013
4,503.0,Macon,9512,14355,71332,4812658


In [33]:
# NOTE(review): `all_data` as produced by the DMA merge has no `stream_country_name`
# column, so this filter looks like a leftover from an earlier country-level version
# of the notebook (its execution count, 33, predates the merge cell). On a fresh
# Restart & Run All it would presumably fail; confirm whether it should be removed.
# `all_data_minus_india` is not used by any later cell visible here.
all_data_minus_india = all_data[all_data.stream_country_name!='India']

In [53]:
# Drop the first row by position. In the merge preview that row is the aggregate
# with a blank DMA number and name.
# NOTE(review): this is positional, so it assumes the blank-DMA row is always
# first after the merge - confirm, or filter on the key columns instead.
all_data_minus_null = all_data.iloc[1:]
all_data_minus_null.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global
1,500,Portland-Auburn,39786,71372,181295,17482670
2,501,New York,1578756,2683043,6288746,663939045
3,502,Binghamton,15320,23865,53955,5074013
4,503,Macon,9512,14355,71332,4812658
5,504,Philadelphia,583729,922305,2753208,221478903


In [54]:
# Numeric columns that are passed through the scaler.
to_standardize = ['user_count_indian','streams_indian','user_count_global','streams_global']

In [57]:
# Standardise the count columns (StandardScaler: zero mean, unit variance per
# column) so the Indian and global figures share a common scale.

scaler = StandardScaler()
# Work on a copy so the unscaled frame stays available for inspection.
data_st = all_data_minus_null.copy()
# Cast to float before scaling: the saved output of this cell carries residue of a
# scikit-learn conversion warning, presumably from handing it integer counts.
# The former second assignment (re-wrapping the same columns in pd.DataFrame and
# assigning them back) was a no-op and has been dropped.
data_st[to_standardize] = scaler.fit_transform(data_st[to_standardize].astype(float))

data_st.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global
1,500,Portland-Auburn,-0.26,-0.24,-0.27,-0.25
2,501,New York,6.16,6.32,4.33,5.68
3,502,Binghamton,-0.36,-0.36,-0.37,-0.37
4,503,Macon,-0.39,-0.38,-0.36,-0.37
5,504,Philadelphia,2.01,1.9,1.67,1.62


In [58]:
# Per-DMA gap between the Indian and global columns of data_st, for users and for
# streams. A positive value means the Indian figure exceeds the global one.
data_st['users_local_minus_global'] = data_st['user_count_indian'].sub(data_st['user_count_global'])
data_st['streams_local_minus_global'] = data_st['streams_indian'].sub(data_st['streams_global'])

In [39]:
# NOTE(review): the DMA-level `all_data` has no `stream_country_name` column; the
# saved output below this cell shows country-level columns, so this appears to be
# a leftover from an earlier version of the notebook (execution count 39 predates
# the merge cell). It would presumably fail on a fresh Restart & Run All - confirm
# and remove if obsolete.
all_data[all_data.stream_country_name=='India']

Unnamed: 0,stream_country_code,stream_country_name,user_count_indian,streams_indian,user_count_global,streams_global
98,IN,India,2658298.0,3714040.0,2592541,113525937


In [59]:
# Preview the standardised frame including the two difference columns.
data_st.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global,users_local_minus_global,streams_local_minus_global
1,500,Portland-Auburn,-0.26,-0.24,-0.27,-0.25,0.01,0.02
2,501,New York,6.16,6.32,4.33,5.68,1.83,0.64
3,502,Binghamton,-0.36,-0.36,-0.37,-0.37,0.01,0.01
4,503,Macon,-0.39,-0.38,-0.36,-0.37,-0.03,-0.01
5,504,Philadelphia,2.01,1.9,1.67,1.62,0.34,0.28


In [60]:
# Write the standardised per-DMA frame to BigQuery, replacing the destination
# table if it already exists.
gbq.to_gbq(
    data_st,
    destination_table='diaspora.all_indian_minus_bench_std_dma',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:11, 11.55s/it]


In [70]:
# Load the full india_artists_genres table from BigQuery.
artists = gbq.read_gbq(
    "SELECT * FROM `umg-comm-tech-dev.diaspora.india_artists_genres`",
    project_id='umg-comm-tech-dev',
    dialect='standard',
)

In [71]:
# Summary of every column; include='all' covers the non-numeric columns too.
artists.describe(include='all')

Unnamed: 0,artist_name,genres,followers,popularity,uri
count,2131,2131,2131.0,2131.0,2131
unique,2126,245,,,2131
top,Sceptre,indian folk,,,spotify:artist:2JV7rasOwDzJsILh5eQWzA
freq,2,169,,,1
mean,,,2419.27,13.61,
std,,,17042.0,14.89,
min,,,0.0,0.0,
25%,,,18.0,0.0,
50%,,,124.0,8.0,
75%,,,986.5,25.0,


In [73]:
# NOTE(review): the `data` frame loaded from get_data_sql only contains user_count,
# streams, user_dma_number and user_dma_name - there is no `source_uri` column.
# This cell presumably relied on `data` being re-loaded from a playlist-level query
# in a cell that is no longer in the notebook (execution count 72 is missing).
# Restore that load, or point this at the right frame, before re-running.
by_playlist_uri = data.groupby(by='source_uri')

In [79]:
# Total streams per source_uri group, largest first.
top_playlists = (
    by_playlist_uri['streams']
    .sum()
    .reset_index()
    .sort_values('streams', ascending=False)
)

In [80]:
# Write the ranked table to BigQuery, replacing the destination table if it
# already exists.
gbq.to_gbq(
    top_playlists,
    destination_table='diaspora.playlists_top13',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:04,  4.57s/it]
