In [1]:
import pandas as pd
from pandas.io import gbq

from sklearn.preprocessing import StandardScaler

In [2]:
# Render floats with two decimal places in every DataFrame display.
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Query text: every column of the india_all_rep_streams table.
# The leading newline is kept so the string is identical to the triple-quoted form.
get_data_sql = (
    "\n"
    "SELECT * FROM `umg-comm-tech-dev.diaspora.india_all_rep_streams`"
)

In [None]:
# Run the query against BigQuery (standard SQL dialect) and load the result
# into a DataFrame.
data = gbq.read_gbq(
    get_data_sql,
    dialect='standard',
    project_id='umg-comm-tech-dev',
)

In [None]:
# Preview the first rows of the frame loaded from BigQuery.
data.head()

In [52]:
# Geography keys used for both group-bys and for the merge further down.
# Every recorded output in this notebook is keyed by DMA
# (user_dma_number / user_dma_name) and the standardised result is written to
# a table whose name ends in `_dma`, so the keys are the DMA columns. With the
# stream-country columns a fresh run would replace that table with
# country-level rows.
# NOTE(review): switch back to ['stream_country_code', 'stream_country_name']
# only together with a different destination table name.
by_columns = ['user_dma_number', 'user_dma_name']

In [53]:
# Group the streams rows by the geography key columns.
data_by_geo = data.groupby(by_columns)

In [54]:
# Total users and streams per geography; the group keys are turned back into
# ordinary columns so the frame can be merged on them later.
df_by_geo = (
    data_by_geo[['user_count', 'streams']]
    .sum()
    .reset_index()
)
df_by_geo.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count,streams
0,,,1125836,2193762
1,500.0,Portland-Auburn,50,205
2,501.0,New York,25908,65419
3,502.0,Binghamton,115,321
4,503.0,Macon,120,218


In [13]:
# Query text: every column of the benchmark table (leading newline preserved).
get_bench_sql = (
    "\n"
    "SELECT * FROM `umg-comm-tech-dev.diaspora.benchmark`"
)

# Load the benchmark rows from BigQuery using the standard SQL dialect.
benchmark = gbq.read_gbq(
    get_bench_sql,
    dialect='standard',
    project_id='umg-comm-tech-dev',
)

In [14]:
# Preview the first rows of the benchmark frame loaded from BigQuery.
benchmark.head()

Unnamed: 0,user_count,streams,user_dma_number,user_dma_name,user_country_code,user_country_name,stream_country_code,stream_country_name,user_region_code,user_postal_code,user_gender,user_age_group
0,1,23,,,SV,El Salvador,SV,El Salvador,SV-LI,99,female,25-34
1,7631,665437,,,CR,Costa Rica,CR,Costa Rica,CR-SJ,12,male,35-44
2,3,147,,,VN,Vietnam,VN,Vietnam,VN-43,79,male,18-24
3,3,25,,,CR,Costa Rica,CR,Costa Rica,CR-H,61,male,35-44
4,4,109,,,BG,Bulgaria,BG,Bulgaria,BG-04,11,male,25-34


In [55]:
# Group the benchmark rows by the same geography keys as the streams data.
benchmark_by_geo = benchmark.groupby(by_columns)

In [56]:
# Total benchmark users and streams per geography, with the group keys
# restored as ordinary columns for the merge.
df_benchmark = (
    benchmark_by_geo[['user_count', 'streams']]
    .sum()
    .reset_index()
)
df_benchmark.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count,streams
0,,,329751883,19932797911
1,500.0,Portland-Auburn,181295,17482670
2,501.0,New York,6288746,663939045
3,502.0,Binghamton,53955,5074013
4,503.0,Macon,71332,4812658


In [57]:
# Summary statistics of the per-geography benchmark totals.
df_benchmark.describe()

Unnamed: 0,user_count,streams
count,212.0,212.0
mean,2098605.28,139023577.15
std,22648798.08,1370217029.41
min,803.0,67564.0
25%,55539.0,4748639.25
50%,168413.5,12006749.5
75%,447941.5,29008791.0
max,329751883.0,19932797911.0


In [58]:
# Grand totals across all benchmark geographies: users first, then streams.
print(df_benchmark['user_count'].sum())
print(df_benchmark['streams'].sum())

444904319
29472998355


In [59]:
# Right join: every benchmark geography is kept; the `_indian` columns are NaN
# for geographies that have no matching row in df_by_geo.
all_data = df_by_geo.merge(
    df_benchmark,
    how='right',
    on=by_columns,
    suffixes=['_indian', '_global'],
)
all_data.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global
0,,,1125836.0,2193762.0,329751883,19932797911
1,500.0,Portland-Auburn,50.0,205.0,181295,17482670
2,501.0,New York,25908.0,65419.0,6288746,663939045
3,502.0,Binghamton,115.0,321.0,53955,5074013
4,503.0,Macon,120.0,218.0,71332,4812658


In [65]:
# Drop the rows that carry no geography at all, i.e. every key column is
# NaN/None or a blank string. Filtering on the key values instead of slicing
# off the first row keeps the step correct when no such row exists or when it
# is not at position 0 (slicing would then discard a real geography).
geo_keys = all_data[by_columns]
key_is_missing = geo_keys.isna() | geo_keys.astype(str).apply(
    lambda col: col.str.strip().eq('')
)
all_data_minus_null = all_data.loc[~key_is_missing.all(axis=1)]
all_data_minus_null.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global
1,500,Portland-Auburn,50.0,205.0,181295,17482670
2,501,New York,25908.0,65419.0,6288746,663939045
3,502,Binghamton,115.0,321.0,53955,5074013
4,503,Macon,120.0,218.0,71332,4812658
5,504,Philadelphia,5461.0,12026.0,2753208,221478903


In [63]:
# Metric columns to be z-scored: Indian and global variants of each measure.
to_standardize = [
    'user_count_indian',
    'streams_indian',
    'user_count_global',
    'streams_global',
]

In [66]:
# Standardising the variables
#
# Each metric column is scaled independently by StandardScaler so the Indian
# and global figures become comparable despite very different magnitudes.
# Work on a copy so all_data_minus_null keeps the raw counts.
scaler = StandardScaler()
data_st = all_data_minus_null.copy()

# Cast to float up front rather than relying on the scaler's implicit
# conversion of the integer count columns (the recorded run emitted warnings
# from fit/partial_fit; presumably dtype-conversion warnings, to be confirmed).
metrics = data_st[to_standardize].astype('float64')

# fit_transform returns a plain array; assigning it to the column selection
# writes the scaled values back in place, so no further re-wrapping is needed.
data_st[to_standardize] = scaler.fit_transform(metrics)

data_st.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global
1,500,Portland-Auburn,-0.22,-0.18,-0.27,-0.25
2,501,New York,6.17,5.39,4.33,5.68
3,502,Binghamton,-0.2,-0.17,-0.37,-0.37
4,503,Macon,-0.2,-0.18,-0.36,-0.37
5,504,Philadelphia,1.12,0.83,1.67,1.62


In [67]:
# Gap between the standardised Indian and global values, per geography:
# positive means the Indian metric sits higher than the global one.
data_st['users_local_minus_global'] = data_st['user_count_indian'].sub(
    data_st['user_count_global']
)
data_st['streams_local_minus_global'] = data_st['streams_indian'].sub(
    data_st['streams_global']
)

In [68]:
# Preview the standardised frame including the two difference columns.
data_st.head()

Unnamed: 0,user_dma_number,user_dma_name,user_count_indian,streams_indian,user_count_global,streams_global,users_local_minus_global,streams_local_minus_global
1,500,Portland-Auburn,-0.22,-0.18,-0.27,-0.25,0.05,0.08
2,501,New York,6.17,5.39,4.33,5.68,1.84,-0.29
3,502,Binghamton,-0.2,-0.17,-0.37,-0.37,0.17,0.2
4,503,Macon,-0.2,-0.18,-0.36,-0.37,0.16,0.19
5,504,Philadelphia,1.12,0.83,1.67,1.62,-0.55,-0.79


In [69]:
# Upload the standardised frame to BigQuery; an existing table with this name
# is replaced.
gbq.to_gbq(
    data_st,
    destination_table='diaspora.indian_minus_bench_std_dma',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:06,  6.82s/it]


In [70]:
# Load the artists/genres table from BigQuery (standard SQL dialect).
artists = gbq.read_gbq(
    "SELECT * FROM `umg-comm-tech-dev.diaspora.india_artists_genres`",
    dialect='standard',
    project_id='umg-comm-tech-dev',
)

In [71]:
# Summary of every column, numeric and non-numeric alike (include='all').
artists.describe(include='all')

Unnamed: 0,artist_name,genres,followers,popularity,uri
count,2131,2131,2131.0,2131.0,2131
unique,2126,245,,,2131
top,Sceptre,indian folk,,,spotify:artist:2JV7rasOwDzJsILh5eQWzA
freq,2,169,,,1
mean,,,2419.27,13.61,
std,,,17042.0,14.89,
min,,,0.0,0.0,
25%,,,18.0,0.0,
50%,,,124.0,8.0,
75%,,,986.5,25.0,


In [73]:
# Group the streams rows by their source_uri value.
by_playlist_uri = data.groupby('source_uri')

In [79]:
# Total streams per source_uri, largest first.
top_playlists = (
    by_playlist_uri['streams']
    .sum()
    .reset_index()
    .sort_values(by='streams', ascending=False)
)

In [80]:
# Upload the ranked totals to BigQuery, replacing any existing table.
# NOTE(review): the table is named `playlists_top13`, but every row of
# top_playlists is written, not just the first 13. Confirm this is intended.
gbq.to_gbq(
    top_playlists,
    destination_table='diaspora.playlists_top13',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:04,  4.57s/it]
