# YouTube Premium Japan Q1 2020

## Selection of Exposed Markets for MMT

### Approach

- Get demographics, DAVs, etc. for each prefecture.
- Select based on the target audience:
  1. Heavy YouTube Users (HYU) / Core and Emerging Users
  2. Ages 18-54, with 18-34 preferred
  3. Slight skew towards males
- Rank markets, applying more weight to HYU and age?

Bryan mentioned acquisition of HYUs.
Priority will therefore be on the raw number of HYUs in each prefecture.


#### Import libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from authenticator import Authenticator
from sheetmanager import SheetManager

#### Initialize necessary strings

In [3]:
# Path of the credentials file handed to Authenticator.
keys = 'credentials.json'
# Scope requested when obtaining credentials (read-only Drive access).
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Spreadsheet that every range below is read from.
spreadsheetId = '1QbkOSgaHt75LeQY2a9GeUjHajXIHWv3Z-ID7svUPG6w'
# Ranges passed to SheetManager.get_values, one per input table.
# No cell range is given for the DAV data — presumably the whole 'JP' sheet (TODO confirm).
dav_data_range = 'JP'
# User-segment split by region (Casual / Core / Emerging columns are used downstream).
cce_data_range = 'CCE Split - JP by region!A2:D50'
# Population figures per prefecture.
pop_data_range = 'Population by Prefecture!O5:Q52'
# Population per prefecture broken down by age band.
age_data_range = 'Population by Prefecture by Age Group!B5:Y53'

#### Authenticate and Initialize Manager to work with Google Sheets

In [4]:
# Obtain credentials for the requested scopes from the key file, then build the
# SheetManager that the loading cells use for every get_values call.
# NOTE(review): Authenticator and SheetManager are project-local helpers; how
# they obtain or cache tokens is not visible from this notebook.
authenticator = Authenticator(keys)
creds = authenticator.get_creds(SCOPES)
manager = SheetManager(creds)

#### Load data from Google Sheets as pandas DataFrames

In [5]:
# Load the DAV figures.
# NOTE(review): get_values is assumed to return a DataFrame when called without
# as_df (the CCE cell passes as_df=True explicitly) — confirm against SheetManager.
dav_df = manager.get_values(sheetId=spreadsheetId,
                            data_range=dav_data_range,
                            )

# Drop rows whose region is the literal string 'null', parse the YYYYMMDD date
# strings, and index by region. Using .loc + .assign builds a new frame instead
# of assigning a column on a filtered slice, which avoids pandas'
# SettingWithCopyWarning / chained-assignment pitfalls.
dav_df = (
    dav_df
    .loc[dav_df['region_name'] != 'null']
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d'))
    .set_index('region_name')
)

# Export the per-date totals as CSV. 'utf-8-sig' writes a BOM-prefixed UTF-8
# file (presumably so spreadsheet tools detect the encoding — TODO confirm).
r_df = dav_df.reset_index()[['date', 'region_name', 'total']]
r_df.to_csv('dav_data.csv', index=False, encoding='utf-8-sig')
r_df.head()

Unnamed: 0,date,region_name,total
0,2019-09-27,Aichi Prefecture,3304533
1,2019-09-27,Akita Prefecture,230200
2,2019-09-27,Aomori Prefecture,318274
3,2019-09-27,Chiba Prefecture,1979841
4,2019-09-27,Ehime Prefecture,381356


In [6]:
# User-segment (CCE) split per region: fetch the range, discard rows whose
# region is the literal string 'null', and key the frame by region name.
cce_df = (
    manager
    .get_values(sheetId=spreadsheetId, data_range=cce_data_range, as_df=True)
    .loc[lambda frame: frame['region_name'] != 'null']
    .set_index('region_name')
)

In [7]:
# Population figures, keyed by region name so they align with the other
# per-region tables when merged.
pop_df = manager.get_values(
    sheetId=spreadsheetId,
    data_range=pop_data_range,
).set_index('region_name')

In [8]:
# Age-band population per prefecture.
# The target audience is 18-54, but the sheet's bands start at 15, so the
# 15-19 band is the closest available lower bound.
# Mapping: sheet column header -> short label used in the rest of the notebook.
age_band_labels = {
    '15～19歳': '15-19',
    '20～24歳': '20-24',
    '25～29歳': '25-29',
    '30～34歳': '30-34',
    '35～39歳': '35-39',
    '40～44歳': '40-44',
    '45～49歳': '45-49',
    '50～54歳': '50-54',
}

age_df = (
    manager
    .get_values(sheetId=spreadsheetId, data_range=age_data_range)
    # the 'Total' row is an aggregate, not a prefecture
    .loc[lambda frame: frame['Prefecture in EN'] != 'Total']
    .set_index('Prefecture in EN')
    # keep only the bands of interest, then switch to the short labels
    .loc[:, list(age_band_labels)]
    .rename(columns=age_band_labels)
)

#### Pivot the DAV table so that it is indexed by region (averaging each metric over all dates)

In [9]:
# Collapse the per-date DAV rows into one row per region, averaging each metric
# over all dates present in the data.
# The aggregation is named by string ('mean') instead of passing np.mean:
# pandas >= 2.1 emits a FutureWarning for NumPy callables here, and the string
# form selects the same groupby mean.
table = dav_df.pivot_table(values=['unique_logged_in_dav', 'unique_visitor_dav', 'total'],
                           index='region_name',
                           aggfunc='mean')



#### Merge all dataframes together so that we have data by region in one dataframe

In [10]:
# Assemble one row per region: averaged DAVs + segment split + population +
# age bands. Outer joins on the index keep regions that are missing from any
# single source (their columns come out as NaN).
# Penetration metrics (internet population, YT penetration) are intentionally
# not computed here.
# NOTE(review): 'HYU DAV' uses only the Core proportion, while the Approach
# section mentions "Core and Emerging Users" next to HYU — confirm whether the
# Emerging proportion should be included.
df = (
    table
    .merge(cce_df, how='outer', left_index=True, right_index=True)
    .merge(pop_df, how='outer', left_index=True, right_index=True)
    .merge(age_df, how='outer', left_index=True, right_index=True)
    .rename(columns={
        'total': 'Average Total DAV',
        'unique_logged_in_dav': 'Average Unique Logged in DAV',
        'unique_visitor_dav': 'Average Unique Visitor DAV',
        'Casual': 'Casual Users Proportion',
        'Core': 'Core Users Proportion',
        'Emerging': 'Emerging Users Proportion',
    })
    .assign(**{
        # population in the preferred (younger) part of the target age range
        '15-34': lambda frame: (frame['15-19'] + frame['20-24']
                                + frame['25-29'] + frame['30-34']),
        # average daily viewers scaled by the Core-user share
        'HYU DAV': lambda frame: (frame['Average Total DAV']
                                  * frame['Core Users Proportion']),
    })
)


Unnamed: 0,Average Total DAV,Average Unique Logged in DAV,Average Unique Visitor DAV,Casual Users Proportion,Core Users Proportion,Emerging Users Proportion,General Populations,Internet Penetration (%),15-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,15-34,HYU DAV
Aichi Prefecture,3213150.0,1929695.0,1283456.0,0.451036,0.249935,0.29903,7525.0,82.5,374.0,420.0,414.0,451.0,484.0,567.0,610.0,507.0,1659.0,803078.0
Akita Prefecture,224450.8,134652.3,89798.5,0.413382,0.294916,0.291702,996.0,67.1,40.0,30.0,33.0,42.0,51.0,61.0,62.0,59.0,145.0,66194.15
Aomori Prefecture,308340.2,187870.5,120469.7,0.40196,0.307982,0.290058,1278.0,70.9,58.0,48.0,48.0,59.0,69.0,82.0,86.0,83.0,213.0,94963.2
Chiba Prefecture,1935724.0,1168856.0,766867.8,0.442371,0.263618,0.294011,6246.0,79.9,289.0,323.0,310.0,344.0,387.0,465.0,513.0,428.0,1266.0,510292.6
Ehime Prefecture,368960.5,218740.8,150219.7,0.420804,0.288631,0.290564,1364.0,73.9,63.0,52.0,54.0,65.0,74.0,91.0,93.0,83.0,234.0,106493.6
Fukui Prefecture,231908.8,132695.8,99213.0,0.428851,0.285833,0.285316,779.0,73.5,39.0,33.0,34.0,39.0,43.0,52.0,54.0,48.0,145.0,66287.27
Fukuoka Prefecture,1733438.0,1058270.0,675168.5,0.43084,0.262643,0.306517,5107.0,77.8,241.0,275.0,251.0,285.0,321.0,363.0,365.0,311.0,1052.0,455275.1
Fukushima Prefecture,484385.2,295443.5,188941.7,0.403094,0.309355,0.287551,1882.0,72.3,89.0,73.0,80.0,94.0,105.0,122.0,124.0,117.0,336.0,149847.1
Gifu Prefecture,584063.3,341482.0,242581.3,0.424776,0.279684,0.295539,2008.0,74.9,101.0,92.0,86.0,98.0,112.0,139.0,148.0,128.0,377.0,163353.4
Gunma Prefecture,578506.5,346561.7,231944.8,0.421691,0.288399,0.28991,1960.0,78.0,97.0,90.0,88.0,99.0,113.0,139.0,148.0,127.0,374.0,166840.6


#### Load in the market match results

In [21]:
# Market-match results exported to CSV. Judging by the displayed columns, each
# row pairs a region with one candidate control (BestControl) and carries its
# rank and correlation; 'davs_data$region_name' holds the prefecture name.
mm_df = pd.read_csv('MM_results.csv')

Unnamed: 0.1,Unnamed: 0,region_name,BestControl,RelativeDistance,Correlation,Length,MatchingStartDate,MatchingEndDate,rank,davs_data$region_name
0,1,1,35,0.454112,0.935524,6,2019-09-27,2019-10-02,1,Aichi Prefecture
1,2,1,4,0.705333,0.954815,6,2019-09-27,2019-10-02,2,Aichi Prefecture
2,3,1,12,0.767354,0.996592,6,2019-09-27,2019-10-02,3,Aichi Prefecture
3,4,1,7,0.820624,0.949414,6,2019-09-27,2019-10-02,4,Aichi Prefecture
4,5,1,13,0.852046,0.984824,6,2019-09-27,2019-10-02,5,Aichi Prefecture


#### Get the median correlations for each prefecture

In [22]:
# Median correlation across all candidate controls, one row per prefecture.
# 'median' is passed as a string because pandas >= 2.1 warns (FutureWarning)
# when given NumPy callables such as np.median; the result is the same.
mm_table = mm_df.pivot_table(values=['Correlation'],
                             index='davs_data$region_name',
                             aggfunc='median')

Unnamed: 0_level_0,Correlation
davs_data$region_name,Unnamed: 1_level_1
Aichi Prefecture,0.986955
Akita Prefecture,0.983178
Aomori Prefecture,0.974664
Chiba Prefecture,0.963482
Ehime Prefecture,0.968459


In [31]:
# Attach the per-prefecture median correlation to the combined table.
# Any columns already merged from mm_table are dropped first so the cell is
# idempotent: without this, re-running it would merge 'Correlation' a second
# time and produce suffixed duplicates (Correlation_x / Correlation_y).
df = (
    df
    .drop(columns=mm_table.columns, errors='ignore')
    .merge(mm_table, how='outer', left_index=True, right_index=True)
)


#### Get final ranking

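Exposed regions should be within … (TODO: this sentence was left unfinished — state the criterion that exposed regions must satisfy.)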

In [37]:
# Export of the combined table is currently disabled; uncomment the line below
# to write it to results.csv (BOM-prefixed UTF-8, matching the dav_data.csv export).
#df.to_csv('results.csv', encoding='utf-8-sig')