In [None]:
import pandas as pd
import random
import numpy as np
from operator import itemgetter
from collections import Counter
import datetime
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy import stats

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import experiment_prep as exp

In [None]:
def clean_for_source(row):
    """Return the album the track was discovered through, if any.

    Reads ``row['first_stream_source']``. When it equals ``'album'`` the value
    of ``row['track_album']`` is returned; for ``'others_playlist'`` and any
    other source the result is ``None``.
    """
    discovered_via_album = row['first_stream_source'] == 'album'
    return row['track_album'] if discovered_via_album else None
    
def clean_for_source_pl(row):
    """Return the playlist the track was discovered through, if any.

    Reads ``row['first_stream_source']``. When it equals ``'others_playlist'``
    the value of ``row['track_playlist']`` is returned; for ``'album'`` and any
    other source the result is ``None``.
    """
    discovered_via_playlist = row['first_stream_source'] == 'others_playlist'
    return row['track_playlist'] if discovered_via_playlist else None
    
def set_cohort_group(df,mode='week'):
    '''
    Identify cohort group based on date of first streaming.

    Parameters
    ----------
    df : pandas.DataFrame
        Stream-level rows; must contain ``customer_id`` and the column named
        by ``mode``.
    mode : {'week', 'day'}
        Column used as the time unit for the first / last listen.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh RangeIndex, a leading ``id`` column (copy of
        ``customer_id``) and the added columns ``first_listen``,
        ``last_listen`` and ``cohort_group`` (``first_listen`` cast to str).
        The input frame is left unmodified.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'week'`` nor ``'day'``.
    '''
    if mode not in ('week', 'day'):
        raise ValueError("mode must be 'week' or 'day', got %r" % (mode,))

    # work on a copy so the caller's frame does not silently gain an 'id' column
    out = df.copy()
    if 'id' in out.columns:
        out = out.drop('id', axis=1)
    out.insert(0, 'id', out['customer_id'])

    # customer first & last period, broadcast back onto every row of the customer
    per_customer = df.groupby('customer_id')[mode]
    first_listen = per_customer.transform('min')
    last_listen = per_customer.transform('max')

    out['first_listen'] = first_listen
    out['last_listen'] = last_listen
    out['cohort_group'] = out['first_listen'].astype('str')
    return out.reset_index(drop=True)

---

#### checking evaluation period (daily/weekly)

In [None]:
def get_stream_days(df):
    """Add ``stream_days``: the number of distinct days each customer streamed on.

    The count is computed per ``customer_id`` over the ``day`` column and
    mapped back onto every row. The frame is modified in place and returned.
    """
    distinct_days = df.groupby('customer_id')['day'].nunique()
    lookup = distinct_days.to_dict()
    df['stream_days'] = df['customer_id'].map(lookup)
    return df
# NOTE(review): within this notebook `df` is first assigned by a later
# pd.read_pickle cell; on a fresh kernel that cell has to run before this one.
df = get_stream_days(df)
df.columns

In [None]:
# number of customers with at least 70 distinct streaming days;
# filter on the full frame so the boolean mask and the rows share one index
df.loc[df.stream_days>=70, 'customer_id'].nunique()

In [None]:
# number of customers with at least 70 distinct streaming days;
# filter on the full frame so the boolean mask and the rows share one index
df.loc[df.stream_days>=70, 'customer_id'].nunique()

In [None]:
# 0-69 stream days -> 'weekly', 70+ -> 'daily'; the upper bin is open-ended so
# values above 100 are not silently turned into NaN
df['stream_days_bin'] = pd.cut(df.stream_days,[-1,69,np.inf],labels=['weekly', 'daily'])

In [None]:
df.drop_duplicates(subset='customer_id')['stream_days_bin'].value_counts().plot.bar(color='darkblue', title='customers streaming frequency')

In [None]:
plt.figure(figsize=(10,8))
sns.distplot(df.drop_duplicates(subset='customer_id').stream_days)
plt.title('distribution of customers stream days')

In [None]:
gc.collect()

In [None]:
TRACK_ID = 'f72fa60c8d9848a393d8ac4bbaa866ef'

In [None]:
df = pd.read_pickle('/project/samples/new/sample_mixed_105k_cleaned2.pickle')

### within discovery channels

#### preparing the data

getting non-listeners

In [None]:
# create separate df for track streams
df_track = df[df.track_id==TRACK_ID]

In [None]:
# customers who streamed the track at least once (taken from df_track instead of
# re-filtering df); everyone else is a non-listener
listeners = df_track.customer_id.unique()
df_non = df[~df.customer_id.isin(listeners)]

In [None]:
df_non.customer_id.nunique()

In [None]:
gc.collect()

In [None]:
df_non = pd.read_pickle('data/exp_non_listeners.pickle')

In [None]:
df_non.customer_id.nunique()

In [None]:
df_non.to_pickle('data/exp_non_listeners.pickle')

In [None]:
# get customers with full week prior data
def get_stream_weeks(df):
    """Add ``stream_weeks``: the number of distinct weeks each customer streamed in.

    The frame is modified in place and also returned.
    """
    df['stream_weeks'] = df['customer_id'].map(
        df.groupby('customer_id')['week'].nunique()
    )
    return df

def get_stream_unique_weeks(df):
    """Add ``stream_weeks_list``: the distinct weeks each customer streamed in.

    Every row of a customer receives that customer's collection of unique
    ``week`` values. The frame is modified in place and also returned.
    """
    df['stream_weeks_list'] = df['customer_id'].map(
        df.groupby('customer_id')['week'].unique()
    )
    return df

df_non = get_stream_weeks(df_non)
df_non[df_non.stream_weeks==10].customer_id.nunique()

In [None]:
df_non = get_stream_unique_weeks(df_non)

In [None]:
def flag_eligibility(r, pre_weeks=(12,13,14,15), post_weeks=(17,18,19,20)):
    """Flag whether a customer has streaming data both before and after the event.

    Parameters
    ----------
    r : iterable
        Weeks in which the customer streamed.
    pre_weeks : iterable, optional
        Weeks counted as the pre-event window (default: the four weeks before
        the week-16 event).
    post_weeks : iterable, optional
        Weeks counted as the post-event window (default: the four weeks after
        the week-16 event).

    Returns
    -------
    int
        1 if at least one week of ``r`` falls in each window, otherwise 0.
    """
    # materialise once so one-shot iterables can be scanned for both windows
    weeks = list(r)
    pre_weeks = tuple(pre_weeks)
    post_weeks = tuple(post_weeks)
    has_pre = any(w in pre_weeks for w in weeks)
    has_post = any(w in post_weeks for w in weeks)
    return int(has_pre and has_post)

In [None]:
# at least have 3 weeks listening history covered including week 16
nonlisteners_in = df_non[(df_non.week==16) & (df_non.stream_weeks>2)].customer_id.unique()
df_non = df_non[df_non.customer_id.isin(nonlisteners_in)]
df_non.customer_id.nunique()

In [None]:
# checking control groups size
df_non[(df_non.album_name.isin(album.track_album))].customer_id.nunique(),df_non[(df_non.playlist_id.isin(playlist.track_playlist))].customer_id.nunique()

In [None]:
# filter to only use customers with data across pre-post event
# (the flag is computed here when the loaded frame does not already carry it,
#  so the filter below cannot fail on a missing column)
if 'include_as_control' not in df_non.columns:
    df_non = df_non.assign(include_as_control=df_non.stream_weeks_list.apply(flag_eligibility))
df_non = df_non[df_non.include_as_control==1]

# rechecking control groups size
df_non[(df_non.album_name.isin(album.track_album))].customer_id.nunique(),df_non[(df_non.playlist_id.isin(playlist.track_playlist))].customer_id.nunique()

In [None]:
df_non.to_pickle('data/exp_non_listeners_filtered.pickle')

In [None]:
df_non = pd.read_pickle('data/exp_non_listeners_filtered.pickle')
df_non.columns

In [None]:
df_non[df_non.include_as_control==1].customer_id.nunique()

In [None]:
gc.collect()

getting treatment group

In [None]:
# get customer population for album & playlist cohort
in_scope = df_track[df_track.first_stream_source.isin(['album','others_playlist'])][['customer_id','album_name','playlist_id']]
in_scope.head()

# slice the customers
df = df[df.customer_id.isin(in_scope.customer_id)]
# limit to albums & playlists population related to see you again track
df = df[(df.album_name.isin(in_scope.album_name)) | (df.playlist_id.isin(in_scope.playlist_id))]

In [None]:
# to get correct cohort group, use df track then marged to main df on customer_id
df_track['id'] = df_track['customer_id']
df_track = df_track.set_index('id')
df_track[['first_listen','first_listen_logtime']] = df_track.groupby(level=0)[['week','logtime']].min()
df_track['cohort_group'] = df_track['first_listen']
df_track['cohort_group'] = df_track['cohort_group'].astype('str')
df_track = df_track.reset_index()

In [None]:
df_track = df_track[['customer_id','cohort_group','first_listen_logtime','album_name','playlist_id','first_stream_source']]
df_track = df_track.drop_duplicates(subset='customer_id')

df_track.rename(columns={'album_name':'track_album', 'playlist_id':'track_playlist'},inplace=True)
df_track.head()

In [None]:
# keep only the channel the customer actually discovered the track through.
# Assign to a plain column label: row-wise apply returns a 1-D Series, which a
# one-element list key (df[['col']] = ...) rejects on recent pandas versions.
df_track['track_playlist'] = df_track.apply(clean_for_source_pl,axis=1)
df_track['track_album'] = df_track.apply(clean_for_source,axis=1)

In [None]:
# only get cohort group with significant user base
df_track = df_track[df_track.cohort_group.isin(['15','16','17'])]

In [None]:
df = df[df.customer_id.isin(df_track.customer_id)]
df = df.merge(df_track, on=['customer_id'],how='left')
df.head()

In [None]:
gc.collect()

In [None]:
df = df.drop(['first_stream_source_y'],axis=1)
df.rename(columns={'first_stream_source_x':'first_stream_source'},inplace=True)
df.head()

In [None]:
# save for reuse later
df.to_pickle('data/experiment_treatment_df_full_history.pickle')

In [None]:
gc.collect()

In [None]:
# split into album & playlist group

# get album & playlist customer
album_cust = df_track[df_track.first_stream_source=='album'].customer_id.unique()
pl_cust = df_track[df_track.first_stream_source=='others_playlist'].customer_id.unique()

# split the cohort & pick only cohort week 16
album = df[(df.customer_id.isin(album_cust)) & (df.cohort_group=='16')]
playlist = df[df.customer_id.isin(pl_cust) & (df.cohort_group=='16')]

album.customer_id.nunique(),playlist.customer_id.nunique()

#### feature engineering & data transformation

control groups

In [None]:
albums= df.track_album.dropna().unique().tolist()
playlists= df.track_playlist.dropna().unique().tolist()
playlists

In [None]:
# SPLIT CONTROL GROUP FOR PLAYLIST & ALBUM SEGMENT

# first get user
album_control_users = df_non[(df_non.album_name.isin(albums))].customer_id.unique()
pl_control_users = df_non[(df_non.playlist_id.isin(playlists))].customer_id.unique()

df_control_playlist = df_non[(~df_non.customer_id.isin(album_control_users))]

df_control_playlist.customer_id.nunique()


In [None]:
df_control_playlist = exp.get_avg_stream_count(df_control_playlist,mode='week')

In [None]:
gc.collect()

In [None]:
df_non.customer_id.nunique()

In [None]:
pl_control_sample = resample(df_control_playlist.drop_duplicates(subset='customer_id').customer_id,n_samples=11000, replace=False)
df_control_playlist = df_control_playlist[df_control_playlist.customer_id.isin(pl_control_sample)]

In [None]:
# SPLIT CONTROL GROUP FOR PLAYLIST & ALBUM SEGMENT

# first get user
album_control_users = df_non[(df_non.album_name.isin(album.track_album))].customer_id.unique()
pl_control_users = df_non[(df_non.playlist_id.isin(playlist.track_playlist))].customer_id.unique()

df_control_album =  df_non[df_non.customer_id.isin(album_control_users)]
df_control_playlist = df_non[(df_non.customer_id.isin(df_exp_control.customer_id)) & (~df_non.customer_id.isin(album_control_users))]

df_control_album.customer_id.nunique(), df_control_playlist.customer_id.nunique()

In [None]:
# get weekly stream count
df_control_playlist = exp.get_avg_stream_count(df_control_playlist,mode='week')
df_control_album = exp.get_avg_stream_count(df_control_album,mode='week')

In [None]:
df_control_playlist['group'] = 'playlist'
df_control_album['group'] = 'album'

In [None]:
pl_control_sample = resample(df_control_playlist.drop_duplicates(subset='customer_id').customer_id,n_samples=11000, replace=False)

In [None]:
df_control_playlist = df_control_playlist[df_control_playlist.customer_id.isin(pl_control_sample)]

In [None]:
df_control_playlist.customer_id.nunique()

In [None]:
df_non = pd.concat([df_control_playlist,df_control_album],ignore_index=True)

In [None]:
df_non.to_pickle('/project/data/exp_non_filtered_final_inchart.pickle')

In [None]:
# NOTE(review): get_pre_post_non_listeners_df is defined in a later cell of this
# notebook; that cell must have been executed before this one.
df_control_pre,df_control_post = get_pre_post_non_listeners_df(df_non)

In [None]:
df_non.head()

In [None]:
df_non.customer_id.nunique()

In [None]:
week16_active = df_non[df_non.week==16].customer_id.unique()
len(week16_active)

In [None]:
df_control_pre.head()

In [None]:
gc.collect()

In [None]:
df_exp_control = pd.concat([df_control_pre,df_control_post],ignore_index=True).drop_duplicates(subset=['customer_id','week'])
output = '/project/exp_output/exp_3007_nonlisteners.csv'
df_exp_control.to_csv(output,index=False)

In [None]:
df_exp_control.to_pickle('/project/exp_output/exp_3007_nonlisteners.pickle')

In [None]:
df_exp_control = pd.read_csv('/project/exp_output/exp_3007_nonlisteners.csv',index_col=False)

In [None]:
# make sure no overlap
df_exp_control[df_exp_control.customer_id.isin(album.customer_id)],df_exp_control[df_exp_control.customer_id.isin(playlist.customer_id)]

In [None]:
df_exp_control.head()

In [None]:
gc.collect()

treatment groups

In [None]:
# get weekly stream count
playlist = exp.get_avg_stream_count(playlist,mode='week')
album = exp.get_avg_stream_count(album,mode='week')

In [None]:
playlist.customer_id.nunique(), album.customer_id.nunique()

In [None]:
bucket = [playlist,album]
df_pre,df_post = exp.get_pre_post_df(df,bucket)

In [None]:
def get_pre_post_non_listeners_df(df, week=16, period=7, mode='day'):
    """Build the pre- and post-event frames for the control group (non-listeners).

    Parameters
    ----------
    df : pandas.DataFrame
        Stream-level rows of the control customers; needs ``week``,
        ``customer_id`` and the column named by ``mode``.
    week : int, optional
        Event week used for the split (rows with ``week <= week`` are "pre").
    period : int, optional
        Offset passed to ``set_observation_period_cg``.
    mode : str, optional
        Time column passed to ``set_observation_period_cg``.

    Returns
    -------
    tuple
        ``(df_pre, df_post)`` as returned by ``exp.get_prior_stats`` and
        ``exp.get_post_stats``.
    """
    # split bucket into pre and post
    df_pre, df_post = split_pre_post_nonlisteners(df, week=week)
    # for each half, set its own observation period
    # (previously the unsplit `df` was passed here, discarding the split)
    df_pre = set_observation_period_cg(df_pre, period=period, mode=mode)
    df_post = set_observation_period_cg(df_post, period=period, mode=mode)
    # get stats -- NOTE(review): behaviour of the exp.* helpers is not visible here
    df_pre = exp.get_prior_stats(df_pre)
    df_post = exp.get_post_stats(df_post)
    return df_pre, df_post

def split_pre_post_nonlisteners(df,week=16):
    '''
    Split a frame into pre- and post-event rows based on the week chosen.

    Rows with ``week <= week`` (the event week included) form the "before"
    frame and get ``is_post = 0``; rows with ``week > week`` form the "after"
    frame and get ``is_post = 1``. The input frame is not modified.

    Returns
    -------
    tuple
        ``(dataframe_before, dataframe_after)``
    '''
    # .assign builds new frames, so no column is ever set on a filtered slice
    dataframe_before = df[df.week<=week].assign(is_post=0)
    dataframe_after = df[df.week>week].assign(is_post=1)
    return dataframe_before,dataframe_after

def set_observation_period_cg(df,period=7,mode='day'):
    '''
    Observation window for the control group (non-listeners).

    Adds, in place, ``next_period`` / ``prev_period`` (the integer value of
    the ``mode`` column plus / minus ``period``) and, per customer,
    ``cut_time_lower`` / ``cut_time_upper`` (the customer's minimum
    ``prev_period`` / ``next_period``). Returns the same frame.
    '''
    time_value = df[mode].astype('int')
    df['next_period'] = time_value + period
    df['prev_period'] = time_value - period

    by_customer = df.groupby('customer_id')
    upper_bound = by_customer['next_period'].min().to_dict()
    lower_bound = by_customer['prev_period'].min().to_dict()

    df['cut_time_lower'] = df['customer_id'].map(lower_bound)
    df['cut_time_upper'] = df['customer_id'].map(upper_bound)
    return df

In [None]:
gc.collect()

In [None]:
df_pre.head()

In [None]:
# NOTE(review): df_pre2 is not assigned anywhere in the visible notebook; this
# cell only works with leftover kernel state -- confirm or remove.
df_pre2.head()

In [None]:
df_exp['group'].unique()

In [None]:
df_pre.to_pickle('/project/exp_output/exp_pre_full.pickle')
df_post.to_pickle('/project/exp_output/exp_post_full.pickle')

In [None]:
df_exp = pd.concat([df_pre,df_post],ignore_index=True).drop_duplicates(subset=['customer_id','week'])
output = '/project/exp_output/exp_3007.csv'
df_exp.to_csv(output,index=False)

In [None]:
df_exp.head()

In [None]:
gc.collect()

#### data transformation

Now we have `df_exp`, which consists of the full streaming history of each customer within their respective discovery channels. To do time-series analysis, we need to slice the cohort again and then transform the data.

In [None]:
# control groups
t_playlist_control = df_exp_control[(df_exp_control.customer_id.isin(df_control_playlist.customer_id))].groupby('week')['weekly_stream_count'].mean()
t_album_control = df_exp_control[(df_exp_control.customer_id.isin(df_control_album.customer_id))].groupby('week')['weekly_stream_count'].mean()

# treatment groups
t_playlist = df_exp[(df_exp.customer_id.isin(playlist.customer_id))].groupby('week')['weekly_stream_count'].mean()
t_album = df_exp[(df_exp.customer_id.isin(album.customer_id))].groupby('week')['weekly_stream_count'].mean()

df_t = pd.concat([t_album,t_playlist,t_playlist_control, t_album_control],axis=1)
df_t.columns = ['album','playlist','playlist_control', 'album_control']
df_t = df_t.reset_index()


In [None]:
df_t['treatment_diff'] = df_t.playlist - df_t.album
df_t['playlist_effect'] = df_t.playlist - df_t.playlist_control
df_t['album_effect'] = df_t.album - df_t.album_control

In [None]:
df_t

#### viz - first look

In [None]:
sns.set_style('white')
plt.figure(figsize=(12,7))
ax = plt.subplot()
plt.plot( 'week', 'album', data=df_t,  color='darkblue', linewidth=2, label='treatment group')
plt.plot( 'week', 'album_control', data=df_t, linestyle='-.', color='orange', linewidth=2, label='control group')
plt.xlabel('week',fontsize='large')
plt.ylabel('average stream count',fontsize='large')
plt.legend(fontsize='large',loc=4)
ax.set_ylim([0,30])
ax.set_xticks(np.arange(12,23))
plt.title('Album Segments Trend Comparison',fontsize=14)
plt.savefig('/project/fig/album_trend_plots.png',dpi=150)
plt.show()

In [None]:
sns.set_style('white')
plt.figure(figsize=(12,7))
ax = plt.subplot()
plt.plot( 'week', 'playlist', data=df_t,  color='darkblue', linewidth=2,label='treatment group')
plt.plot( 'week', 'playlist_control', data=df_t, linestyle='-.', color='orange', linewidth=2, label='control group')
plt.xlabel('week',fontsize='large')
plt.legend(fontsize='large',loc=4)
ax.set_ylim([0,30])
ax.set_xticks(np.arange(12,23))
plt.ylabel('average stream count',fontsize='large')
plt.title('Playlist Segments Trend Comparison',fontsize=14)
plt.savefig('/project/fig/playlist_trend_plots.png',dpi=150)
plt.show()

In [None]:
sns.set_style('white')
plt.figure(figsize=(12,7))
ax = plt.subplot()
plt.plot( 'week', 'album', data=df_t,  color="#3498db", linewidth=2, label='album')
plt.plot( 'week', 'playlist', data=df_t, linestyle='--', color="#3498db", linewidth=2, label='playlist')
plt.xlabel('week',fontsize='large')
plt.ylabel('average stream count',fontsize='large')
plt.legend(fontsize='large',loc=4)
ax.set_ylim([0,30])
ax.set_xticks(np.arange(12,23))
plt.title('Treatment Groups Trend Comparison',fontsize=14)
plt.savefig('/project/fig/treatments_trend_plots.png',dpi=150)
plt.show()

#### BETTER PLOTS

In [None]:
plt.figure()
fig, ax = plt.subplots(2, 1, figsize=(14,12))
ax[0].plot( 'week', 'album', data=df_t,  color='darkblue', linewidth=2,label='treatment group')
ax[0].plot( 'week', 'album_control', data=df_t, linestyle='--', color='orange', linewidth=2, label='control group')
ax[1].plot('week', 'album_growth_prcnt', data=df_t,  color="darkblue", linewidth=2, label='album',marker='o')
ax[1].plot('week', 'album_control_growth_prcnt', data=df_t, linestyle='--', color="orange", linewidth=2, label='playlist',marker='o')

# label, xticks, limit, legend
ax[0].set_ylabel('average stream count',fontsize=12)
ax[1].set_ylabel('rate of change',fontsize=12)
ax[1].set_xlabel('week',fontsize=12)
ax[0].set_xticks(np.arange(12,23))
ax[1].set_xticks(np.arange(12,23))
ax[0].set_ylim([0,30])
ax[1].set_ylim([-1,1])
plt.setp(ax[0].get_xticklabels(), visible=False)
ax[0].tick_params(labelsize=12)
ax[1].tick_params(labelsize=12)
ax[0].legend(fontsize=13,loc=3)


fig.tight_layout()
fig.savefig('/project/fig/albums_trend_plots_combined.png',dpi=150)


In [None]:
plt.figure()
fig, ax = plt.subplots(2, 1, figsize=(14,12))
ax[0].plot( 'week', 'playlist', data=df_t,  color='darkblue', linewidth=2,label='treatment group')
ax[0].plot( 'week', 'playlist_control', data=df_t, linestyle='--', color='orange', linewidth=2, label='control group')
ax[1].plot('week', 'playlist_growth_prcnt', data=df_t,  color="darkblue", linewidth=2, label='album',marker='o')
ax[1].plot('week', 'playlist_control_growth_prcnt', data=df_t, linestyle='--', color="orange", linewidth=2, label='playlist',marker='o')

# label, xticks, limit, legend
ax[0].set_ylabel('average stream count',fontsize=12)
ax[1].set_ylabel('rate of change',fontsize=12)
ax[1].set_xlabel('week',fontsize=12)
ax[0].set_xticks(np.arange(12,23))
ax[1].set_xticks(np.arange(12,23))
ax[0].set_ylim([0,30])
ax[1].set_ylim([-1,1])
plt.setp(ax[0].get_xticklabels(), visible=False)
ax[0].tick_params(labelsize=12)
ax[1].tick_params(labelsize=12)
ax[0].legend(fontsize=13,loc=3)


fig.tight_layout()
fig.savefig('/project/fig/playlists_trend_plots_combined.png',dpi=150)


##### using alternative control groups
based on cohort after the chosen one

In [None]:
df_alt_control = pd.read_pickle('/project/exp_output/cohort_18_onwards_for_control.pickle')
gc.collect()

In [None]:
df_alt_control[(df_alt_control.group=='playlist')].groupby('week')['weekly_stream_count'].mean()

In [None]:
df_t.columns

In [None]:
# remove alternative-control columns left over from a previous run, if any;
# errors='ignore' keeps this cell runnable on a df_t that never had them
df_t = df_t.drop(['playlist_control_alt',
       'album_control_alt'],axis=1,errors='ignore')

In [None]:
t_playlist_control = df_alt_control[(df_alt_control.group=='playlist')].groupby('week')['weekly_stream_count'].mean()
t_album_control = df_alt_control[(df_alt_control.group=='album')].groupby('week')['weekly_stream_count'].mean()
t_album_control

In [None]:
t_playlist_control.reset_index()

In [None]:
t_album_control.reset_index()

In [None]:
# append to df_t
t_album_control = df_alt_control[(df_alt_control.group=='album')].groupby('week')['weekly_stream_count'].mean().reset_index()

df_t = df_t.merge(t_album_control, on='week', how='left')
df_t.rename(columns={ 'weekly_stream_count': "album_control_alt"},inplace=True)
df_t


In [None]:
df_t['album_effect_alt'] = df_t.album - df_t.album_control_alt

In [None]:
# NOTE(review): get_weekly_growth is defined in a later cell of this notebook
# (t-test section); that cell must have been executed before this one.
df_t = get_weekly_growth(df_t, 'album_control_alt')
# keep only the first 11 rows of df_t
df_t = df_t.iloc[:11]

In [None]:
df_t.fillna(0,inplace=True)
df_t

In [None]:
plt.figure()
fig, ax = plt.subplots(2, 1, figsize=(14,12))
ax[0].plot( 'week', 'album', data=df_t,  color='darkblue', linewidth=2,label='treatment group')
ax[0].plot( 'week', 'album_control_alt', data=df_t, linestyle='--', color='orange', linewidth=2, label='control group')
ax[1].plot('week', 'album_growth_prcnt', data=df_t,  color="darkblue", linewidth=2, label='album',marker='o')
ax[1].plot('week', 'album_control_alt_growth_prcnt', data=df_t, linestyle='--', color="orange", linewidth=2, label='playlist',marker='o')

# label, xticks, limit, legend
ax[0].set_ylabel('average stream count',fontsize=12)
ax[1].set_ylabel('rate of change',fontsize=12)
ax[1].set_xlabel('week',fontsize=12)
ax[0].set_xticks(np.arange(12,23))
ax[1].set_xticks(np.arange(12,23))
ax[0].set_ylim([0,30])
ax[1].set_ylim([-1,1])
plt.setp(ax[0].get_xticklabels(), visible=False)
ax[0].tick_params(labelsize=12)
ax[1].tick_params(labelsize=12)
ax[0].legend(fontsize=13,loc=3)


fig.tight_layout()
fig.savefig('/project/fig/album_alt_trend_plots_combined.png',dpi=150)


---

In [None]:
# two stacked panels: weekly averages (top) and week-over-week rate of change (bottom).
# plt.subplots already creates the figure, so no separate plt.figure() call is needed.
fig, ax = plt.subplots(2, 1, figsize=(14,12))
ax[0].plot( 'week', 'album', data=df_t,  color="#3498db", linewidth=2, label='album')
ax[0].plot( 'week', 'playlist', data=df_t, linestyle='--', color="#3498db", linewidth=2, label='playlist')
ax[1].plot('week', 'album_growth_prcnt', data=df_t,  color="#3498db", linewidth=2, label='album',marker='o')
# draw on the bottom panel explicitly instead of relying on pyplot's "current axes"
ax[1].plot('week', 'playlist_growth_prcnt', data=df_t, linestyle='--', color="#3498db", linewidth=2, label='playlist',marker='o')

# label, xticks, limit, legend
ax[0].set_ylabel('average stream count',fontsize=12)
ax[1].set_ylabel('rate of change',fontsize=12)
ax[1].set_xlabel('week',fontsize=12)
ax[0].set_xticks(np.arange(12,23))
ax[1].set_xticks(np.arange(12,23))
ax[0].set_ylim([0,30])
ax[1].set_ylim([-1,1])
plt.setp(ax[0].get_xticklabels(), visible=False)
ax[0].tick_params(labelsize=12)
ax[1].tick_params(labelsize=12)
ax[0].legend(fontsize=13,loc=3)


fig.tight_layout()
fig.savefig('/project/fig/treatments_trend_plots_combined.png',dpi=150)


From visual inspection, the trends (most importantly up to week 16, which is our track-discovery week) look parallel. Now let's run a t-test to confirm.

#### statistical analysis with t-test

In [None]:
# calculate week-to-week changes in each cohort
def get_weekly_growth(df_t, col_name):
    """Add row-to-row growth columns for ``col_name``.

    Two columns are written to ``df_t`` in place: ``<col_name>_growth_unit``
    (difference to the previous row) and ``<col_name>_growth_prcnt``
    (fractional change from the previous row). The same frame is returned.
    """
    absolute_col = col_name + '_growth_unit'
    relative_col = col_name + '_growth_prcnt'
    values = df_t[col_name]
    df_t[absolute_col] = values.diff()
    df_t[relative_col] = values.pct_change()
    return df_t

In [None]:
df_t = get_weekly_growth(df_t, 'album')
df_t = get_weekly_growth(df_t, 'playlist')
df_t = get_weekly_growth(df_t, 'album_control')
df_t = get_weekly_growth(df_t, 'playlist_control')
df_t

In [None]:
df_t.to_csv('/project/exp_output/df_t.csv',index=False)

In [None]:
df_t = pd.read_csv('/project/exp_output/df_t.csv',index_col=False)
df_t

In [None]:
df_t.fillna(0,inplace=True)

In [None]:
gc.collect()

In [None]:
df_t.to_csv('/project/exp_output/df_t_new_2.csv',index=False)

In [None]:
df_t

**we only have week 12-15 as pre-event period. So we're gonna run t-test of average growth during 3 pre-event periods (df=2, alpha =0.05).**

between playlist treatment - control

In [None]:
df_t[['week','playlist_growth_prcnt','playlist_control_growth_prcnt']].iloc[1:4]

In [None]:
treatment_growth = df_t.playlist_growth_prcnt.tolist()[1:4]
control_growth = df_t.playlist_control_growth_prcnt.tolist()[1:4]
stats.ttest_ind(treatment_growth,control_growth,equal_var = False)

between album treatment - control

In [None]:
df_t[['week','album_growth_prcnt','album_control_alt_growth_prcnt']].iloc[1:4]

In [None]:
treatment_growth = df_t.album_growth_prcnt.tolist()[1:4]
control_growth = df_t.album_control_alt_growth_prcnt.tolist()[1:4]
# Welch's t-test (equal_var=False), consistent with the playlist-vs-control and
# album-vs-playlist comparisons in this section
stats.ttest_ind(treatment_growth,control_growth,equal_var = False)

between album & playlist

In [None]:
df_t.playlist_growth_prcnt.tolist()[1:4]

In [None]:
treatment_pl = df_t.playlist_growth_prcnt.tolist()[1:4]
treatment_al = df_t.album_growth_prcnt.tolist()[1:4]
stats.ttest_ind(treatment_pl,treatment_al,equal_var = False)

**Based on the t-test results, we fail to reject H0 for each pair: there is no statistically significant difference in average growth between the groups.**

### track engagement / repeat plays -> irrelevant for parallel trend checking

for this metric we can only compare treatment effects (no control group of non-listeners)

In [None]:
TRACK_ID = 'f72fa60c8d9848a393d8ac4bbaa866ef'

In [None]:
df = pd.read_pickle('/project/samples/new/sample_mixed_105k_cleaned2.pickle')

In [None]:
# use only track population
df = df[df.track_id==TRACK_ID]

In [None]:
gc.collect()

#### data preparation

In [None]:
df = set_cohort_group(df)

In [None]:
# pick only cohort week 16
df = df[df.cohort_group=='16']

In [None]:
# get album & playlist customer
album_cust = df[df.first_stream_source=='album'].customer_id.unique()
pl_cust = df[df.first_stream_source=='others_playlist'].customer_id.unique()

In [None]:
len(album_cust), len(pl_cust)

In [None]:
album = df[df.customer_id.isin(album_cust)]
playlist = df[df.customer_id.isin(pl_cust)]

#### feature engineering

In [None]:
# get weekly stream count
playlist = exp.get_avg_stream_count(playlist,mode='week')
album = exp.get_avg_stream_count(album,mode='week')

In [None]:
bucket = [playlist,album]
df_pre,df_post = exp.get_pre_post_df(df,bucket)

In [None]:
gc.collect()

In [None]:
df_exp = pd.concat([df_pre,df_post],ignore_index=True).drop_duplicates(subset=['customer_id','week'])
output = '/project/exp_output/exp_track_engagement_3107.csv'
df_exp.to_csv(output,index=False)

#### data transformation

In [None]:
# treatment groups
t_playlist = df_exp[(df_exp.customer_id.isin(playlist.customer_id))].groupby('week')['weekly_stream_count'].mean()
t_album = df_exp[(df_exp.customer_id.isin(album.customer_id))].groupby('week')['weekly_stream_count'].mean()

df_t = pd.concat([t_album,t_playlist],axis=1)
df_t.columns = ['album','playlist']
df_t = df_t.reset_index()


In [None]:
df_t['treatment_diff'] = df_t.playlist - df_t.album
df_t

#### viz

In [None]:
sns.set_style('white')
plt.figure(figsize=(12,7))
plt.plot( 'week', 'album', data=df_t,  color="#3498db", linewidth=2, label='album')
plt.plot( 'week', 'playlist', data=df_t, linestyle='--', color="#3498db", linewidth=2, label='playlist')
plt.xlabel('week',fontsize='large')
plt.ylabel('average stream count',fontsize='large')
plt.legend(fontsize='large',loc=1)
plt.title('Post-discovery Treatment Comparison',fontsize=14)
plt.savefig('/project/fig/treatments_trend_plots_track.png',dpi=150)
plt.show()

It looks like the effect only holds until week 17. After that, the result is consistent with the cohort analysis of the track engagement rate.
Based on this, we can pick just 1-2 weeks as the post-discovery period for DID estimation. Beyond that we won't get meaningful insights, since the effect will have faded.

In [None]:
df_t.to_csv('/project/exp_output/df_t_track_engagement.csv',index=False)

Noise investigation for playlist control group

In [None]:
df_exp_control

In [None]:
t_playlist_control = df_exp_control[(df_exp_control.group=='playlist')].groupby('week')['weekly_stream_count'].mean()
t_playlist_control

In [None]:
# make sure no overlap with playlist customers
df_exp_control[(df_exp_control.group=='playlist')]

In [None]:
playlist['treated'] = 1

In [None]:
playlist.to_pickle('/project/exp_output/paylist_treatment_did_all_vars.pickle')

In [None]:
album['treated'] = 1

In [None]:
album.to_pickle('/project/exp_output/album_treatment_did_all_vars.pickle')

In [None]:
album_control_alt = df_alt_control[(df_alt_control.group=='album')]

In [None]:
album_control_alt['treated'] = 0

In [None]:
album_control_alt.to_pickle('/project/exp_output/album_control_did_all_vars.pickle')

In [None]:
playlist_control = df_exp_control[df_exp_control.group=='playlist']

In [None]:
playlist_control['treated'] = 0

In [None]:
playlist_control.to_pickle('/project/exp_output/playlist_control_did_all_vars.pickle')

In [None]:
gc.collect()

In [None]:
album_control_alt.drop_duplicates(subset='customer_id').week.value_counts()

In [None]:
df_non = pd.read_pickle('/project/data/exp_non_filtered_final_inchart.pickle')

In [None]:
df_non.drop_duplicates(subset='customer_id').group.value_counts()

In [None]:
gc.collect()

In [None]:
df_non = df_non[df_non.group=='playlist']

In [None]:
df_non.week.value_counts().sort_values()

In [None]:
df_non.drop_duplicates(subset=['customer_id','week']).week.value_counts().sort_values()

In [None]:
# the original cell was an unfinished expression (syntax error); show a preview instead
df_non.head()

In [None]:
sample = resample(df_non[df_non.week==16].drop_duplicates(subset='customer_id').customer_id,n_samples=10700, replace=False)

In [None]:
df_non = df_non[df_non.customer_id.isin(sample)]
df_non.drop_duplicates(subset=['customer_id','week']).week.value_counts().sort_values()

In [None]:
df_non.customer_id.nunique()

In [None]:
# element-wise ratio of two equally ordered sequences
# NOTE(review): t_s and n_c are not assigned anywhere in the visible notebook;
# this cell depends on leftover kernel state -- define them or remove the cell.
[x/y for x, y in zip(t_s, n_c)]

In [None]:
# playlist treatment group
treatment = pd.read_pickle('/project/exp_output/paylist_treatment_did_all_vars.pickle')

In [None]:
treatment.head()

In [None]:
df_non = df_non.drop(['weekly_stream_count'],axis=1)

In [None]:
gc.collect()

In [None]:
df_non.columns

In [None]:
def get_avg_stream_count(df,mode='week'):
    """Attach the per-customer, per-period stream count to every row.

    Counts the non-null values of the ``index`` column for each
    ``(customer_id, mode)`` pair and left-merges the result back as
    ``<mode>ly_stream_count``. Works for either the pre or the post frame,
    with ``'day'`` or ``'week'`` as ``mode``. Returns a new frame.
    """
    keys = ['customer_id', mode]
    col_name = mode + 'ly_stream_count'
    per_period = (
        df.groupby(keys)['index']
        .count()
        .rename(col_name)
        .reset_index()
    )
    return df.merge(per_period, on=keys, how='left')
df_non =get_avg_stream_count(df_non,mode='week')

In [None]:
# control groups
t_playlist_control = df_control_playlist.drop_duplicates(subset=['customer_id','week']).groupby('week')['weekly_stream_count'].mean()

# treatment groups
t_playlist = treatment.drop_duplicates(subset=['customer_id','week']).groupby('week')['weekly_stream_count'].mean()

df_t = pd.concat([t_playlist,t_playlist_control],axis=1)
df_t.columns = ['playlist','playlist_control']
df_t = df_t.reset_index()
df_t[['week','playlist_control']]

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
 
# Make data

channel = [15.31,15.20,14.25,14.51,13.81,15.42]
channels = [27.24, 26.34,25.56,26.42,25.63,26.78]
others = [round(a-b,2) for a,b in zip(channels,channel)]
others

In [None]:
data = pd.DataFrame({ 'c':channel, 'others':others }, index=range(1,7))



In [None]:
data

In [None]:
# stacked absolute stream counts: own discovery channel vs. the remaining streams
plt.figure(figsize=(8,8))
pal = ["#009086", "#002B3C"]
x=range(1,7)
y=[ channel,others]

plt.stackplot(x,y, labels=["customer's own discovery channel",'other streams within discovery channel lists'],colors=pal)
plt.legend(loc='upper left',fontsize='large')
# the title was left unfinished ('share of '); axis labels follow the share plot in the next cell
plt.title("Stream Count within Playlist Customer's Discovery Channel",fontsize=13)
plt.ylabel('stream count',fontsize=12)
plt.xlabel('week after track discovery',fontsize=12)
plt.ylim(0,35)
plt.show()



In [None]:
#transform the data from raw data to percentage (fraction)
data_perc = data.divide(channels,axis=0)
 
# Make the plot
plt.figure(figsize=(8,8))
colors=pal
plt.stackplot(range(1,7),  data_perc["c"],  data_perc["others"],  labels=["customer's own discovery channel",'other streams within discovery channel lists'],colors=pal)
plt.legend(loc=4,fontsize=12)
plt.margins(0,0)
plt.title("Share of Stream Count within Playlist Customer's Discovery Channel",fontsize=13)
plt.ylabel('share (%)',fontsize=12)
plt.xlabel('week after track discovery',fontsize=12)
plt.show()


In [None]:
channel = [6.26,4.68,3.68,3.31,3.01,2.76]
channels = [20.62,20.20,19.85,21.27,21.36,21.6]
others = [round(a-b,2) for a,b in zip(channels,channel)]
others

In [None]:
data = pd.DataFrame({ 'c':channel, 'others':others }, index=range(1,7))


data_perc = data.divide(channels,axis=0)
 
# Make the plot
plt.figure(figsize=(8,8))
colors=pal
plt.stackplot(range(1,7),  data_perc["c"],  data_perc["others"],  labels=["customer's own discovery channel",'other streams within discovery channel lists'],colors=pal)
plt.legend(loc=4,fontsize=12)
plt.margins(0,0)
plt.title("Share of Stream Count within Album Customer's Discovery Channel",fontsize=13)
plt.ylabel('share (%)',fontsize=12)
plt.xlabel('week after track discovery',fontsize=12)
plt.show()


In [None]:

pal = ["#009086", "#002B3C"]
x=range(1,7)
y=[ channel,others]

plt.stackplot(x,y, labels=["customer's own discovery channel",'other streams within discovery channel lists'],colors=pal)
plt.legend(loc='upper left')
plt.show()

