In [None]:
import pandas as pd
import random
import numpy as np
from operator import itemgetter
from collections import Counter
import datetime
from scipy import interp
import warnings
warnings.filterwarnings('ignore')
import gc

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# COHORT ANALYSIS
def set_cohort_group(df,mode='week'):
    '''
    Identify each customer's cohort group from the period of their first stream.

    Parameters
    ----------
    df : pandas.DataFrame
        Stream-level rows. Must contain ``customer_id`` and the period column
        selected by ``mode`` (``week`` for ``mode='week'``, ``day`` for
        ``mode='day'``).
    mode : {'week', 'day'}
        Granularity used to find the first/last listen.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with a fresh RangeIndex, a
        leading ``id`` column duplicating ``customer_id``, and three added
        columns: ``first_listen`` / ``last_listen`` (per-customer min / max of
        the period column) and ``cohort_group`` (``first_listen`` as a string).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'week'`` nor ``'day'``.

    NOTE(review): in day mode this reads the ``day`` column, whereas
    ``get_cohort_table`` groups by ``date`` -- confirm which column is intended.
    '''
    period_columns = {'week': 'week', 'day': 'day'}
    if mode not in period_columns:
        raise ValueError("mode must be 'week' or 'day', got {!r}".format(mode))
    period_col = period_columns[mode]

    # Work on a copy: callers routinely pass filtered slices of a larger frame,
    # and writing new columns into those would leak back or trigger
    # SettingWithCopy warnings.
    out = df.copy()

    # transform() broadcasts the per-customer min/max back onto every row.
    per_customer = out.groupby('customer_id')[period_col]
    first_listen = per_customer.transform('min')
    last_listen = per_customer.transform('max')
    out['first_listen'] = first_listen
    out['last_listen'] = last_listen
    out['cohort_group'] = out['first_listen'].astype('str')

    # Keep the historical output layout: 'id' as the first column.
    if 'id' in out.columns:
        out = out.drop(columns='id')
    out.insert(0, 'id', out['customer_id'])
    return out.reset_index(drop=True)

def cohort_period(df):
    '''
    Number the rows of ``df`` 1..len(df) in their current order.

    The numbering is written to a ``cohort_period`` column on ``df`` itself
    (in place) and the same object is returned, which makes the function
    usable as a ``groupby(...).apply`` callback.
    '''
    n_rows = len(df)
    df['cohort_period'] = np.arange(1, n_rows + 1)
    return df

def get_cohort_table(df,mode='week'):
    '''
    Count distinct customers and streams per (cohort group, period).

    Parameters
    ----------
    df : pandas.DataFrame
        Stream-level rows containing ``cohort_group``, ``customer_id`` and the
        period column (``week`` for ``mode='week'``, ``date`` for
        ``mode='day'``). The frame is not modified.
    mode : {'week', 'day'}
        Which period column to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``cohort_group``, period) with columns ``total_customers``
        (distinct ``customer_id`` values) and ``total_stream_count`` (number of
        rows with a non-null ``customer_id``).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'week'`` nor ``'day'``.

    NOTE(review): day mode groups by ``date`` while ``set_cohort_group`` reads
    ``day`` -- confirm which column is intended.
    '''
    period_columns = {'week': 'week', 'day': 'date'}
    if mode not in period_columns:
        raise ValueError("mode must be 'week' or 'day', got {!r}".format(mode))

    # Aggregate customer_id twice (distinct count + row count) instead of
    # writing a duplicate dummy column into the caller's frame.
    per_period = df.groupby(['cohort_group', period_columns[mode]])['customer_id']
    cohort = per_period.agg(['nunique', 'count'])
    return cohort.rename(columns={'nunique': 'total_customers',
                                  'count': 'total_stream_count'})

def get_cohort(df):
    '''
    Summarise each cohort group.

    Groups ``df`` by ``cohort_group`` and returns a frame with the number of
    distinct ``customer_id`` values (``total_customers``) and the aggregated
    ``user_stream_count`` values (``total_streams``).
    '''
    grouped = df.groupby(['cohort_group'])
    # NOTE(review): pd.Series.unique collects the distinct user_stream_count
    # values per cohort rather than adding them up, although the column is
    # renamed to 'total_streams' below -- confirm whether a sum was intended.
    cohort = grouped.agg({'customer_id': pd.Series.nunique,'user_stream_count': pd.Series.unique})
    # rename columns
    cohort.rename(columns = {'customer_id': 'total_customers','user_stream_count': 'total_streams'},inplace=True)
    return cohort

def get_retention(cohorts):
    '''
    Build the retention matrix (share of the cohort still active per period).

    ``cohorts`` is re-indexed IN PLACE by (``cohort_group``, ``cohort_period``);
    whatever levels were in its index before become ordinary columns. Each
    cohort's ``total_customers`` is then divided by that cohort's first-row
    value.

    Returns a frame with one row per ``cohort_period`` and one column per
    ``cohort_group``.
    '''
    # Deliberately in place: the caller's frame ends up keyed by cohort period.
    cohorts.reset_index(inplace=True)
    cohorts.set_index(['cohort_group', 'cohort_period'], inplace=True)

    customers = cohorts['total_customers']
    # The first row of every cohort is used as the cohort's size.
    initial_size = customers.groupby(level='cohort_group').first()
    per_period = customers.unstack('cohort_group')
    return per_period.div(initial_size, axis=1)

def export_retention_chart(retention,show_plot=False,save=True,
                           filename='retention.png',
                           title='Cohorts Analysis: Listeners of "See You Again" Retention'):
    '''
    Draw the retention matrix as a heatmap and optionally save / show it.

    Parameters
    ----------
    retention : pandas.DataFrame
        Retention matrix with periods as rows and cohort groups as columns
        (it is transposed for plotting so cohorts run down the y-axis).
    show_plot : bool
        Call ``plt.show()`` after drawing.
    save : bool
        Write the figure to ``filename`` at 150 dpi.
    filename : str
        Output path; defaults to the historical ``'retention.png'`` so that
        existing callers keep their behaviour. Pass a distinct name when
        charting several cohorts, otherwise each call overwrites the last.
    title : str
        Figure title.
    '''
    sns.set(style='white', font_scale=1)
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.set_title(title)

    by_cohort = retention.T
    # Mask empty cells (periods a cohort has not reached) so they stay blank.
    sns.heatmap(by_cohort, mask=by_cohort.isnull(), annot=True, fmt='.0%',
                cmap='Blues', ax=ax)
    ax.set_ylabel('Cohort Group (First Week)')
    ax.set_xlabel('Cohort Period (Week After Finding the Track)')

    # Save before showing so the file is written from the fully drawn figure
    # regardless of how the active backend treats figures after show().
    if save:
        fig.savefig(filename, dpi=150)
    if show_plot:
        plt.show()
    
def cohort_analysis(df,show_plot=False,mode='week'):
    '''
    Run the full cohort pipeline: assign cohorts, build the cohort table,
    number the periods, compute retention and export the heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Stream-level rows (see ``set_cohort_group`` for required columns).
    show_plot : bool
        Forwarded to ``export_retention_chart``.
    mode : {'week', 'day'}
        Period granularity, forwarded to the cohort helpers.

    Returns
    -------
    (ret, cohort)
        ``ret`` is the retention matrix; ``cohort`` is the cohort table, which
        ``get_retention`` re-indexes in place by (``cohort_group``,
        ``cohort_period``).
    '''
    df = set_cohort_group(df,mode=mode)
    cohort = get_cohort_table(df,mode=mode)
    # group_keys=False keeps the original (cohort_group, period) index. Without
    # it, pandas >= 2.0 prepends the group key as an extra index level and the
    # reset_index() inside get_retention fails on the duplicated name.
    cohort = cohort.groupby(level=0, group_keys=False).apply(cohort_period)
    ret = get_retention(cohort)
    export_retention_chart(ret,show_plot=show_plot)
    return ret, cohort

def compare_cohort_retention(cohort_name_list,ret_list,cohort_group='1'):
    '''
    Plot one retention curve per named retention matrix for a single cohort.

    Parameters
    ----------
    cohort_name_list : list of str
        Legend labels, paired positionally with ``ret_list``.
    ret_list : list of pandas.DataFrame
        Retention matrices (periods as rows, cohort groups as columns).
    cohort_group : str
        Column (cohort) to plot from every matrix. Defaults to ``'1'``, the
        value that used to be hard-coded.
    '''
    # prepare viz
    sns.set(style='white', font_scale=1)
    # A single figure only: the previous extra plt.figure() call left an empty
    # figure behind on every invocation.
    fig, ax = plt.subplots(figsize=(10,6))
    frames = dict(zip(cohort_name_list, ret_list))
    ax.set_prop_cycle('color',plt.cm.tab10(np.linspace(0,1,9)))

    for name, retention in frames.items():
        ax.plot(retention[[cohort_group]],label=name)

    legend = ax.legend(loc=1, shadow=True, fontsize='large')

    # Put a nicer background color on the legend.
    legend.get_frame().set_facecolor('#00FFCC')
    plt.title('Cohorts: Listeners Retention')
    plt.xticks(np.arange(1, 13.1, 1))
    plt.xlim(1, 13)
    plt.ylabel('Retention Rate');
    plt.xlabel('Cohort Period (Week)')
    plt.show()

    
def clean_for_source(row):
    '''
    Return the row's ``track_album`` when the first stream came from an album;
    return None for every other first-stream source.
    '''
    came_from_album = row['first_stream_source'] == 'album'
    return row['track_album'] if came_from_album else None
    
def clean_for_source_pl(row):
    '''
    Return the row's ``track_playlist`` when the first stream came from
    someone else's playlist; return None for every other first-stream source.
    '''
    came_from_playlist = row['first_stream_source'] == 'others_playlist'
    return row['track_playlist'] if came_from_playlist else None

In [None]:
TRACK_ID = 'f72fa60c8d9848a393d8ac4bbaa866ef'

In [None]:
df = pd.read_pickle('/project/samples/new/sample_mixed_105k_cleaned2.pickle')

In [None]:
# use only track population
df = df[df.track_id==TRACK_ID]

In [None]:
gc.collect()

## Cohort Analysis

### Retention - How likely are they to keep listening to "See You Again"?

In [None]:
ret,cohort = cohort_analysis(df,show_plot=True,mode='week')

In [None]:
ret_album,cohort_album = cohort_analysis(df[df.first_stream_source=='album'],show_plot=True,mode='week')
ret_playlist,cohort_playlist = cohort_analysis(df[df.first_stream_source=='others_playlist'],show_plot=True,mode='week')

Comparing cohorts based on behavior

In [None]:
ret_album

In [None]:
# pick the cohort week for plot
df.week.value_counts()

In [None]:
df.columns

In [None]:
a = pd.DataFrame(df.drop_duplicates(subset='customer_id').week.value_counts()).reset_index()
a.columns = ['week','Cohort Size']
a = a.sort_values(by='week')
a

In [None]:
df_cust = df.drop_duplicates(subset='customer_id')

In [None]:
df_cust[df_cust.week==16].first_stream_source.value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(6,8))
# Draw on the axes created above and set the title BEFORE saving; previously
# the title was added after savefig, so the exported PNG had no title.
sns.heatmap(a, annot=True,cmap='Blues',fmt='.0f', ax=ax)
ax.set_title('Cohort Size')
fig.savefig('retention_cohort_size.png',dpi=150)

In [None]:
sns.set_style('whitegrid')
# One figure only (the extra plt.figure() call used to leave an empty figure).
fig, ax = plt.subplots(figsize=(14,8))
frames = {'album':ret_album, 'playlist':ret_playlist}
ax.set_prop_cycle('color',plt.cm.tab10(np.linspace(0,1,9)))
lines = ['-', '--','-.']
# Pair each retention matrix with a line style; cohort '16' is the one with
# the largest user base.
for (k, v), style in zip(frames.items(), lines):
    ax.plot(v[['16']],label=k,linestyle=style)

legend = ax.legend(loc=1, shadow=True, fontsize='large')

legend.get_frame().set_facecolor('#00FFCC')

plt.title('Track Retention Rate by First Stream Source', fontsize='large')
plt.xticks(np.arange(1, 7.1, 1))
plt.xlim(1, 7)
plt.ylim(0, 1)
# Pin the tick positions before relabelling them, so the percentage labels
# cannot drift if matplotlib chooses different automatic ticks.
ax.set_yticks(np.linspace(0, 1, 6))
ax.set_yticklabels(['0', '20%', '40%', '60%', '80%', '100%'])
plt.ylabel('Cohort Listening (Retention)',fontsize='large');
plt.xlabel('week', fontsize='large')
plt.show()

In [None]:
fig.savefig('retention_new_data.png',dpi=150)

In [None]:
sns.set_style('whitegrid')
# One figure only (the extra plt.figure() call used to leave an empty figure).
fig, ax = plt.subplots(figsize=(14,8))
frames = {'album':ret_album, 'playlist':ret_playlist}
ax.set_prop_cycle('color',plt.cm.tab10(np.linspace(0,1,9)))
lines = ['-', '--']
# Same comparison as for cohort '16', here for cohort '15'.
for (k, v), style in zip(frames.items(), lines):
    ax.plot(v[['15']],label=k,linestyle=style)

legend = ax.legend(loc=1, shadow=True, fontsize='large')

legend.get_frame().set_facecolor('#00FFCC')

plt.title('Retention Rate by First Stream Source', fontsize='large')
plt.xticks(np.arange(1, 7.1, 1))
plt.xlim(1, 7)
plt.ylim(0, 1)
# Pin the tick positions before relabelling them, so the percentage labels
# cannot drift if matplotlib chooses different automatic ticks.
ax.set_yticks(np.linspace(0, 1, 6))
ax.set_yticklabels(['0', '20%', '40%', '60%', '80%', '100%'])
plt.ylabel('Cohort Listening (Retention)',fontsize='large');
plt.xlabel('week', fontsize='large')
plt.show()

### Retention on discovery channels: how likely are they to keep listening to the album/playlist?

In [None]:
# NOTE(review): this pickle is written further down in this notebook (the
# `df.to_pickle('data/playlist_album_customers_df.pickle')` cell), so on a
# fresh kernel this cell only works if the file is left over from an earlier
# run -- confirm the intended cell order.
df = pd.read_pickle('data/playlist_album_customers_df.pickle')

In [None]:
df.columns

In [None]:
# use only track population
df_track = df[df.track_id==TRACK_ID]

In [None]:
# get album & playlist customer
album_cust = df_track[df_track.first_stream_source=='album'].customer_id.unique()
pl_cust = df_track[df_track.first_stream_source=='others_playlist'].customer_id.unique()

In [None]:
# Retention inside the discovery channel, separately for customers who found
# the track via an album and via someone else's playlist.
# group_keys=False keeps the (cohort_group, week) index intact; on
# pandas >= 2.0 the default would prepend the group key as a duplicate index
# level and get_retention's reset_index() would fail.
cohort_album = get_cohort_table(df[df.customer_id.isin(album_cust)],mode='week')
cohort_album = cohort_album.groupby(level=0, group_keys=False).apply(cohort_period)
ret_album = get_retention(cohort_album)

cohort_pl = get_cohort_table(df[df.customer_id.isin(pl_cust)],mode='week')
cohort_pl = cohort_pl.groupby(level=0, group_keys=False).apply(cohort_period)
ret_pl = get_retention(cohort_pl)

In [None]:
sns.set_style('whitegrid')
# One figure only (the extra plt.figure() call used to leave an empty figure).
fig, ax = plt.subplots(figsize=(14,8))
frames = {'album':ret_album, 'playlist':ret_pl}
ax.set_prop_cycle('color',plt.cm.tab10(np.linspace(0,1,9)))
lines = ['-', '--']
# Cohort '16' is the one with the largest user base.
for (k, v), style in zip(frames.items(), lines):
    ax.plot(v[['16']],label=k,linestyle=style)

legend = ax.legend(loc=1, shadow=True, fontsize='large')

legend.get_frame().set_facecolor('#00FFCC')

plt.title('Discovery Channels Retention Rate by First Stream Source', fontsize='large')
plt.xticks(np.arange(1, 7.1, 1))
plt.xlim(1, 7)
plt.ylim(0, 1)
# Pin the tick positions before relabelling them, so the percentage labels
# cannot drift if matplotlib chooses different automatic ticks.
ax.set_yticks(np.linspace(0, 1, 6))
ax.set_yticklabels(['0', '20%', '40%', '60%', '80%', '100%'])
plt.ylabel('Cohort Listening (Retention)',fontsize='large');
plt.xlabel('week', fontsize='large')
plt.show()

In [None]:
fig.savefig('retention_discovery_channels.png',dpi=150)

In [None]:
import gc
gc.collect()

### Average Stream Count - Do they stream more than other cohorts?

#### Only "See You Again"

In [None]:
TRACK_ID = 'f72fa60c8d9848a393d8ac4bbaa866ef'

In [None]:
# use only track population
df = df[df.track_id==TRACK_ID]

In [None]:
# calculate avg stream per customers -> total_streams / total_customers
cohort['customer_avg_stream'] = round(cohort.total_stream_count.divide(cohort.total_customers),2)
cohort.head()

In [None]:
# get the chosen cohort which is week 16
cohort_16 = cohort[cohort.index.get_level_values(0)=='16']
cohort_16

In [None]:
df_album = df[df.first_stream_source=='album']
df_playlist = df[df.first_stream_source=='others_playlist']

In [None]:
ret_album,cohort_album = cohort_analysis(df_album,show_plot=False,mode='week')
ret_playlist,cohort_playlist = cohort_analysis(df_playlist,show_plot=False,mode='week')

In [None]:
cohort_album['customer_avg_stream'] = round(cohort_album.total_stream_count.divide(cohort_album.total_customers),2)
cohort_playlist['customer_avg_stream'] = round(cohort_playlist.total_stream_count.divide(cohort_playlist.total_customers),2)

In [None]:
cohort_16['avg_stream_album'] = cohort_album[cohort_album.index.get_level_values(0)=='16']['customer_avg_stream']
cohort_16['avg_stream_playlist'] = cohort_playlist[cohort_playlist.index.get_level_values(0)=='16']['customer_avg_stream']
cohort_16

In [None]:
cohort_desc = cohort_16.reset_index()
cohort_desc

In [None]:
cohort_desc = cohort_desc.sort_values(by='week',ascending=False)
cohort_desc

In [None]:
# set yticks for graph positioning: one slot per row, so this keeps working
# when the cohort has a different number of periods than 7
cohort_desc['yaxis_pos'] = range(1, len(cohort_desc) + 1)
cohort_desc

In [None]:
sns.set_style('darkgrid')

# One figure only (the extra plt.figure() call used to leave an empty figure).
fig, ax = plt.subplots(figsize=(14,8))

my_range=cohort_desc.yaxis_pos

# Lollipop chart: a grey segment joins the playlist and album averages for
# each period, with a marker at either end.
plt.hlines(y=my_range, xmin=cohort_desc['avg_stream_playlist'], xmax=cohort_desc['avg_stream_album'], color='grey', alpha=0.6)
plt.scatter(cohort_desc['avg_stream_playlist'], my_range, color='darkblue', alpha=0.8, label='playlist cohort',marker="^", s=100)
plt.scatter(cohort_desc['avg_stream_album'], my_range, color='green', alpha=0.8 , label='album cohort',s=100)
plt.legend(fontsize='large', loc=2)

plt.yticks(my_range, cohort_desc.cohort_period,fontsize='medium')
plt.title('Comparison of Cohort Average Stream Count for "See You Again"', loc='center', fontsize='large')
plt.xlabel('average streaming per customer', fontsize='medium')
plt.ylabel('Period (week)', fontsize='medium')
plt.xticks(np.arange(0,6.1,1),fontsize='medium');

In [None]:
# save
fig.savefig('avg_stream_see_you_again.png',dpi=150)

#### all songs (discovery channels)

In [None]:
df = pd.read_pickle('/project/exp_output/exp_3007.pickle')
df.head()

In [None]:
gc.collect()

In [None]:
# reload full data
df = pd.read_pickle('/project/samples/new/sample_mixed_105k_cleaned2.pickle')

In [None]:
# create separate df for track streams
df_track = df[df.track_id==TRACK_ID]

In [None]:
# get customer population for album & playlist cohort
in_scope = df_track[df_track.first_stream_source.isin(['album','others_playlist'])][['customer_id','album_name','playlist_id']]
in_scope.head()

In [None]:
# get the customers
df = df[df.customer_id.isin(in_scope.customer_id)]
# limit to albums & playlists population related to see you again track
df = df[(df.album_name.isin(in_scope.album_name)) | (df.playlist_id.isin(in_scope.playlist_id))]

In [None]:
# get album & playlist customer
album_cust = df_track[df_track.first_stream_source=='album'].customer_id.unique()
pl_cust = df_track[df_track.first_stream_source=='others_playlist'].customer_id.unique()

In [None]:
# To get the correct cohort group, derive it from the track-only frame
# (df_track) and later merge it into the main df on customer_id.
# Index by customer so the per-customer minimum aligns back onto every row.
df_track['id'] = df_track['customer_id']
df_track = df_track.set_index('id')
# Per-customer minimum of week and logtime, stored as first_listen and
# first_listen_logtime.
df_track[['first_listen','first_listen_logtime']] = df_track.groupby(level=0)[['week','logtime']].min()
# The cohort group is the first-listen week, stored as a string.
df_track['cohort_group'] = df_track['first_listen']
df_track['cohort_group'] = df_track['cohort_group'].astype('str')
df_track = df_track.reset_index()

In [None]:
df_track = df_track[['customer_id','cohort_group','first_listen_logtime','album_name','playlist_id','first_stream_source']]
df_track = df_track.drop_duplicates(subset='customer_id')

In [None]:
df_track.rename(columns={'album_name':'track_album', 'playlist_id':'track_playlist'},inplace=True)
df_track.head()

In [None]:
df_track[['track_playlist']] = df_track.apply(lambda row: clean_for_source_pl(row),axis=1)

In [None]:
df_track[['track_album']] = df_track.apply(lambda row: clean_for_source(row),axis=1)

In [None]:
gc.collect()

In [None]:
df_track = df_track[df_track.cohort_group.isin(['15','16','17'])]

In [None]:
df_track

In [None]:
df = df[df.customer_id.isin(df_track.customer_id)]

In [None]:
df = df.merge(df_track, on=['customer_id'],how='left')
df.head()

In [None]:
df = df.drop(['first_stream_source_y'],axis=1)
df.rename(columns={'first_stream_source_x':'first_stream_source'},inplace=True)

In [None]:
# drop streaming prior to "see you again" discovery
df = df[df.logtime >= df.first_listen_logtime]
# drop streaming outside of the customer's own discovery playlists & albums
df = df[(df.playlist_id==df.track_playlist) | (df.album_name==df.track_album)]

In [None]:
df.head()

In [None]:
df.to_pickle('data/playlist_album_customers_df.pickle')

In [None]:
df= pd.read_pickle('data/playlist_album_customers_df.pickle')

In [None]:
gc.collect()

In [None]:
df = df[df.cohort_group=='16']

In [None]:
# get cohort
# Cohort table for customers who discovered the track via an album.
# group_keys=False keeps the (cohort_group, week) index intact; on
# pandas >= 2.0 the default would prepend the group key as a duplicate level.
cohort_album = get_cohort_table(df[df.customer_id.isin(album_cust)],mode='week')
cohort_album = cohort_album.groupby(level=0, group_keys=False).apply(cohort_period)

# Cohort table for customers who discovered the track via someone else's playlist.
cohort_playlist = get_cohort_table(df[df.customer_id.isin(pl_cust)],mode='week')
cohort_playlist = cohort_playlist.groupby(level=0, group_keys=False).apply(cohort_period)

Dropping pre-discovery streams

In [None]:
# get after week 16
df = df[df.week>=16]

In [None]:
df = df.drop(['weekly_stream_count'],axis=1)

In [None]:
gc.collect()

In [None]:
cohort_album['customer_avg_stream'] = round(cohort_album.total_stream_count.divide(cohort_album.total_customers),2)
cohort_playlist['customer_avg_stream'] = round(cohort_playlist.total_stream_count.divide(cohort_playlist.total_customers),2)

In [None]:
cohort_album

In [None]:
cohort_playlist

In [None]:
cohort_album.customer_avg_stream.min(),cohort_album.customer_avg_stream.max()

In [None]:
def get_stream_comparison_chart(cohort_album,cohort_playlist,cohort_group='16'):
    '''
    Build the chart data for one cohort and draw the album-vs-playlist
    lollipop chart.

    Parameters
    ----------
    cohort_album, cohort_playlist : pandas.DataFrame
        Cohort tables carrying ``cohort_period`` and ``customer_avg_stream``.
    cohort_group : str
        Cohort to chart.

    Returns
    -------
    The figure returned by ``draw_lollipop_chart`` (previously nothing was
    returned, so ``fig = get_stream_comparison_chart(...)`` yielded None).
    '''
    chart_df = get_chart_df(cohort_album,cohort_playlist,cohort_group=cohort_group)
    return draw_lollipop_chart(chart_df)

def get_chart_df(cohort_album,cohort_playlist,cohort_group='16'):
    '''
    Assemble the plotting frame for one cohort group.

    Takes the album cohort's rows for ``cohort_group`` as the base
    (``cohort_period`` and ``customer_avg_stream`` -> ``avg_stream_album``),
    aligns the playlist cohort's ``customer_avg_stream`` onto them as
    ``avg_stream_playlist``, flattens the index, orders rows by ``week``
    descending and adds ``yaxis_pos`` = 1..n for y-axis placement.
    '''
    in_album = cohort_album.index.get_level_values(0) == cohort_group
    in_playlist = cohort_playlist.index.get_level_values(0) == cohort_group
    album_rows = cohort_album[in_album]
    playlist_rows = cohort_playlist[in_playlist]

    # The album rows define the index; playlist values are aligned onto it.
    chart_df = album_rows[['cohort_period', 'customer_avg_stream']].rename(
        columns={'customer_avg_stream': 'avg_stream_album'})
    chart_df['avg_stream_playlist'] = playlist_rows['customer_avg_stream']

    # Latest week first, so it receives the lowest y position.
    chart_df = chart_df.reset_index().sort_values(by='week', ascending=False)
    # Sequential positions used as y ticks.
    chart_df['yaxis_pos'] = range(1, len(chart_df) + 1)
    return chart_df
    
    
def draw_lollipop_chart(cohort_desc):
    '''
    Draw a lollipop chart comparing average streams per customer between the
    playlist and album cohorts, one row per cohort period.

    ``cohort_desc`` must provide ``yaxis_pos``, ``cohort_period``,
    ``avg_stream_playlist`` and ``avg_stream_album``. The figure is shown and
    then returned.
    '''
    sns.set_style('darkgrid')
    fig, ax = plt.subplots(figsize=(14,8))

    y_pos = cohort_desc.yaxis_pos
    playlist_avg = cohort_desc['avg_stream_playlist']
    album_avg = cohort_desc['avg_stream_album']

    # Grey stem between the two averages, then one marker per cohort.
    plt.hlines(y=y_pos, xmin=playlist_avg, xmax=album_avg, color='grey', alpha=0.6)
    plt.scatter(playlist_avg, y_pos, color='darkblue', alpha=0.8, label='playlist cohort',marker="^", s=100)
    plt.scatter(album_avg, y_pos, color='green', alpha=0.8 , label='album cohort',s=100)
    plt.legend(fontsize='large', loc=2)

    # Label each y position with its cohort period.
    plt.yticks(y_pos, cohort_desc.cohort_period,fontsize='medium')
    plt.title("Comparison of Cohort Average Stream Count Within Discovery Channels", loc='center', fontsize=13)
    plt.xlabel('average streaming per customer', fontsize='large')
    plt.ylabel('Period (week)', fontsize='large')
    plt.xticks(np.arange(0,20.1,4),fontsize='medium')

    plt.show()
    return fig

In [None]:
fig = get_stream_comparison_chart(cohort_album,cohort_playlist,cohort_group='16')

In [None]:
# NOTE(review): cohort_album / cohort_playlist were built after `df` was
# narrowed to cohort_group == '16', so cohort '17' looks to have no rows here
# and this chart would come out empty -- confirm.
get_stream_comparison_chart(cohort_album,cohort_playlist,cohort_group='17')