In [14]:
import matplotlib.pyplot as pp
import pandas as pd
import numpy as np
import os

from utils.db_utils import df_from_snowflake
from scipy.optimize import curve_fit

# Platform keys handled one at a time by the regression/plotting cell.
platforms = ['android', 'ios']

In [15]:
# Snowflake SQL: event counts from
# stage.reporting_marketing.appsflyer_installs_uninstalls, grouped by
# day / platform (os_name) / app_id / media_source, for rows with
# date >= '2023-07-01' and the two listed app ids.
#   * app_id          - prefixed with 'id' when os_name is 'ios'
#   * regular_installs - distinct appsflyer_id over 'install' events
#   * skad_installs    - number of 'af_skad_install' events (a row count, not a
#                        distinct count)
#   * total_installs   - regular_installs + skad_installs
#   * retargeting / uninstalls - distinct appsflyer_id over those events
#   * cost             - constant 0 (no cost data is read by this query)
query = """
select
        date_trunc('day', date) dt,
        os_name platform,
        iff(os_name='ios', 'id' || app_id, app_id) app_id,
        media_source,
        count(distinct iff(event_name = 'install', appsflyer_id, null))+sum(iff(event_name = 'af_skad_install', 1, 0)) total_installs,
        count(distinct iff(event_name = 'install', appsflyer_id, null)) regular_installs,
        sum(iff(event_name = 'af_skad_install', 1, 0)) skad_installs,
        count(distinct iff(event_name = 'retargeting', appsflyer_id, null)) retargeting,
        count(distinct iff(event_name = 'uninstall', appsflyer_id, null)) uninstalls,
        0 as cost
    from stage.reporting_marketing.appsflyer_installs_uninstalls
    where true 
    and date >= '2023-07-01'
    and app_id in (
        'com.hometogo',
        '1104996296'
    )
    group by 1,2,3,4
    order by 1,2,3,4
"""

In [17]:
# Run the query through the project helper `df_from_snowflake`; later cells read
# the result's DT, PLATFORM, MEDIA_SOURCE and TOTAL_INSTALLS columns.
# NOTE(review): the output saved with this cell is Snowflake ProgrammingError
# 255002 ("Optional dependency: 'pandas' is not installed"), so `data` was not
# created in that run. Presumably the Snowflake connector's pandas extra has to
# be installed in the kernel environment before this cell can succeed - confirm.
data = df_from_snowflake(query)

ProgrammingError: 255002: Optional dependency: 'pandas' is not installed, please see the following link for install instructions: https://docs.snowflake.com/en/user-guide/python-connector-pandas.html#installation

In [None]:
def _daily_installs_by_platform(rows):
    """Sum TOTAL_INSTALLS per calendar day, one column per platform.

    Parameters
    ----------
    rows : pandas.DataFrame
        Subset of `data`; must contain a datetime `DT` column plus `PLATFORM`
        and `TOTAL_INSTALLS`.

    Returns
    -------
    pandas.DataFrame
        Indexed by day (continuous '1D' frequency between the first and last
        date present in `rows`), with one column per PLATFORM value.  Days or
        platforms without rows contribute 0.
    """
    return (
        rows
        # Pick the single column *before* aggregating: summing the whole frame
        # would also "sum" the string/boolean columns only to throw them away.
        .groupby(['DT', 'PLATFORM'])['TOTAL_INSTALLS']
        .sum()
        .unstack()
        .resample('1D')
        .sum()
    )


# Bracket assignment is used because attribute assignment (`data.DT = ...`)
# only updates a column that already exists and otherwise silently creates a
# plain Python attribute.
data['DT'] = pd.to_datetime(data['DT'])

# Media sources counted as paid acquisition.
data['is_paid'] = data['MEDIA_SOURCE'].isin(
    ['Facebook Ads', 'googleadwords_int', 'Apple Search Ads']
)

paid_installs = _daily_installs_by_platform(data.loc[data['is_paid']])
organic_installs = _daily_installs_by_platform(
    data.loc[data['MEDIA_SOURCE'] == 'organic']
)


In [None]:
def func(X, a, b):
    """Straight-line model evaluated at ``X``.

    Parameters
    ----------
    X : scalar or array-like supporting arithmetic
        Independent variable.
    a : float
        Intercept.
    b : float
        Slope.

    Returns
    -------
    ``a + b * X`` (same shape as ``X`` under the usual arithmetic rules).
    """
    intercept, slope = a, b
    return intercept + slope * X

In [None]:
# Regression summary, one row per platform.  It has to be created in this cell:
# the loop below writes into it and no other cell defines it, so on a fresh
# kernel the original code stopped with a NameError.
results = pd.DataFrame(
    index=platforms,
    columns=['r2', 'Intercept', 'Coefficient'],
    dtype=float,
)

# squeeze=False keeps `ax` two-dimensional even if `platforms` has one entry,
# so the `ax[n, col]` indexing below always works.
_, ax = pp.subplots(nrows=len(platforms), ncols=2, sharex='col', squeeze=False)
for n, platform in enumerate(platforms):
    # Pair the two series on their common dates so X[i] and y[i] always refer to
    # the same day (and have equal length, which curve_fit requires), then drop
    # the final row as the original did - presumably because the most recent
    # day is still incomplete; confirm.
    paired = pd.concat(
        {'paid': paid_installs[platform], 'organic': organic_installs[platform]},
        axis=1,
        join='inner',
    ).iloc[:-1]
    X = paired['paid'].to_numpy(dtype=float)
    y = paired['organic'].to_numpy(dtype=float)

    # Left panel: both daily series against day number (blue = paid, red = organic).
    ax[n, 0].plot(X, color='blue')
    ax[n, 0].plot(y, color='red')
    ax[n, 0].set(title=platform)

    ## Fit regression line: organic = Intercept + Coefficient * paid
    popt, pcov = curve_fit(
        func,
        X,
        y,
        p0=[0, 0],
    )
    x_ = np.linspace(X.min(), X.max())
    organic_predicted = func(x_, *popt)

    ## plot data points
    ax[n, 1].scatter(x=X, y=y, color='C1', alpha=0.4)
    ## plot regression line
    ax[n, 1].plot(x_, organic_predicted, linestyle='--')

    # For a least-squares straight line with an intercept, R^2 is the square of
    # the Pearson correlation.  The previous `np.corrcoef([X, y]).min()` stored
    # r itself (possibly negative) under the 'r2' label.
    pearson_r = np.corrcoef(X, y)[0, 1]
    results.loc[platform, 'r2'] = pearson_r ** 2
    results.loc[platform, ['Intercept', 'Coefficient']] = popt

print(results)