### Problématique : Transformation des données en séries temporelles, comment les utiliser ?
Objectif : On veut essayer de différencier les utilisateurs en utilisant la série temporelle liée aux événements générés dans l'application, via :
* un clustering de séries temporelles
* une utilisation de l’information comme une “anomaly detection” dans le futur dans une IOT
> l'anomalie correspondrait à une prise de souscription

### Import des librairies

In [None]:
import pandas as pd
import numpy as np

In [None]:
import os

def read_files(folder_path):
    """Load every regular file found directly in ``folder_path`` as parquet.

    Directory entries that are not regular files (e.g. sub-directories) are
    skipped. Files are read in sorted filename order and each filename is
    printed just before it is read.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        A list with one dataframe per file, in sorted filename order. The
        list is empty when the folder contains no regular files.
    """
    # Pair each entry with its full path once, so the path is not rebuilt
    # a second time when the file is actually read.
    entries = sorted(
        (name, os.path.join(folder_path, name))
        for name in os.listdir(folder_path)
    )
    dataframes = []
    for name, file_path in entries:
        if not os.path.isfile(file_path):
            continue
        print(name)
        dataframes.append(pd.read_parquet(file_path))
    return dataframes


def merge_dataframes(dataframes):
    """Left-join a list of dataframes on ``amplitude_id``, ten at a time.

    A list of at most ten dataframes is merged sequentially onto its first
    element. A longer list has its length printed, is cut into consecutive
    batches of ten, each batch is merged recursively, and the per-batch
    results are then merged the same way.

    Args:
        dataframes: Non-empty list of dataframes that each have an
            ``amplitude_id`` column.

    Returns:
        The merged dataframe. For a single-element list this is that
        element itself.

    Raises:
        IndexError: If ``dataframes`` is empty.
    """
    total = len(dataframes)
    if total > 10:
        print(total)
        batches = [
            merge_dataframes(dataframes[start:start + 10])
            for start in range(0, total, 10)
        ]
        return merge_dataframes(batches)

    merged = dataframes[0]
    for frame in dataframes[1:]:
        merged = merged.merge(frame, how='left', on='amplitude_id')
    return merged

# Agrégation : max 

### Dictionnaire expliquant la préparation de la dataframe

In [None]:
# Changed and ordered the events putting payment events as 6 and cancellation as 7
# Maps each application event name to an integer funnel level (1-7).
# Levels 1-5 presumably follow the legend written next to the later
# redefinition of this dict in this file (1 = amplitude automatic events,
# 2 = onboarding, 3 = usage of the app, 4 = important actions,
# 5 = payment actions) -- TODO confirm the legend applies to this version too.
events_funnel = {
    # Level 1
    'first_app_open':1,
    'app_open':1,
    'session_start':1,
    'session_end':1,
    # Level 2
    'onboarding_start':2,
    'onboarding_page_view':2,
    'onboarding_tap_continue':2,
    'onboarding_finish':2,
    # Level 3
    'page_view':3,
    'ac_click':3,
    'ac_content_click':3,
    'ac_content_routine_cancel':3,
    'ac_sound_off':3,
    'ac_sound_on':3,
    'ac_timer_delay_set':3,
    'play_sound_fail':3,
    'ac_reset_statistics':3,
    'payment_restore_fail':3,
    'Tap_share_instagram_fail':3,
    'Tap_share_instagram_success':3,
    'Tap_share_twitter_success':3,
    'na_att_deactivate':3,
    'ac_daily_reminder_deactivate':3,
    'na_notification_deactivate':3,
    # Level 4
    'ac_daily_reminder_activate':4,
    'na_notification_activate':4,
    'ac_content_routine_play':4,
    'na_att_activate':4,
    'ac_content_routine_start':4,
    'ac_content_routine_pause':4,
    'ac_content_routine_finish':4,
    'ac_content_exercise_view_instructions':4,
    'ac_content_custom_routine_create_start':4,
    'ac_content_exercise_skip':4,
    'ac_content_exercise_previous':4,
    'ac_content_custom_routine_create_finish':4,
    'ac_modal_next':4,
    'ac_app_icon_change':4,
    # Level 5
    'rc_trial_cancelled_event':5,
    'rc_uncancellation_event':5,
    'page_view_paywall':5,
    'payment_start':5,
    'payment_failure':5,
    'payment_cancel':5,
    # Level 6: payment events (see the comment above the dict)
    'rc_trial_started_event':6,
    'rc_initial_purchase_event':6,
    'payment_finish':6,
    'payment_restore':6,
    'rc_renewal_event':6,
    'rc_non_subscription_purchase_event':6,
    'rc_trial_converted_event':6,
    # Level 7: cancellation (see the comment above the dict)
    'rc_cancellation_event':7}


### dataframe by minutes

In [None]:
# Load the per-minute "agg_max" parquet file with the pyarrow engine.
df_minutes = pd.read_parquet('app_timeserie_min_agg_max_data_20230130.parquet', engine='pyarrow')
# NOTE(review): head() is not the last expression of this cell, so a notebook
# would not display its result -- confirm whether a display/print was intended.
df_minutes.head()
# Print the shape of the loaded dataframe.
print("Taille du dataframe : ", df_minutes.shape)

### dataframe by hours

In [None]:
# Load the hourly "agg_max" parquet file with the pyarrow engine.
df_hours = pd.read_parquet('app_timeserie_hourly_agg_max_data_20230130.parquet', engine='pyarrow')
# NOTE(review): head() is not the last expression of this cell, so a notebook
# would not display its result -- confirm whether a display/print was intended.
df_hours.head()
# Print the shape of the loaded dataframe.
print("Taille du dataframe : ", df_hours.shape)

### dataframe by seconds

In [None]:
# Build the per-second dataframe from the 'data_sec_timeframe_agg_max' folder.
# NOTE(review): concatenate_files is not defined in the visible part of this
# file (only read_files and merge_dataframes are) -- confirm where it comes
# from, otherwise this line raises NameError.
df_seconds = concatenate_files('data_sec_timeframe_agg_max')
# NOTE(review): head() is not the last expression of this cell, so a notebook
# would not display its result -- confirm whether a display/print was intended.
df_seconds.head()
# Print the shape of the resulting dataframe.
print("Taille du dataframe : ", df_seconds.shape)

# Agrégation : sum 

### Dictionnaire expliquant la préparation de la dataframe

In [None]:
# By funnel
# 1 => amplitude automatic events
# 2 => onboarding
# 3 => usage of the app
# 4 => important actions in the app
# 5 => payment actions

# Maps each application event name to an integer funnel level (1-5).
# This assignment rebinds `events_funnel`, replacing the 7-level mapping
# defined earlier in this file: here the purchase/trial/renewal events and
# 'rc_cancellation_event' are all level 5 instead of 6 and 7.
events_funnel = {
    'session_start':1,
    'onboarding_start':2,
    'onboarding_page_view':2,
    'first_app_open':1,
    'onboarding_tap_continue':2,
    'page_view':3,
    'ac_click':3,
    'na_att_deactivate':3,
    'ac_daily_reminder_activate':4,
    'na_notification_activate':4,
    'onboarding_finish':2,
    'page_view_paywall':5,
    'payment_start':5,
    'payment_failure':5,
    'payment_cancel':5,
    'ac_daily_reminder_deactivate':3,
    'na_notification_deactivate':3,
    'ac_content_click':3,
    'ac_content_routine_play':4,
    'session_end':1,
    'na_att_activate':4,
    'app_open':1,
    'ac_content_routine_cancel':3,
    'ac_content_routine_start':4,
    'ac_app_icon_change':4,
    'rc_initial_purchase_event':5,
    'payment_finish':5,
    'ac_content_routine_finish':4,
    'ac_content_routine_pause':4,
    'ac_content_exercise_view_instructions':4,
    'rc_trial_started_event':5,
    'rc_cancellation_event':5,
    'ac_sound_off':3,
    'ac_sound_on':3,
    'ac_timer_delay_set':3,
    'ac_content_custom_routine_create_start':4,
    'ac_content_exercise_previous':4,
    'ac_content_exercise_skip':4,
    'payment_restore':5,
    'ac_content_custom_routine_create_finish':4,
    'ac_modal_next':4,
    'rc_renewal_event':5,
    'rc_trial_cancelled_event':5,
    'play_sound_fail':3,
    'ac_reset_statistics':3,
    # NOTE(review): a failed restore is level 3 while 'payment_restore' is
    # level 5 -- confirm this is intentional.
    'payment_restore_fail':3,
    'Tap_share_instagram_fail':3,
    'rc_non_subscription_purchase_event':5,
    'rc_trial_converted_event':5,
    'rc_uncancellation_event':5,
    'Tap_share_instagram_success':3,
    'Tap_share_twitter_success':3
}

### dataframe by minutes

In [None]:
# Load the per-minute parquet file (the variant without "agg_max" in its name)
# with the pyarrow engine. This rebinds df_minutes, replacing the dataframe
# loaded from the "agg_max" file earlier in this file.
df_minutes = pd.read_parquet('app_timeserie_min_data_20230130.parquet', engine='pyarrow')
# NOTE(review): head() is not the last expression of this cell, so a notebook
# would not display its result -- confirm whether a display/print was intended.
df_minutes.head()
# Print the shape of the loaded dataframe.
print("Taille du dataframe : ", df_minutes.shape)

### dataframe by hours

In [None]:
# Load the hourly parquet file (the variant without "agg_max" in its name)
# with the pyarrow engine. This rebinds df_hours, replacing the dataframe
# loaded from the "agg_max" file earlier in this file.
df_hours = pd.read_parquet('app_timeserie_hourly_data_20230130.parquet', engine='pyarrow')
# NOTE(review): head() is not the last expression of this cell, so a notebook
# would not display its result -- confirm whether a display/print was intended.
df_hours.head()
# Print the shape of the loaded dataframe.
print("Taille du dataframe : ", df_hours.shape)

### dataframe by seconds

In [None]:
# Build df_seconds from the 'data_by_minutes' folder. This rebinds df_seconds.
# NOTE(review): concatenate_files is not defined in the visible part of this
# file (only read_files and merge_dataframes are) -- confirm where it comes
# from, otherwise this line raises NameError.
# NOTE(review): the folder is named 'data_by_minutes' but the result is stored
# in df_seconds under a "by seconds" heading -- confirm the folder name.
df_seconds = concatenate_files('data_by_minutes')
# NOTE(review): head() is not the last expression of this cell, so a notebook
# would not display its result -- confirm whether a display/print was intended.
df_seconds.head()
# Print the shape of the resulting dataframe.
print("Taille du dataframe : ", df_seconds.shape)