In [1]:
from leading_indicator_v2 import *

# Load credentials from a .env file. The location can be overridden with the
# DOTENV_PATH environment variable so the notebook is not tied to one user's
# home directory; the original path is kept as the default.
load_dotenv(os.getenv('DOTENV_PATH', '/Users/peter/.env'))

# Establish the Snowflake connection. Every credential is read from the
# environment; os.getenv returns None for a variable that is not set.
connection = connector.connect(
    user=os.getenv("SNOWFLAKE_USERNAME"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database='DISCO_CORE',
)

In [2]:
# Settings passed identically to both training calls below.
fit_options = dict(objective='rpl', random_sample=True, training_size=0.7)

# Ingest through the open Snowflake connection, then scale.
df = ingest_clean_data(connection)
scaled_df = scale_data(df)

# Split on the trailing 14 days (presumably the rows whose outcomes are not
# fully attributed yet -- see remove_last_n_days for the exact rule).
unattributed_days, attributed_days = remove_last_n_days(scaled_df, 14)

# Fit the models on the attributed rows.
lr_rf_results, linear_reg, best_rf_model = find_model_performance(
    attributed_days,
    **fit_options,
)
xgb_results, xgb_model = train_xg_model(
    attributed_days,
    **fit_options,
)

# Predict for the unattributed rows with all three fitted models.
output = predict_unattributed_days(
    unattributed_days,
    linear_reg,
    best_rf_model,
    xgb_model,
)

# Write the predictions to the output table.
table_name = 'leading_indicator_output'
# create_and_write_db_table(yesterday, table_name)
update_to_db_table(output, table_name)

Ingesting performance data from 2024-04-01
Data successfully ingested.


Unnamed: 0,event_date,bert_boost_classic_brand_displays,contextual_classic_brand_displays,nurture_brand_displays,other_brand_displays,waterfall_brand_displays,bert_boost_classic_zeroday_ad_spend,contextual_classic_zeroday_ad_spend,nurture_zeroday_ad_spend,other_zeroday_ad_spend,waterfall_zeroday_ad_spend,order_count,final_conv_count,final_ad_spend,rpl,day_of_week,day_of_month,rpl_index
68,2024-06-08,163480.0,22044.0,14294.0,32500.0,82742.0,1464.5,49.0,45.0,247.0,209.0,172689,391,6884.5,0.03,5,8,0.853405
69,2024-06-09,153408.0,22225.0,14625.0,27386.0,69867.0,1665.72,10.0,155.0,190.0,498.0,178401,352,5256.4,0.024,6,9,1.324701
70,2024-06-10,209799.0,25563.0,15852.0,36699.0,82838.0,1825.98,32.0,120.0,91.0,168.0,195177,324,5197.08,0.018,0,10,1.209179
71,2024-06-11,252425.0,25354.0,18749.0,41983.0,45370.0,1626.79,110.0,235.0,203.0,105.0,183315,268,4066.21,0.012,1,11,1.179822
72,2024-06-12,267191.0,27731.0,19685.0,40063.0,8251.0,1542.27,76.5,302.0,147.0,43.0,179445,159,2490.77,0.007,2,12,1.079264


### Directionality Output

In [None]:
# NOTE(review): merged_df, X_scaled, rpl_lr_model, rpl_rf_model, optimized_model,
# adspend_lr_model, adspend_rf_model and xgb are not defined in the cells shown
# above; presumably they come from the star import or earlier kernel state --
# confirm this cell survives Restart Kernel -> Run All.

# Actual values the predictions are compared against. rename() is used without
# inplace and the result is copied, so the column assignments below write to an
# independent frame instead of a slice of merged_df (which triggers pandas'
# SettingWithCopyWarning).
direction_df = (
    merged_df[['event_date', 'rpl', 'final_ad_spend']]
    .rename(columns={'final_ad_spend': 'ad_spend'})
    .copy()
)

# NOTE(review): an earlier comment here said "sort by event_date" but no sort was
# ever applied. The predictions below are attached exactly as predict() returns
# them, so presumably merged_df, X_scaled and scaled_df must share row order --
# confirm before introducing a sort.

# Predict RPL from the scaled features and attach each model's output.
direction_df['rpl_lr_pred'] = rpl_lr_model.predict(X_scaled)
direction_df['rpl_rf_pred'] = rpl_rf_model.predict(X_scaled)
xg_data2 = scaled_df.drop(columns=['event_date', 'final_conv_count', 'final_ad_spend', 'conversion_rate', 'rpl'])
direction_df['rpl_xg_pred'] = optimized_model.predict(xgb.DMatrix(xg_data2))

# Predict ad spend from the scaled features and attach each model's output.
direction_df['ad_spend_lr_pred'] = adspend_lr_model.predict(X_scaled)
direction_df['ad_spend_rf_pred'] = adspend_rf_model.predict(X_scaled)

# Drop the last 14 rows (the same window length passed to remove_last_n_days in
# the pipeline cell) so only the earlier days are evaluated.
direction_df = direction_df.iloc[:-14]
direction_df.sample(5)

In [None]:
def evaluate_directional_accuracy(input_df):
    """Summarise how often each prediction column moves in the same direction as its actual.

    For every actual column ('rpl', 'ad_spend') and every column whose name
    contains 'pred' and starts with '<actual>_', the day-over-day change of the
    prediction is compared with the day-over-day change of the actual. A day
    counts as correct when both changes have the same sign or both are exactly
    zero. Days where either change is NaN never count as correct.

    Args:
        input_df: DataFrame with 'rpl' and 'ad_spend' columns plus prediction
            columns named '<actual>_<model>_pred'. It is not modified.

    Returns:
        DataFrame with one row per prediction column and the columns 'Model',
        'Model Date', 'Total Days', 'Correct Directional Days',
        'Directional Accuracy (%)', 'Days with Actual Change',
        'Correct on Change Days' and 'Accuracy on Change Days (%)'.
        Despite the '(%)' labels the two accuracy values are fractions in
        [0, 1] rounded to 3 decimals; each is None when its denominator is zero.
    """
    df = input_df.copy()
    model_columns = [col for col in df.columns if 'pred' in col]
    actual_columns = ['rpl', 'ad_spend']

    # Day-over-day changes; the first row has no predecessor, so it is dropped.
    changes = df[actual_columns + model_columns].diff().iloc[1:]

    model_date = datetime.today().date()
    results = []
    for actual_col in actual_columns:
        actual_change = changes[actual_col]
        # A "change day" is one where the actual really moved. This is based on
        # the diff rather than pct_change: pct_change is NaN for a 0 -> 0 day
        # and NaN != 0 is True, which used to count flat days as change days.
        has_actual_change = actual_change.notna() & (actual_change != 0)
        actual_change_count = has_actual_change.sum()

        # Match on the full '<actual>_' prefix so an unrelated column such as
        # 'ad_rate_pred' is not scored against 'ad_spend'.
        relevant_models = [col for col in model_columns if col.startswith(f'{actual_col}_')]
        for model_col in relevant_models:
            pred_change = changes[model_col]
            # Same sign, or both exactly flat. NaN comparisons evaluate to False.
            matches = (actual_change * pred_change > 0) | ((actual_change == 0) & (pred_change == 0))

            total_count = len(matches)
            correct_count = matches.sum()
            correct_on_change_days = (matches & has_actual_change).sum()

            results.append({
                'Model': model_col,
                'Model Date': model_date,
                'Total Days': total_count,
                'Correct Directional Days': correct_count,
                'Directional Accuracy (%)': round(correct_count / total_count, 3) if total_count != 0 else None,
                'Days with Actual Change': actual_change_count,
                'Correct on Change Days': correct_on_change_days,
                'Accuracy on Change Days (%)': round(correct_on_change_days / actual_change_count, 3) if actual_change_count != 0 else None
            })

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)
    return results_df

# Score every prediction column in direction_df, write the summary to a dated
# CSV, and display it as the cell output.
directionality = evaluate_directional_accuracy(direction_df)
directionality_csv = f'data/output/directionality_{current_date}.csv'
directionality.to_csv(directionality_csv, index=False)
directionality

In [None]:
def identify_directional_inaccuracies(input_df):
    """List the days on which at least one prediction moved against its actual.

    For every actual column ('rpl', 'ad_spend') and every column whose name
    contains 'pred' and starts with '<actual>_', the day-over-day change of the
    prediction is compared with that of the actual. A day is reported when, for
    any such pair, the two changes neither share a sign nor are both exactly
    zero (a NaN change therefore counts as a mismatch).

    Args:
        input_df: DataFrame with 'event_date', 'rpl' and 'ad_spend' columns plus
            prediction columns named '<actual>_<model>_pred'. It is not modified.

    Returns:
        DataFrame with one row per reported day. Columns are 'event_date',
        then for each actual that has prediction columns
        '<actual>_actual_pct_change' followed by '<model>_model_pct_change' for
        each of its prediction columns. Percentage changes are rounded to 3
        decimals and are only filled in for the pairs that mismatched on that
        day; other cells are NaN.
    """
    df = input_df.copy()
    model_columns = [col for col in df.columns if 'pred' in col]
    actual_columns = ['rpl', 'ad_spend']

    # Day-over-day absolute and percentage changes for each column.
    for col in actual_columns + model_columns:
        df[f'{col}_change'] = df[col].diff()
        df[f'{col}_pct_change'] = df[col].pct_change().round(3)

    # Function to determine if directions match
    def direction_matches(actual_change, pred_change):
        return (actual_change * pred_change > 0) or (actual_change == pred_change == 0)

    # Resolve each actual's prediction columns once (not once per row), using
    # the full '<actual>_' prefix so an unrelated column such as 'ad_rate_pred'
    # is not compared against 'ad_spend'.
    models_by_actual = {
        actual_col: [col for col in model_columns if col.startswith(f'{actual_col}_')]
        for actual_col in actual_columns
    }

    # Declare the output columns up front so the schema does not depend on
    # which pairs happen to mismatch.
    output_columns = ['event_date']
    for actual_col, relevant_models in models_by_actual.items():
        if relevant_models:
            output_columns.append(f'{actual_col}_actual_pct_change')
            output_columns.extend(f'{model_col}_model_pct_change' for model_col in relevant_models)

    incorrect_rows = []
    # The first row has no previous day, so its changes are undefined; skip it
    # explicitly rather than dropping the first *reported* row afterwards.
    for _, row in df.iloc[1:].iterrows():
        mismatch_found = False
        result_row = {'event_date': row['event_date']}

        for actual_col, relevant_models in models_by_actual.items():
            for model_col in relevant_models:
                if not direction_matches(row[f'{actual_col}_change'], row[f'{model_col}_change']):
                    mismatch_found = True
                    result_row[f'{actual_col}_actual_pct_change'] = row[f'{actual_col}_pct_change']
                    result_row[f'{model_col}_model_pct_change'] = row[f'{model_col}_pct_change']

        if mismatch_found:
            incorrect_rows.append(result_row)

    # Create a DataFrame from the results
    incorrect_results_df = pd.DataFrame(incorrect_rows, columns=output_columns)
    return incorrect_results_df

# Collect the days with at least one directional miss and write them to a dated CSV.
direction_details = identify_directional_inaccuracies(direction_df)
direction_details_csv = f'data/output/direction_details_{current_date}.csv'
direction_details.to_csv(direction_details_csv, index=False)