# In [None]:  (Jupyter notebook cell marker kept as a comment so the file parses as Python)
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def transfer_entropy_granger(source, target, lag):
    """Estimate the transfer entropy from ``source`` to ``target`` at ``lag``.

    The series are discretised on a shared equal-width grid and the plug-in
    estimator

        TE = sum p(f, p, s) * log2( p(f | p, s) / p(f | p) )

    is evaluated, where ``f`` is the target ``lag`` steps ahead, ``p`` is the
    target's current value and ``s`` is the source's current value.  This is
    the reduction in uncertainty about the target's future obtained by adding
    the source's past to the target's own past.

    Args:
        source: 1-D array-like of the candidate cause.
        target: 1-D array-like of the candidate effect, same length as
            ``source``.
        lag: Positive number of steps separating past from future.

    Returns:
        Non-negative float, in bits.  ``0.0`` is returned when no aligned
        finite samples exist or when all samples share a single value.

    Raises:
        ValueError: If ``lag`` is smaller than 1 or the inputs are not 1-D
            arrays of equal length.
    """
    if lag < 1:
        raise ValueError("lag must be a positive integer")
    source = np.asarray(source, dtype=float)
    target = np.asarray(target, dtype=float)
    if source.ndim != 1 or source.shape != target.shape:
        raise ValueError("source and target must be 1-D arrays of equal length")

    T = len(source)
    if T <= lag:
        # Not a single (past, future) pair can be formed.
        return 0.0

    # Bin count grows with the sample size but is clamped to [5, 20] so that
    # short series still give a usable histogram.
    bins = max(5, min(int(np.sqrt(T / 5)), 20))

    # Source is always taken from the past, following Granger (1980).
    source_past = source[:-lag]
    past = target[:-lag]
    future = target[lag:]

    # Keep only time steps where all three aligned values are finite; NaN or
    # inf would otherwise corrupt the bin edges.
    finite = np.isfinite(source_past) & np.isfinite(past) & np.isfinite(future)
    if not finite.all():
        source_past = source_past[finite]
        past = past[finite]
        future = future[finite]
    if future.size == 0:
        return 0.0

    min_val = min(source_past.min(), past.min(), future.min())
    max_val = max(source_past.max(), past.max(), future.max())
    if max_val <= min_val:
        # Every sample is identical: nothing to learn from any variable.
        return 0.0
    edges = np.linspace(min_val, max_val, bins + 1)

    # Joint distribution over (future, past, source).  Every other
    # distribution is derived from it by marginalisation so that all
    # probabilities share the same normalisation.
    joint_counts, _ = np.histogramdd(
        np.column_stack([future, past, source_past]),
        bins=[edges, edges, edges],
    )
    p_future_past_source = joint_counts / joint_counts.sum()
    p_future_past = p_future_past_source.sum(axis=2)
    p_past_source = p_future_past_source.sum(axis=0)
    p_past = p_future_past.sum(axis=0)

    # Only occupied cells contribute; their marginals are necessarily > 0,
    # so the ratio below is always well defined.
    i, j, k = np.nonzero(p_future_past_source)
    p_cell = p_future_past_source[i, j, k]
    # p(f | p, s) / p(f | p) written with joint/marginal probabilities.
    ratio = (p_cell * p_past[j]) / (p_future_past[i, j] * p_past_source[j, k])
    transfer_entropy = float(np.sum(p_cell * np.log2(ratio)))

    # The exact quantity is non-negative; clip tiny negative rounding noise.
    return max(0.0, transfer_entropy)
# This function used fixed hyperparameters for reproducibility against timegraph; they may be changed to improve results.
# The main instrument for improving causality detection is the threshold, so the basic parameters work fine.
def entropy_enhanced_random_forest(
    df_path,
    max_lag=2,
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    variables=None,
):
    """Infer a lagged causal graph from random-forest importance and transfer entropy.

    For every ordered pair of variables the transfer entropy is computed at
    each lag in ``1..max_lag``.  Each variable is then regressed on the lagged
    values of all *other* variables with a random forest, and the feature
    importances are blended with the normalised transfer entropy
    (70 % importance, 30 % entropy).  A link ``i -> j`` is reported when the
    pair passes the thresholds derived from those scores.

    Args:
        df_path: Path (or buffer) readable by ``pandas.read_csv``, or an
            already loaded ``pandas.DataFrame``.
        max_lag: Largest lag (in rows) to test; must be at least 1.
        n_estimators, max_depth, min_samples_split, min_samples_leaf,
        max_features, random_state: Forwarded to ``RandomForestRegressor``.
        variables: Optional explicit list of column names to analyse.  When
            omitted, columns whose name starts with ``'X'`` are used; if there
            are none, every column is used in its existing order.

    Returns:
        Tuple ``(causality_matrix, predicted_lags)`` of integer arrays with
        shape ``(n_vars, n_vars)``.  ``causality_matrix[i, j] == 1`` flags
        variable ``i`` as a cause of variable ``j``; ``predicted_lags[i, j]``
        holds the 1-based lag with the highest combined score for flagged
        pairs and 0 elsewhere.

    Raises:
        ValueError: If ``max_lag`` is smaller than 1, or no complete rows
            remain once the lagged columns have been added.
    """
    if max_lag < 1:
        raise ValueError("max_lag must be at least 1")

    # Accept an in-memory frame as well as a CSV location.
    if isinstance(df_path, pd.DataFrame):
        df = df_path
    else:
        df = pd.read_csv(df_path)
    # Initially just removing time worked for tests, but confounders are not removed by this
    #df = df.drop('time', axis=1)

    if variables is None:
        # Default convention: variables are the columns named X*.  Fall back
        # to every column when the data does not follow that convention.
        variables = [col for col in df.columns if str(col).startswith('X')]
        if not variables:
            variables = list(df.columns)
    else:
        variables = list(variables)
    n_vars = len(variables)

    if n_vars < 2:
        # No ordered pair of distinct variables exists, so there is nothing to test.
        empty = np.zeros((n_vars, n_vars), dtype=int)
        return empty, empty.copy()

    # One matrix for importances, one for the blended per-lag score and one
    # for the raw transfer entropy of every (source, target, lag).
    random_forest_importance_matrix = np.zeros((n_vars, n_vars))
    lag_matrix_importance_entropy_transfer = np.zeros((n_vars, n_vars, max_lag))
    entropy = np.zeros((n_vars, n_vars, max_lag))
    for source_index, source in enumerate(variables):
        for target_index, target in enumerate(variables):
            if source_index != target_index:
                for lag in range(1, max_lag + 1):
                    te = transfer_entropy_granger(
                        df[source].values,
                        df[target].values,
                        lag,
                    )
                    entropy[source_index, target_index, lag - 1] = te

    lagged_data = {}
    for var in variables:
        for lag in range(1, max_lag + 1):
            lagged_data[f'{var}_lag{lag}'] = df[var].shift(lag)

    df_full = pd.concat([df, pd.DataFrame(lagged_data)], axis=1)
    df_full = df_full.dropna()
    if df_full.empty:
        raise ValueError(
            "no complete rows left after adding lagged columns; "
            "the series is too short for the requested max_lag"
        )

    # The entropy tensor is final at this point, so its normalising constant
    # is computed once instead of once per feature.
    entropy_scale = np.max(entropy) + 0.0000001

    combined_importance_matrix = np.zeros((n_vars, n_vars))
    for target_index, target in enumerate(variables):
        features = []
        pred_lags = {}

        for source_index, source in enumerate(variables):
            if source_index != target_index:
                for lag in range(1, max_lag + 1):
                    col_name = f'{source}_lag{lag}'
                    features.append(col_name)
                    pred_lags[col_name] = (source_index, lag - 1)

        X = df_full[features].values
        y = df_full[target].values
        random_forest = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=random_state,
            n_jobs=-1
        )

        random_forest.fit(X, y)
        random_forest_importance = random_forest.feature_importances_

        for index, col in enumerate(features):
            source_index, lag_index = pred_lags[col]
            base = random_forest_importance[index]
            transfer_entropy_value = entropy[source_index, target_index, lag_index]
            transfer_entropy_normalized = transfer_entropy_value / entropy_scale
            # Transfer entropy inflated the random-forest weights, so it only
            # enters with a reduced share.
            combined_importance_information = (1 - 0.3) * base + 0.3 * transfer_entropy_normalized

            lag_matrix_importance_entropy_transfer[source_index, target_index, lag_index] = combined_importance_information
            random_forest_importance_matrix[source_index, target_index] += base
            combined_importance_matrix[source_index, target_index] += combined_importance_information

    # It made more sense to have different thresholds taking both statistical
    # methods into account than to use the most common single cut-off.
    flat_scores = [combined_importance_matrix[i, j] for i in range(n_vars) for j in range(n_vars) if i != j]

    # Lower thresholds lead to worse linear results.
    sorted_scores = sorted(flat_scores, reverse=True)
    threshold_index = n_vars * 2 + 1
    threshold = sorted_scores[threshold_index] if len(sorted_scores) > threshold_index else 0.0

    positive_importances = random_forest_importance_matrix[random_forest_importance_matrix > 0]
    if positive_importances.size:
        random_forest_threshold = np.percentile(positive_importances, 70)
    else:
        # Every importance is zero; no score can exceed 0, so no link is
        # flagged through the random-forest criterion.
        random_forest_threshold = 0.0

    max_transfer_values = np.max(entropy, axis=2)
    transfer_entropy_max = max_transfer_values[max_transfer_values > 0]

    # Transfer entropy tends to be too permissive or too restrictive; the
    # median made sense after some tests, or 0 if too restrictive.
    if len(transfer_entropy_max) > 0:
        entropy_threshold = np.percentile(transfer_entropy_max, 50)
    else:
        entropy_threshold = 0

    causality_matrix = np.zeros((n_vars, n_vars))
    predicted_lags = np.zeros((n_vars, n_vars))
    # Diagnostic output of the blended score matrix.
    print(combined_importance_matrix)
    for i in range(n_vars):
        for j in range(n_vars):
            if i != j:
                random_forest_score = random_forest_importance_matrix[i, j]
                max_transfer_entropy = np.max(entropy[i, j, :])

                if entropy_threshold > 0:
                    # Either criterion may nominate the pair, but the blended
                    # score must also clear the rank-based threshold.
                    if (max_transfer_entropy > entropy_threshold or random_forest_score > random_forest_threshold):
                        if combined_importance_matrix[i, j] > threshold:
                            causality_matrix[i, j] = 1
                else:
                    if random_forest_score > random_forest_threshold:
                        causality_matrix[i, j] = 1

                if causality_matrix[i, j] == 1:
                    best_lag_index = np.argmax(lag_matrix_importance_entropy_transfer[i, j, :])
                    predicted_lags[i, j] = best_lag_index + 1

    return causality_matrix.astype(int), predicted_lags.astype(int)
def prepare_causality_data(df, price_col='amazon_price_unified'):
    """Reshape the long product panel into per-category, per-variable wide tables.

    Args:
        df: Long-format frame with at least the columns ``'date'``,
            ``'query'`` and ``'category'``, plus any of the analysed
            variables: ``f'{price_col}_frequency'``,
            ``f'{price_col}_realized_variance'``,
            ``f'{price_col}_intraday_range'``, ``'store_rank'`` and
            ``'reviews'``.  Variables that are not present are skipped.
        price_col: Prefix of the three price-metric columns.

    Returns:
        ``{category: {variable: wide_df}}`` where ``wide_df`` is indexed by
        ``'date'`` and has one column per product named
        ``f'{query}_{variable}'`` (first observation per date and product).
    """
    price_metrics = [
        f'{price_col}_frequency',
        f'{price_col}_realized_variance',
        f'{price_col}_intraday_range'
    ]

    other_variables = ['store_rank', 'reviews']
    # A negative total excludes store_rank from the analysis (presumably
    # negative placeholders mark missing ranks; confirm against the data).
    # The column may be absent altogether, in which case it is skipped below
    # like any other missing variable.
    if 'store_rank' in df.columns and df['store_rank'].sum() < 0:
        other_variables = ['reviews']

    # Only variables that actually exist in the frame can be pivoted.
    available_variables = [
        variable for variable in price_metrics + other_variables
        if variable in df.columns
    ]

    prepared_data = {}

    for category in df['category'].unique():
        category_df = df[df['category'] == category]

        category_data = {}

        for variable in available_variables:
            pivot_df = category_df[['date', 'query', variable]].pivot_table(
                index='date',
                columns='query',
                values=variable,
                aggfunc='first'
            )

            # Tag every product column with the variable it carries so that
            # tables for different variables can be concatenated later.
            pivot_df.columns = [f"{query}_{variable}" for query in pivot_df.columns]

            category_data[variable] = pivot_df

        prepared_data[category] = category_data

    return prepared_data
# Load the pricing panel and attach each product's category.
df_full = pd.read_csv('/content/drive/MyDrive/transformed_data/amazon_pricing.csv')
df_cat = pd.read_csv('/content/drive/MyDrive/keepa_data/final_df.csv')
df_cat = df_cat[['query', 'category']].drop_duplicates()
df_analysis = pd.merge(df_full, df_cat, how='left', on=['query'])
df_analysis['date'] = pd.to_datetime(df_analysis['date'])
# Restrict the study window to 2024-09-01 (inclusive) .. 2025-01-01 (exclusive).
df_analysis = df_analysis[((df_analysis['date'] >= '2024-09-01') &
        (df_analysis['date'] < '2025-01-01'))]
price_col = 'amazon_price'
prepared_data = prepare_causality_data(df_analysis, price_col=price_col)
# Short metric name -> column name produced by prepare_causality_data.
metrics = {
    'frequency': f'{price_col}_frequency',
    'variance': f'{price_col}_realized_variance',
    'range': f'{price_col}_intraday_range'
}

max_lag = 7

results = []

for category, category_data in prepared_data.items():
    # Get unique queries in this category
    # Query is the name of the product given the extraction of the data from keepa
    # Query = Amazon Product ASIN Code
    queries = set()
    for variable, pivot_df in category_data.items():
        # Columns are named "<query>_<variable>"; strip the known suffix so
        # the product id is recovered even if it contains underscores.
        suffix = f"_{variable}"
        queries.update(col[:-len(suffix)] for col in pivot_df.columns)

    # Sorted so that the order of the result rows is reproducible.
    for query in sorted(queries):
        query_results = {
            'query': query,
            'category': category
        }

        # All values are assigned as 0 unless causality is found
        for metric_name in metrics:
            query_results[f'competitors_cause_{metric_name}'] = 0
            query_results[f'competitors_lag_{metric_name}'] = 0

            query_results[f'rank_causes_{metric_name}'] = 0
            query_results[f'rank_lag_{metric_name}'] = 0

            query_results[f'reviews_cause_{metric_name}'] = 0
            query_results[f'reviews_lag_{metric_name}'] = 0

        # For this research only metric specific causality was calculated
        # hence the causality is done through metric specific loops
        for metric_name, metric_var in metrics.items():
            metric_df = category_data.get(metric_var)
            target_col = f"{query}_{metric_var}"
            # The metric may be missing for the whole category or for this
            # product; the zero defaults set above then stay in place.
            if metric_df is None or target_col not in metric_df.columns:
                continue

            combined_df_list = []
            col_mapping = {}
            col_idx = 0

            # Add target column (query's metric)
            combined_df_list.append(metric_df[[target_col]])
            col_mapping[col_idx] = ('target', metric_var, target_col, query)
            col_idx += 1

            # Add source columns (same metric from competitors)
            metric_suffix = f"_{metric_var}"
            for col in metric_df.columns:
                col_query = col[:-len(metric_suffix)]
                if col_query != query:  # Only competitors
                    combined_df_list.append(metric_df[[col]])
                    col_mapping[col_idx] = ('competitor', metric_var, col, col_query)
                    col_idx += 1

            # Add rank for the query
            if 'store_rank' in category_data:
                var_df = category_data['store_rank']
                var_col = f"{query}_store_rank"
                if var_col in var_df.columns:
                    combined_df_list.append(var_df[[var_col]])
                    col_mapping[col_idx] = ('rank', 'store_rank', var_col, query)
                    col_idx += 1

            # Add reviews for the query
            if 'reviews' in category_data:
                var_df = category_data['reviews']
                var_col = f"{query}_reviews"
                if var_col in var_df.columns:
                    combined_df_list.append(var_df[[var_col]])
                    col_mapping[col_idx] = ('reviews', 'reviews', var_col, query)
                    col_idx += 1

            # Keep only dates on which every series has a value.
            combined_df = pd.concat(combined_df_list, axis=1).dropna()

            # A causal test needs at least one source next to the target and
            # more rows than the largest lag; otherwise keep the zero defaults.
            if combined_df.shape[1] < 2 or len(combined_df) <= max_lag:
                continue

            # Run causality analysis
            # NOTE(review): the frame is passed directly; confirm that
            # entropy_enhanced_random_forest accepts a DataFrame as well as a
            # CSV path and analyses all of these columns in this order.
            causality_matrix, lag_matrix = entropy_enhanced_random_forest(combined_df, max_lag)

            # Process results for this metric
            competitor_lags = []
            num_competitor_causal = 0

            source_indices = [i for i, info in col_mapping.items() if info[0] != 'target']
            target_idx = 0  # Target is always first

            for source_idx in source_indices:
                if source_idx < causality_matrix.shape[0] and target_idx < causality_matrix.shape[1]:
                    if causality_matrix[source_idx, target_idx] == 1:
                        source_info = col_mapping[source_idx]
                        source_type = source_info[0]
                        lag_value = lag_matrix[source_idx, target_idx]

                        # Handle different source types
                        if source_type == 'competitor':
                            num_competitor_causal += 1
                            competitor_lags.append(lag_value)

                        elif source_type == 'rank':
                            query_results[f'rank_causes_{metric_name}'] = 1
                            query_results[f'rank_lag_{metric_name}'] = lag_value

                        elif source_type == 'reviews':
                            query_results[f'reviews_cause_{metric_name}'] = 1
                            query_results[f'reviews_lag_{metric_name}'] = lag_value

            query_results[f'competitors_cause_{metric_name}'] = num_competitor_causal
            # If no competitor in the category was flagged, the lag stays 0;
            # otherwise report the mean lag over the flagged competitors.
            if competitor_lags:
                query_results[f'competitors_lag_{metric_name}'] = sum(competitor_lags) / len(competitor_lags)
            else:
                query_results[f'competitors_lag_{metric_name}'] = 0

        results.append(query_results)

results_df = pd.DataFrame(results)

# Put the identifier columns first, followed by the per-metric result columns.
base_cols = ['query', 'category']
metric_cols = []

for metric_name in metrics:
    metric_cols.extend([
        f'competitors_cause_{metric_name}',
        f'competitors_lag_{metric_name}',
        f'rank_causes_{metric_name}',
        f'rank_lag_{metric_name}',
        f'reviews_cause_{metric_name}',
        f'reviews_lag_{metric_name}'
    ])


all_cols = base_cols + metric_cols
# Guard against an empty result set, where none of the columns exist.
existing_columns = [col for col in all_cols if col in results_df.columns]
results_df = results_df[existing_columns]