_Import library: pandas, numpy, scipy, sklearn, itertools, statsmodels_

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import itertools
from openpyxl import Workbook

**Read data from CSV**

In [None]:
model_args = {
    'BASE' : "D:\\Dev\\Seabank\\python_scripts\\file\\input\\model_Q124"
    ,'OUTPUT' : "D:\\Dev\\Seabank\\python_scripts\\file\\input\\model_Q124\\output"
    ,'MEV_DATA' : "seabank_data.csv"
    ,'ODR_MOM' : "pd_odr_mom.csv"
    ,'ODR_QOQ' : "pd_odr_qoq.csv"
    ,'ODR_HOH' : "pd_odr_hoh.csv"
    ,'ODR_YOY' : "pd_odr_yoy.csv"
    ,'USE_ODR' : 'hoh'
    ,'p_val_corr': 0.05
}

match model_args['USE_ODR']:
    case 'mom':
        y_odr = pd.read_csv(Path().joinpath(model_args['BASE'], model_args['ODR_MOM']), sep = ";",decimal=".", parse_dates=["pt_date"],dtype={"odr_loan": float})
    case 'qoq':
        y_odr = pd.read_csv(Path().joinpath(model_args['BASE'], model_args['ODR_QOQ']), sep = ";",decimal=".", parse_dates=["pt_date"],dtype={"odr_loan": float})
    case 'hoh':
        y_odr = pd.read_csv(Path().joinpath(model_args['BASE'], model_args['ODR_HOH']), sep = ";",decimal=".", parse_dates=["pt_date"],dtype={"odr_loan": float})
    case 'yoy':
        y_odr = pd.read_csv(Path().joinpath(model_args['BASE'], model_args['ODR_YOY']), sep = ";",decimal=".", parse_dates=["pt_date"],dtype={"odr_loan": float})
    case _:
        raise ValueError("Check variable USE_ODR")

**ODR data loads and multiindex view for ODR variables (y-variable) of each product & tenor.**

In [None]:
y_odr = y_odr.pivot(index="pt_date",columns=["pd_segment", "tenor"], values="odr_loan").drop(axis=1,columns=['Digital_KPL','Digital_EML']).fillna(0.0)
product_df = ['_'.join(map(str,x)) for x in y_odr.columns.to_list()]
odr_df = [y_odr.iloc[:, x].rename('_'.join([product_df[x], model_args['USE_ODR']])) for x in range(y_odr.shape[1])]

**MEV data load and perform Pearson Correlation Test pairwise only for Macroeconomic variable.**

In [24]:
x_mev = pd.read_csv(Path().joinpath(model_args['BASE'], model_args['MEV_DATA']),sep=";",decimal=",", index_col=['Date'],parse_dates=["Date"]).fillna(0.0)
x_mev.index = x_mev.index.to_period("M").to_timestamp("M")

  x_mev = pd.read_csv(Path().joinpath(model_args['BASE'], model_args['MEV_DATA']),sep=";",decimal=",", index_col=['Date'],parse_dates=["Date"]).fillna(0.0)


**Make 2-pairwise x-variable and y-variable combinations list and merge each of ODR to MEV combinations respectively**

In [38]:
permutate = itertools.permutations(x_mev.columns.to_list(), r=2)
new_df = [x_mev.copy()[[x[0],x[1]]] for x in list(permutate)]
new_df = ((y.name, x.merge(y.to_frame(),left_index=True, right_index=True)) for y in odr_df for x in new_df)

**Do Pearson-Correlation Test for each dataset with y-variable**

In [39]:
corr_pairwise = [(i, x[0], x[1].corr(method="pearson")) for i, x in enumerate(new_df.items())]
corr_pairwise[0][2]

49200

In [27]:
len(corr_pairwise)
len(new_df)

1640

**Save to file: pearson-correlation mev & odr_pairwise**

In [22]:
x_corr = x_mev.corr(method="pearson", numeric_only=True)
x_corr.to_csv(Path().joinpath(model_args["OUTPUT"], "pearson_corr_mev"),sep=";", decimal=",")

wb = Workbook()
ws = wb.active
with pd.ExcelWriter(Path().joinpath(model_args["OUTPUT"], "pearson_corr_odr.xlsx"),mode="w") as w:
    # w.book = wb
    # w.sheets=dict((ws.title, ws) for ws in wb.worksheets)

    for item in corr_pairwise:
        i, model, data = item[0], item[1], item[2]

        data.to_excel(w, sheet_name=f'{i}_{model}')