In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from dowhy import CausalModel
import warnings
import os

# Suppress every warning for the rest of the kernel session.
# NOTE(review): presumably done to keep the output of the long estimation
# loops readable; it also hides any deprecation or numerical warnings raised
# by the libraries in use — confirm that is acceptable.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the `cstmr` table from a CSV located in the current working directory.
cstmr = pd.read_csv("./cstmr_june_22_dowhy.csv")

In [4]:
# Outcome column names: Flag_6 through Flag_17 (twelve columns).
outcome = ["Flag_" + str(n) for n in range(6, 18)]

# Column names handed to CausalModel as `common_causes` (one per line so that
# additions and removals show up cleanly in diffs).
common_causes = [
    "AGE",
    "BMI",
    "CONTRACT_START_DATE_DOY",
    "SEX_Female",
    "CLUB_ID",
    "MEMBERSHIP_CATEGORY_NAME",
    "TECRUBE",
    "GECE_PROB",
    "OGLEN_PROB",
    "AKSAM_PROB",
    "SABAH_PROB",
    "IKINDI_PROB",
]

In [7]:
# Keep only the rows that have a value in all three of these columns
# (TECRUBE = experience, FORM = form, SIKLIK = frequency).
cols = ["TECRUBE", "FORM", "SIKLIK"]
cstmr = cstmr.dropna(subset=cols)

In [8]:

# Treatment columns for this run. Each string is used verbatim as a column
# name of `cstmr` (for example a column literally named "FORM>10").
# NOTE(review): presumably 0/1 threshold dummies — confirm against the data prep.
treatment_v3 = [
    'FORM>10',
    'FORM>30',
    'TECRUBE>0',
    'TECRUBE>1',
    'TECRUBE>2',
    'SIKLIK>1',
    'SIKLIK>2'
]

# Exclude TECRUBE from the common causes for this run (presumably because the
# TECRUBE>k treatments are derived from it — confirm). The list is rebuilt
# rather than mutated with list.remove(), so re-running this cell does not
# raise ValueError once TECRUBE is already gone.
common_causes = [cause for cause in common_causes if cause != 'TECRUBE']

outcome = [f'Flag_{i}' for i in range(6, 18)]

file_path = "./dowhy-tests/dowhy-tests-gtkb-june22.csv"
# Create the output directory if needed; open() does not create parent
# directories and would fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Write the header row only when the file does not exist yet; an existing
# file is kept and new rows are appended below it.
if not os.path.exists(file_path):
    with open(file_path, "w") as f:
        f.write("flag,column,estimated_effect,new_effect_with_random_cause,p_value\n")

# One causal model per (outcome, treatment) pair.
for out in tqdm(outcome):
    for t in treatment_v3:
        model = CausalModel(
            data=cstmr,
            treatment=t,
            outcome=out,
            common_causes=common_causes,
        )
        identified_estimand = model.identify_effect()
        # Effect estimate via propensity-score matching on the backdoor set.
        estimate = model.estimate_effect(
            identified_estimand,
            method_name="backdoor.propensity_score_matching",
            confidence_intervals=False,
        )
        # Refutation check: re-estimate after adding a random common cause.
        refute_results = model.refute_estimate(
            identified_estimand,
            estimate,
            method_name="random_common_cause",
            n_jobs=24,
        )
        est, new, p_val = refute_results.estimated_effect, refute_results.new_effect, refute_results.refutation_result["p_value"]

        # Append and close right away so every finished row is already on disk
        # before the next (slow) model is fitted.
        with open(file_path, "a") as f:
            f.write(f"{out},{t},{est},{new},{p_val}\n")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [4:55:53<00:00, 1479.44s/it]


### Clusters

In [9]:
# Re-read the table from disk: the earlier filtering step rebound `cstmr` to a
# subset without missing TECRUBE/FORM/SIKLIK values, and this section starts
# again from the unfiltered file.
cstmr = pd.read_csv("./cstmr_june_22_dowhy.csv")

In [10]:
# One sub-frame per MAX_CLUSTER label and one per value of SEX_Female.
# Glossary: sabah = morning, gece = night, ikindi = afternoon, öğlen = noon,
# erkek = male, kadın = female.
sabahci_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("SABAH")]
gece_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("GECE")]
aksam_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("AKSAM")]
ikindi_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("IKINDI")]
oglen_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("OGLEN")]
erkek_df = cstmr.loc[cstmr["SEX_Female"].eq(0)]
kadin_df = cstmr.loc[cstmr["SEX_Female"].eq(1)]

# Report (rows, columns) of every subset, one per line, in the order above.
print(
    sabahci_df.shape,
    gece_df.shape,
    aksam_df.shape,
    ikindi_df.shape,
    oglen_df.shape,
    erkek_df.shape,
    kadin_df.shape,
    sep="\n",
)

(25301, 478)
(71370, 478)
(35764, 478)
(39835, 478)
(27513, 478)
(144747, 478)
(55036, 478)


In [11]:
# Outcome column names: Flag_6 through Flag_17 (twelve columns).
outcome = ["Flag_" + str(n) for n in range(6, 18)]

# Common causes for the per-cluster runs. Unlike the list used for the
# full-table run, this one carries no *_PROB columns.
common_causes = [
    "AGE",
    "BMI",
    "CONTRACT_START_DATE_DOY",
    "SEX_Female",
    "CLUB_ID",
    "MEMBERSHIP_CATEGORY_NAME",
    "TECRUBE",
]

### Master Treatment

In [12]:
# Per-cluster run of the "master" treatments: for every cluster frame, fit one
# causal model per (treatment, outcome) pair and append one CSV row per model
# to that cluster's result file.
df_names = [sabahci_df, gece_df, aksam_df, ikindi_df, oglen_df]
file_names = ["SABAH","GECE","AKSAM","IKINDI","OGLEN"]

# Treatment and outcome column names are the same for every cluster, so they
# are built once instead of on every pass of the loop.
treatment_v1 = [
    'GROUP_LESSONS_6_WEEKS_0',
    'GROUP_LESSONS_6_WEEKS_1',
    'GROUP_LESSONS_6_WEEKS_2',
    'GROUP_LESSONS_6_WEEKS_3',
    'PT_6_WEEKS_0',
    'PT_6_WEEKS_1',
    'PT_6_WEEKS_2',
    'PT_6_WEEKS_3',
    'DIFF_VST_6_0',
    'DIFF_VST_6_1',
    'DIFF_VST_6_2',
    'DIFF_VST_6_3',
    'DIFF_GROUP_LESSON_6_0',
    'DIFF_GROUP_LESSON_6_1',
    'DIFF_GROUP_LESSON_6_2',
    'DIFF_GROUP_LESSON_6_3',
    'CREDIT_6_WEEKS_0',
    'CREDIT_6_WEEKS_1',
    'CREDIT_6_WEEKS_2',
    'CREDIT_6_WEEKS_3',
    'PAIRS_6_WEEKS_0',
    'PAIRS_6_WEEKS_1',
    'PAIRS_6_WEEKS_2',
    'PAIRS_6_WEEKS_3',
]
outcome = [f'Flag_{i}' for i in range(6, 18)]

for cluster_df, cluster_name in zip(df_names, file_names):
    file_path = "./dowhy-tests/dowhy-tests-june22_" + cluster_name + ".csv"
    # Create the output directory if needed; open() does not create parent
    # directories and would fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Write the header row only when the file does not exist yet; an existing
    # file is kept and new rows are appended below it.
    if not os.path.exists(file_path):
        with open(file_path, "w") as f:
            f.write("flag,column,estimated_effect,new_effect_with_random_cause,p_value\n")

    for t in tqdm(treatment_v1):
        for out in outcome:
            model = CausalModel(
                data=cluster_df,
                treatment=t,
                outcome=out,
                common_causes=common_causes,
            )
            identified_estimand = model.identify_effect()
            # Effect estimate via propensity-score matching on the backdoor set.
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.propensity_score_matching",
                confidence_intervals=False,
            )
            # Refutation check: re-estimate after adding a random common cause.
            refute_results = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="random_common_cause",
                n_jobs=24,
            )
            est, new, p_val = refute_results.estimated_effect, refute_results.new_effect, refute_results.refutation_result["p_value"]

            # Append and close right away so every finished row is already on
            # disk before the next (slow) model is fitted.
            with open(file_path, "a") as f:
                f.write(f"{out},{t},{est},{new},{p_val}\n")


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [5:16:03<00:00, 790.16s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [15:56:01<00:00, 2390.07s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [6:28:24<00:00, 971.02s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [7:30:23<00:00, 1125.99s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [5:13:43<00:00, 784.32s/it]


### GTKB as Predictors - Dummies

In [13]:
# Drop rows with a missing value in any of TECRUBE, FORM or SIKLIK, then
# rebuild the per-cluster frames from the filtered table.
# Note: erkek_df and kadin_df are not rebuilt here and still refer to the
# unfiltered table.
cols = ["TECRUBE", "FORM", "SIKLIK"]
cstmr = cstmr.dropna(subset=cols)

sabahci_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("SABAH")]
gece_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("GECE")]
aksam_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("AKSAM")]
ikindi_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("IKINDI")]
oglen_df = cstmr.loc[cstmr["MAX_CLUSTER"].eq("OGLEN")]


In [14]:
# Per-cluster run of the FORM / TECRUBE / SIKLIK threshold treatments: one
# causal model per (outcome, treatment) pair for every cluster frame.
df_names = [sabahci_df, gece_df, aksam_df, ikindi_df, oglen_df]
file_names = ["SABAH","GECE","AKSAM","IKINDI","OGLEN"]

# Exclude TECRUBE from the common causes for this run (presumably because the
# TECRUBE>k treatments are derived from it — confirm). The list is rebuilt
# rather than mutated with list.remove(), so re-running this cell does not
# raise ValueError once TECRUBE is already gone.
common_causes = [cause for cause in common_causes if cause != 'TECRUBE']

# Treatment and outcome column names are the same for every cluster, so they
# are built once instead of on every pass of the loop.
treatment_v3 = [
    'FORM>10',
    'FORM>30',
    'TECRUBE>0',
    'TECRUBE>1',
    'TECRUBE>2',
    'SIKLIK>1',
    'SIKLIK>2'
]
outcome = [f'Flag_{i}' for i in range(6, 18)]

for cluster_df, cluster_name in zip(df_names, file_names):
    # NOTE(review): this is the same path pattern the master-treatment run
    # writes to, so these rows are appended to those per-cluster files —
    # confirm that mixing both result sets in one file is intended.
    file_path = "./dowhy-tests/dowhy-tests-june22_" + cluster_name + ".csv"
    # Create the output directory if needed; open() does not create parent
    # directories and would fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Write the header row only when the file does not exist yet.
    if not os.path.exists(file_path):
        with open(file_path, "w") as f:
            f.write("flag,column,estimated_effect,new_effect_with_random_cause,p_value\n")

    for out in tqdm(outcome):
        for t in treatment_v3:
            model = CausalModel(
                data=cluster_df,
                treatment=t,
                outcome=out,
                common_causes=common_causes,
            )
            identified_estimand = model.identify_effect()
            # Effect estimate via propensity-score matching on the backdoor set.
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.propensity_score_matching",
                confidence_intervals=False,
            )
            # Refutation check: re-estimate after adding a random common cause.
            refute_results = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="random_common_cause",
                n_jobs=24,
            )
            est, new, p_val = refute_results.estimated_effect, refute_results.new_effect, refute_results.refutation_result["p_value"]

            # Append and close right away so every finished row is already on
            # disk before the next (slow) model is fitted.
            with open(file_path, "a") as f:
                f.write(f"{out},{t},{est},{new},{p_val}\n")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [28:13<00:00, 141.11s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [1:18:54<00:00, 394.58s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [40:46<00:00, 203.85s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [42:35<00:00, 212.92s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [29:22<00:00, 146.84s/it]


### Gender

In [15]:
# Load the table for the gender analysis and split it on SEX_Female
# (erkek = male -> 0, kadın = female -> 1).
# NOTE(review): this reads the "may_30" file, while the other sections read
# "cstmr_june_22_dowhy.csv" and the gender results are written to files named
# "june22" — confirm which input file is intended.
cstmr = pd.read_csv("./cstmr_may_30_dowhy.csv")

erkek_df = cstmr.loc[cstmr["SEX_Female"].eq(0)]
kadin_df = cstmr.loc[cstmr["SEX_Female"].eq(1)]

In [16]:
# Outcome column names: Flag_6 through Flag_17 (twelve columns).
outcome = ["Flag_" + str(n) for n in range(6, 18)]

# Common causes for the per-sex runs. SEX_Female is not listed here; the
# frames these are used with are already split on that column.
common_causes = [
    "AGE",
    "BMI",
    "CONTRACT_START_DATE_DOY",
    "CLUB_ID",
    "MEMBERSHIP_CATEGORY_NAME",
    "TECRUBE",
    "GECE_PROB",
    "OGLEN_PROB",
    "AKSAM_PROB",
    "SABAH_PROB",
    "IKINDI_PROB",
]

In [17]:
# Per-sex run of the "master" treatments: for the male and the female frame,
# fit one causal model per (treatment, outcome) pair and append one CSV row
# per model to that group's result file.
df_names = [erkek_df, kadin_df]
file_names = ["ERKEK","KADIN"]

# Treatment and outcome column names are the same for both groups, so they
# are built once instead of on every pass of the loop.
treatment_v1 = [
    'GROUP_LESSONS_6_WEEKS_0',
    'GROUP_LESSONS_6_WEEKS_1',
    'GROUP_LESSONS_6_WEEKS_2',
    'GROUP_LESSONS_6_WEEKS_3',
    'PT_6_WEEKS_0',
    'PT_6_WEEKS_1',
    'PT_6_WEEKS_2',
    'PT_6_WEEKS_3',
    'DIFF_VST_6_0',
    'DIFF_VST_6_1',
    'DIFF_VST_6_2',
    'DIFF_VST_6_3',
    'DIFF_GROUP_LESSON_6_0',
    'DIFF_GROUP_LESSON_6_1',
    'DIFF_GROUP_LESSON_6_2',
    'DIFF_GROUP_LESSON_6_3',
    'CREDIT_6_WEEKS_0',
    'CREDIT_6_WEEKS_1',
    'CREDIT_6_WEEKS_2',
    'CREDIT_6_WEEKS_3',
    'PAIRS_6_WEEKS_0',
    'PAIRS_6_WEEKS_1',
    'PAIRS_6_WEEKS_2',
    'PAIRS_6_WEEKS_3',
]
outcome = [f'Flag_{i}' for i in range(6, 18)]

for group_df, group_name in zip(df_names, file_names):
    file_path = "./dowhy-tests/dowhy-tests-june22_" + group_name + ".csv"
    # Create the output directory if needed; open() does not create parent
    # directories and would fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Write the header row only when the file does not exist yet; an existing
    # file is kept and new rows are appended below it.
    if not os.path.exists(file_path):
        with open(file_path, "w") as f:
            f.write("flag,column,estimated_effect,new_effect_with_random_cause,p_value\n")

    for t in tqdm(treatment_v1):
        for out in outcome:
            model = CausalModel(
                data=group_df,
                treatment=t,
                outcome=out,
                common_causes=common_causes,
            )
            identified_estimand = model.identify_effect()
            # Effect estimate via propensity-score matching on the backdoor set.
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.propensity_score_matching",
                confidence_intervals=False,
            )
            # Refutation check: re-estimate after adding a random common cause.
            refute_results = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="random_common_cause",
                n_jobs=24,
            )
            est, new, p_val = refute_results.estimated_effect, refute_results.new_effect, refute_results.refutation_result["p_value"]

            # Append and close right away so every finished row is already on
            # disk before the next (slow) model is fitted.
            with open(file_path, "a") as f:
                f.write(f"{out},{t},{est},{new},{p_val}\n")


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [31:07:02<00:00, 4667.62s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [10:24:15<00:00, 1560.66s/it]


In [18]:
# Drop rows with a missing value in any of TECRUBE, FORM or SIKLIK, then
# rebuild the male / female frames from the filtered table.
cols = ["TECRUBE", "FORM", "SIKLIK"]
cstmr = cstmr.dropna(subset=cols)

erkek_df = cstmr.loc[cstmr["SEX_Female"].eq(0)]
kadin_df = cstmr.loc[cstmr["SEX_Female"].eq(1)]

In [19]:
# Per-sex run of the FORM / TECRUBE / SIKLIK threshold treatments: one causal
# model per (outcome, treatment) pair for the male and the female frame.
df_names = [erkek_df, kadin_df]
file_names = ["ERKEK","KADIN"]

# Exclude TECRUBE from the common causes for this run (presumably because the
# TECRUBE>k treatments are derived from it — confirm). The list is rebuilt
# rather than mutated with list.remove(), so re-running this cell does not
# raise ValueError once TECRUBE is already gone.
common_causes = [cause for cause in common_causes if cause != 'TECRUBE']

# Treatment and outcome column names are the same for both groups, so they
# are built once instead of on every pass of the loop.
treatment_v3 = [
    'FORM>10',
    'FORM>30',
    'TECRUBE>0',
    'TECRUBE>1',
    'TECRUBE>2',
    'SIKLIK>1',
    'SIKLIK>2'
]
outcome = [f'Flag_{i}' for i in range(6, 18)]

for group_df, group_name in zip(df_names, file_names):
    # NOTE(review): this is the same path pattern the master-treatment run
    # writes to, so these rows are appended to those per-sex files — confirm
    # that mixing both result sets in one file is intended.
    file_path = "./dowhy-tests/dowhy-tests-june22_" + group_name + ".csv"
    # Create the output directory if needed; open() does not create parent
    # directories and would fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Write the header row only when the file does not exist yet.
    if not os.path.exists(file_path):
        with open(file_path, "w") as f:
            f.write("flag,column,estimated_effect,new_effect_with_random_cause,p_value\n")

    for out in tqdm(outcome):
        for t in treatment_v3:
            model = CausalModel(
                data=group_df,
                treatment=t,
                outcome=out,
                common_causes=common_causes,
            )
            identified_estimand = model.identify_effect()
            # Effect estimate via propensity-score matching on the backdoor set.
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.propensity_score_matching",
                confidence_intervals=False,
            )
            # Refutation check: re-estimate after adding a random common cause.
            refute_results = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="random_common_cause",
                n_jobs=24,
            )
            est, new, p_val = refute_results.estimated_effect, refute_results.new_effect, refute_results.refutation_result["p_value"]

            # Append and close right away so every finished row is already on
            # disk before the next (slow) model is fitted.
            with open(file_path, "a") as f:
                f.write(f"{out},{t},{est},{new},{p_val}\n")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [2:33:26<00:00, 767.25s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [1:02:28<00:00, 312.38s/it]
