# The pipeline : 
***
### A. Load the files
### B. Filter out the Dpo clusters present in fewer than n prophages
### C. Chi2 test of each Dpo cluster for association with a given KLtype; Bonferroni correction
### D. Build the model based on the presence/absence of any one of the associated Dpo clusters
### E. Model evaluation 
***

> A

In [None]:
import os
import pandas as pd 

# Working directories on the cluster.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Rafa_task"
path_files = f"{path_work}/sequence_similarity"

# Both tables are tab-separated, with a header row and the first column as index.
tsv_read_options = dict(header=0, sep="\t", index_col=0)

# KL type table.
KL_types_DF = pd.read_csv(f"{path_work}/DF_prophage_ID.KLtypes.0606.csv", **tsv_read_options)
# The working file: the Dpo cluster table (CD-HIT 0.8 file).
df_proDpo = pd.read_csv(f"{path_files}/prophage_Dpo.cdhit_0.8.csv", **tsv_read_options)

***
>B

In [None]:
# Minimum column total a Dpo cluster needs in order to be kept.
dpo_filter = 20

# Names of the columns whose values add up to at least `dpo_filter`.
kept_dpo_clusters = [
    cluster_name
    for cluster_name in df_proDpo.columns
    if sum(df_proDpo[cluster_name]) >= dpo_filter
]
df_proDpo_filtered = df_proDpo[kept_dpo_clusters]

***
>C

The Fisher's exact test version:

In [None]:
def calculate_fisher(df_proDpo, KL_types_DF):
    """Test every (Dpo cluster, KL type) pair with Fisher's exact test.

    Each column of ``df_proDpo`` is paired with each column of
    ``KL_types_DF``; the two columns are cross-tabulated with
    ``pd.crosstab`` and the table is passed to ``fisher_exact``. The p-values
    of all pairs are then Bonferroni-corrected together by ``multipletests``.

    Returns:
        DataFrame with columns ``Dpo_cluster``, ``KLtype``, ``Pvalue`` and
        ``Pvalue_corrected``, restricted to the pairs whose corrected
        p-value is below 0.05.
    """

    def _pair_pvalue(cluster_name, kl_name):
        # Contingency table of the two columns, then the exact-test p-value.
        table = pd.crosstab(df_proDpo[cluster_name], KL_types_DF[kl_name])
        _, pvalue = fisher_exact(table)
        return pvalue

    rows = [
        [cluster_name, kl_name, _pair_pvalue(cluster_name, kl_name)]
        for cluster_name in df_proDpo.columns
        for kl_name in KL_types_DF.columns
    ]
    results_df = pd.DataFrame(rows, columns=['Dpo_cluster', 'KLtype', 'Pvalue'])

    # multipletests returns (reject, corrected p-values, ...); only the
    # corrected p-values are needed here.
    results_df['Pvalue_corrected'] = multipletests(results_df['Pvalue'], method='bonferroni')[1]

    is_significant = results_df['Pvalue_corrected'] < 0.05
    return results_df[is_significant]

The Chi2 test version : 

In [None]:
import os 
import pandas as pd
os.environ['OPENBLAS_NUM_THREADS'] = '4'
from scipy.stats import chi2_contingency
from statsmodels.sandbox.stats.multicomp import multipletests
from scipy.stats import fisher_exact

def calculate_chi2(df_proDpo, KL_types_DF, n_tests=None):
    """Screen every (Dpo cluster, KL type) pair with a chi-squared test.

    For each column of ``KL_types_DF`` and each column of ``df_proDpo`` the
    two columns are cross-tabulated with ``pd.crosstab`` and the table is
    passed to ``chi2_contingency``. The raw p-values are then
    Bonferroni-corrected: multiplied by the number of tests and capped at 1.

    Args:
        df_proDpo: DataFrame with one column per Dpo cluster.
        KL_types_DF: DataFrame with one column per KL type, sharing its index
            with ``df_proDpo``.
        n_tests: Bonferroni factor. When ``None`` (the default) the number of
            columns of ``df_proDpo`` is used, i.e. the number of tests run
            for a single KL type.
            NOTE(review): this default corrects within each KL type only; to
            correct across every (cluster, KL type) pair, as
            ``multipletests`` does on a full p-value vector, pass
            ``n_tests=len(df_proDpo.columns) * len(KL_types_DF.columns)``.

    Returns:
        Tuple ``(significant_results_df, results_df)``. ``results_df`` has one
        row per tested pair with columns ``Dpo_cluster``, ``KLtype``,
        ``Chi2``, ``Pvalue`` and ``Pvalue_corrected``;
        ``significant_results_df`` keeps the rows whose corrected p-value is
        below 0.05.
    """
    results = []
    for KLtype in KL_types_DF.columns:
        for Dpo_cluster in df_proDpo.columns:
            contingency_table = pd.crosstab(df_proDpo[Dpo_cluster], KL_types_DF[KLtype])
            chi2, p, _, _ = chi2_contingency(contingency_table)
            results.append([Dpo_cluster, KLtype, chi2, p])
    results_df = pd.DataFrame(results, columns=['Dpo_cluster', 'KLtype', 'Chi2', 'Pvalue'])

    # Bonferroni correction.
    if n_tests is None:
        n_tests = len(df_proDpo.columns)
    # A corrected p-value is still a probability, so cap it at 1. The cast
    # keeps the arithmetic well-defined when no pair was tested and the
    # column is an empty object column.
    results_df['Pvalue_corrected'] = (results_df["Pvalue"].astype(float) * n_tests).clip(upper=1.0)

    significant_results_df = results_df[results_df['Pvalue_corrected'] < 0.05]
    return significant_results_df, results_df

# Run the chi2 screen on the filtered Dpo clusters: df_result receives the
# significant pairs, df_raw every tested pair.
df_result , df_raw = calculate_chi2(df_proDpo_filtered, KL_types_DF)

>D

In [None]:
# For every KL type present in df_result, collapse its associated Dpo clusters
# into a single 0/1 flag per row, cross-tabulate that flag against the KL type
# column, and write the four cells of the table to a TSV file.
with open(f"{path_work}/CrossTab.KLtypes.20.tsv", "w") as outfile:
    header_fields = ["KLtype", "00", "01", "10", "11", "Dpo_clusters"]
    outfile.write("\t".join(header_fields) + "\n")
    for KLtype in df_result["KLtype"].unique():
        kl_mask = df_result["KLtype"] == KLtype
        dpo_cluster_interest = df_result.loc[kl_mask, "Dpo_cluster"].tolist()
        df_dpo_filtered = df_proDpo_filtered[dpo_cluster_interest].copy()
        # Flag is 1 when the row total over the selected clusters is above 0.
        df_dpo_filtered["presence_dpo"] = df_dpo_filtered.apply(
            lambda row: int(row.values.sum() > 0), axis=1
        )
        contingency_kl = pd.crosstab(df_dpo_filtered["presence_dpo"], KL_types_DF[KLtype])
        # contingency_kl[c][r]: c is the KL type column value, r the presence flag.
        cells = [contingency_kl[c][r] for c in (0, 1) for r in (0, 1)]
        fields = [str(KLtype), *(str(cell) for cell in cells), ",".join(dpo_cluster_interest)]
        outfile.write("\t".join(fields) + "\n")
        print(contingency_kl.values)
        
        

In [None]:
# Print, for every KL type present in df_result, the Dpo clusters associated
# with it, and rebuild the presence flag and its contingency table.
for KLtype in df_result["KLtype"].unique():
    kl_mask = df_result["KLtype"] == KLtype
    dpo_cluster_interest = df_result.loc[kl_mask, "Dpo_cluster"].tolist()
    df_dpo_filtered = df_proDpo_filtered[dpo_cluster_interest].copy()
    print(KLtype, dpo_cluster_interest, "\n")
    # Flag is 1 when the row total over the selected clusters is above 0.
    df_dpo_filtered["presence_dpo"] = df_dpo_filtered.apply(
        lambda row: int(row.values.sum() > 0), axis=1
    )
    contingency_kl = pd.crosstab(df_dpo_filtered["presence_dpo"], KL_types_DF[KLtype])

In [None]:
# Show the df_result row(s) for the pair KL64 / Dpo_cdhit_134.
is_kl64 = df_result["KLtype"] == "KL64"
is_dpo_cdhit_134 = df_result["Dpo_cluster"] == "Dpo_cdhit_134"
df_result[is_kl64 & is_dpo_cdhit_134]

In [None]:
# Shell commands (not Python): copy the crosstab TSV files from the remote
# host to the local PPT_clean directory.
# NOTE(review): the .3 file is not written by the cells shown here; presumably
# it comes from an earlier run with a different filter - confirm.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Rafa_task/CrossTab.KLtypes.3.tsv \
/media/concha-eloko/Linux/PPT_clean

# Same transfer for the .20 crosstab file.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Rafa_task/CrossTab.KLtypes.20.tsv \
/media/concha-eloko/Linux/PPT_clean