## Confronto famiglie geniche estratte con AFLP e con PanDelos

In [2]:
import os
import pandas as pd

In [6]:
def process_files(folder_a, folder_b, folder_c):

    os.makedirs(folder_c, exist_ok=True)
    
    files_a = {f for f in os.listdir(folder_a) if f.endswith('.xlsx')}
    files_b = {f for f in os.listdir(folder_b) if f.endswith('.xlsx')}
    
    common_files = files_a & files_b
    
    for file_name in common_files:
        print(file_name)
        path_a = os.path.join(folder_a, file_name)
        path_b = os.path.join(folder_b, file_name)
        
        df_a = pd.read_excel(path_a)
        df_b = pd.read_excel(path_b)
        
        # Rinominare colonne per evitare conflitti
        df_a.rename(columns={
            'Rank': 'Rank_PanDelos',
            'Product': 'Gene family',
            'Positive importance': 'Positive importance_PanDelos',
            'Negative importance': 'Negative importance_PanDelos',
        }, inplace=True)
        
        df_b.rename(columns={
            'Rank': 'Rank_AFLP',
            'Positive importance': 'Positive importance_AFLP',
            'Negative importance': 'Negative importance_AFLP'
        }, inplace=True)
        
        # Effettuare il merge sulle colonne corrispondenti
        merged_df = pd.merge(df_a[['Rank_PanDelos', 'Gene family', 'Positive importance_PanDelos', 'Negative importance_PanDelos', 'Gene family PanDelos']],
                             df_b[['Rank_AFLP', 'Gene family', 'Positive importance_AFLP', 'Negative importance_AFLP']],
                             on='Gene family',
                             how='inner')
        
        merged_df = merged_df[['Gene family', 'Rank_PanDelos', 'Rank_AFLP', 'Positive importance_PanDelos', 'Negative importance_PanDelos', 'Positive importance_AFLP', 'Negative importance_AFLP', 'Gene family PanDelos']]
        
        # Ordinare le righe in base alla differenza tra Rank_AFLP e Rank_PanDelos in ordine crescente
        merged_df['Rank_Difference'] = abs(merged_df['Rank_AFLP'] - merged_df['Rank_PanDelos'])
        merged_df = merged_df.sort_values(by='Rank_Difference')
        merged_df = merged_df.drop(columns=['Rank_Difference'])
        
        output_path = os.path.join(folder_c, file_name)
        merged_df.to_excel(output_path, index=False)
        print(f"Salvato: {output_path}")

    return merged_df


fam_gen_dir = "Result/Coverages/Common_Genes"
fam_gen_pandelos_dir = "Result/Coverages/Common_Genes_Pandelos"

output = process_files(fam_gen_pandelos_dir, fam_gen_dir, "Result/Coverages/Genes_matching_AFLP-PanDelos/")

display(output)

AmiDon (starch).xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/AmiDon (starch).xlsx
D-FRUctose.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/D-FRUctose.xlsx
N-AcetylGlucosamine.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/N-AcetylGlucosamine.xlsx
L-XYLose.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/L-XYLose.xlsx
Methyl-αD-Glucopyranoside.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/Methyl-αD-Glucopyranoside.xlsx
D-TREhalose.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/D-TREhalose.xlsx
D-ARabitoL.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/D-ARabitoL.xlsx
potassium GlucoNaTe.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/potassium GlucoNaTe.xlsx
GENtiobiose.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/GENtiobiose.xlsx
DULcitol.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP-PanDelos/DULcitol.xlsx
XyLiTol.xlsx
Salvato: Result/Coverages/Genes_matching_AFLP

Unnamed: 0,Gene family,Rank_PanDelos,Rank_AFLP,Positive importance_PanDelos,Negative importance_PanDelos,Positive importance_AFLP,Negative importance_AFLP,Gene family PanDelos
346,copper resistance protein,4025,4025,0.000000,0.000000,,0.258621,"[311937, 37474, 168359, 302292, 388374, 299607..."
582,conjugated bile salt hydrolase,8749,8751,,0.128567,,0.128567,"[207496, 20696]"
889,dihydroxyacetone kinase family protein,12445,12439,,0.055570,,0.065014,"[128432, 47204, 359196]"
43,alkene reductase,432,424,0.101541,0.000000,0.203083,,"[336512, 312321, 351969, 309988, 173798, 35632..."
27,pirin family protein,303,295,0.168151,0.000000,0.280251,,"[173792, 172353, 238016, 313027, 334916, 23763..."
...,...,...,...,...,...,...,...,...
2251,sugar phosphate isomerase epimerase,50722,664,,0.000000,0.116523,0.168560,[299594]
2280,tyrosine phosphatase,52055,1986,,0.000000,,0.458931,"[47810, 346322, 66774]"
2269,translation initiation factor,51486,1155,,0.000000,0.001116,0.224803,"[308920, 219904, 172950, 374849]"
2285,universal stress protein family,52236,596,,0.000000,0.140316,0.265552,"[21416, 197900]"
