In [2]:
import pandas as pd

**En este análisis se persigue obtener la expresión de los genes H2H duplicados para comprobar si un par se ha especializado en la infección, mientras que el otro par interviene más en el crecimiento del hongo en condiciones saprófitas. Para ello, se determinaron los ortogrupos a los que pertenecen los genes H2H duplicados y se obtuvieron los datos de expresión de estos genes en el estudio donde se compara la condición de infección frente al proceso no infectivo.**

In [3]:
# Load the tab-separated table of H2H gene pairs (orthogroups plus differential-expression values).
ortho_infection = pd.read_table("Rhimi59_ort_DE_gra.txt")

In [4]:
# Display the loaded table to inspect its columns and values.
ortho_infection

Unnamed: 0,Orthogroup_x_x,Rhimi59_x,Orthogroup_y,Rhimi59_y,log2(FC)_x,log2(FC)_y,P-adj_x,P-adj_y,Unnamed: 8
0,OG0000048,1901119,OG0000019,1901121,-0.59312,0.270961,1.3099999999999999e-30,1.37e-05,
1,OG0000048,1901136,OG0000019,1901131,0.686616,-0.213007,4.15e-11,0.000522692,
2,OG0000048,1927959,OG0000019,1927962,0.592004,-0.645474,2.46e-19,9.629999999999999e-36,
3,OG0001043,1885232,OG0001041,1974744,-1.333756,1.751431,1.52e-08,1.03e-07,
4,OG0001043,1888477,OG0001041,1888485,0.501867,-0.975509,0.120106,0.04099566,
5,OG0001380,1893422,OG0000018,1893414,-1.15098,-1.499119,0.04984538,0.1860705,scaffold_4:1817204-1821649
6,OG0001380,1837685,OG0000018,1928016,0.438574,1.311223,0.02654237,6.33e-08,scaffold_19:309813-314196
7,OG0000099,1874950,OG0000182,1874958,-0.871006,-0.662782,2.88e-05,0.1916987,
8,OG0000099,1913554,OG0000182,1913543,1.769504,-0.892845,0.000107266,0.1319259,
9,OG0000116,1919097,OG0000096,1919106,0.973317,-0.238753,0.004343547,0.6997195,


**Con el código inferior se obtienen los genes vecinos (H2H) con el mismo signo en log2FC. Esto significa que probablemente están co-regulados.**


In [12]:

df = ortho_infection
# Keep the rows whose Orthogroup_x_x value occurs more than once in the table.
df_orthogroup_x = df[df["Orthogroup_x_x"].duplicated(keep=False)]
# Keep the rows whose Orthogroup_y value occurs more than once in the table.
df_orthogroup_y = df[df["Orthogroup_y"].duplicated(keep=False)]

# A strictly positive product means log2(FC)_x and log2(FC)_y share the SAME sign
# (both positive or both negative). Rows where either value is 0 or NaN are dropped.
df_orthogroup_x_mismo_signo = df_orthogroup_x[
    df_orthogroup_x["log2(FC)_x"] * df_orthogroup_x["log2(FC)_y"] > 0
]
df_orthogroup_y_mismo_signo = df_orthogroup_y[
    df_orthogroup_y["log2(FC)_y"] * df_orthogroup_y["log2(FC)_x"] > 0
]

# Backward-compatible aliases: the original names said "opuestos" (opposite)
# although the filter has always selected same-sign pairs.
df_orthogroup_x_opuestos = df_orthogroup_x_mismo_signo
df_orthogroup_y_opuestos = df_orthogroup_y_mismo_signo

# Print the results
print("Orthgroup_x que aparecen más de una vez y tienen log2(FC) con el mismo signo:")
print(df_orthogroup_x_mismo_signo)

# Print the results
print("Orthgroup_y que aparecen más de una vez y tienen log2(FC) con el mismo signo:")
print(df_orthogroup_y_mismo_signo)

Orthgroup_x que aparecen más de una vez y tienen log2(FC) con signos opuestos:
   Orthogroup_x_x  Rhimi59_x Orthogroup_y  Rhimi59_y  log2(FC)_x  log2(FC)_y  \
5       OG0001380    1893422    OG0000018    1893414   -1.150980   -1.499119   
6       OG0001380    1837685    OG0000018    1928016    0.438574    1.311223   
7       OG0000099    1874950    OG0000182    1874958   -0.871006   -0.662782   
10      OG0000116    1921031    OG0000096    1921029   -0.978686   -1.868666   

     P-adj_x       P-adj_y                  Unnamed: 8  
5   0.049845  1.860705e-01  scaffold_4:1817204-1821649  
6   0.026542  6.330000e-08   scaffold_19:309813-314196  
7   0.000029  1.916987e-01                         NaN  
10  0.026279  3.400000e-07                         NaN  
Orthgroup_y que aparecen más de una vez y tienen log2(FC) con signos opuestos:
   Orthogroup_x_x  Rhimi59_x Orthogroup_y  Rhimi59_y  log2(FC)_x  log2(FC)_y  \
5       OG0001380    1893422    OG0000018    1893414   -1.150980   -1.499119

**En el siguiente código se van a obtener los grupos de ortogrupos que estén repetidos y se va a comprobar, para cada ortogrupo repetido, si hay al menos un gen con signo positivo y otro con signo negativo. Esto se va a hacer tanto con Orthogroup_x_x como con Orthogroup_y.**
**Con ello, se pretende identificar, como se ha comentado anteriormente, si hay un par específico de levadura y otro de micelio.**

Se seleccionan los grupos de ortogrupos repetidos en los que al menos un gen tiene un log2(FC)_x positivo y al menos otro gen tiene un log2(FC)_x negativo. Si no se cumple esta condición para log2(FC)_x, se comprueba si al menos un gen tiene un log2(FC)_y positivo y al menos otro gen tiene un log2(FC)_y negativo.

In [18]:
# Rows whose (Orthogroup_x_x, Orthogroup_y) combination occurs more than once.
df_repetidos = df[df.duplicated(subset=['Orthogroup_x_x', 'Orthogroup_y'], keep=False)]
print(df_repetidos)


def _has_both_signs(values):
    """Return True when `values` holds at least one positive and one negative entry."""
    return bool((values > 0).any() and (values < 0).any())


# Keep each repeated orthogroup pair in which log2(FC)_x takes both signs across
# its rows or, failing that, log2(FC)_y does.
# The groups are gathered in a list and concatenated once: growing a frame with
# pd.concat inside the loop is quadratic, and seeding it with an empty frame
# triggers pandas' FutureWarning about empty entries in dtype determination.
grupos_seleccionados = [
    data
    for _, data in df_repetidos.groupby(['Orthogroup_x_x', 'Orthogroup_y'])
    if _has_both_signs(data['log2(FC)_x']) or _has_both_signs(data['log2(FC)_y'])
]

# With no qualifying group, fall back to an empty frame that keeps the input columns.
if grupos_seleccionados:
    df_resultado = pd.concat(grupos_seleccionados)
else:
    df_resultado = pd.DataFrame(columns=df.columns)

# Print the result
print("Grupos de ortogrupos repetidos con al menos un gen con log2(FC) positivo y otro negativo:")
print(df_resultado)



   Orthogroup_x_x  Rhimi59_x Orthogroup_y  Rhimi59_y  log2(FC)_x  log2(FC)_y  \
0       OG0000048    1901119    OG0000019    1901121   -0.593120    0.270961   
1       OG0000048    1901136    OG0000019    1901131    0.686616   -0.213007   
2       OG0000048    1927959    OG0000019    1927962    0.592004   -0.645474   
3       OG0001043    1885232    OG0001041    1974744   -1.333756    1.751431   
4       OG0001043    1888477    OG0001041    1888485    0.501867   -0.975509   
5       OG0001380    1893422    OG0000018    1893414   -1.150980   -1.499119   
6       OG0001380    1837685    OG0000018    1928016    0.438574    1.311223   
7       OG0000099    1874950    OG0000182    1874958   -0.871006   -0.662782   
8       OG0000099    1913554    OG0000182    1913543    1.769504   -0.892845   
9       OG0000116    1919097    OG0000096    1919106    0.973317   -0.238753   
10      OG0000116    1921031    OG0000096    1921029   -0.978686   -1.868666   
12      OG0000226    1888468    OG000076

In [29]:
# Give the header-less ninth input column ("Unnamed: 8") the name "location".
df_resultado = df_resultado.rename({"Unnamed: 8": "location"}, axis="columns")
df_resultado

Unnamed: 0,Orthogroup_x_x,Rhimi59_x,Orthogroup_y,Rhimi59_y,log2(FC)_x,log2(FC)_y,P-adj_x,P-adj_y,location
0,OG0000048,1901119,OG0000019,1901121,-0.59312,0.270961,1.3099999999999999e-30,1.37e-05,
1,OG0000048,1901136,OG0000019,1901131,0.686616,-0.213007,4.15e-11,0.000522692,
2,OG0000048,1927959,OG0000019,1927962,0.592004,-0.645474,2.46e-19,9.629999999999999e-36,
7,OG0000099,1874950,OG0000182,1874958,-0.871006,-0.662782,2.88e-05,0.1916987,
8,OG0000099,1913554,OG0000182,1913543,1.769504,-0.892845,0.000107266,0.1319259,
9,OG0000116,1919097,OG0000096,1919106,0.973317,-0.238753,0.004343547,0.6997195,
10,OG0000116,1921031,OG0000096,1921029,-0.978686,-1.868666,0.02627882,3.4e-07,
12,OG0000226,1888468,OG0000765,1888471,-0.24426,1.015805,0.2987229,0.05815634,
13,OG0000226,1885225,OG0000765,1885230,0.985159,-2.321667,0.06490424,0.01926029,
3,OG0001043,1885232,OG0001041,1974744,-1.333756,1.751431,1.52e-08,1.03e-07,


In [24]:
# Load the gene annotation table and cast both coordinate columns to int in one step.
gff_Rhimi59 = pd.read_csv("Rhimi59_gff.csv").astype({"start": int, "end": int})
gff_Rhimi59

Unnamed: 0,scaffold,start,end,strand,proteinId
0,scaffold_1,22013,23513,-,1000549
1,scaffold_1,40721,41911,+,1000878
2,scaffold_1,63758,65051,+,1001137
3,scaffold_1,94819,96202,+,1001872
4,scaffold_1,94512,95348,-,1001892
...,...,...,...,...,...
10695,scaffold_20,39831,40916,-,2004246
10696,scaffold_53,18492,19402,-,1990294
10697,scaffold_53,13326,14276,-,2012776
10698,scaffold_27,25437,26372,-,2004718


In [30]:
# Attach the annotation row (scaffold, start, end, strand) of the first gene of each pair.
grupos_gff = df_resultado.merge(
    gff_Rhimi59,
    how="inner",
    left_on="Rhimi59_x",
    right_on="proteinId",
)
grupos_gff

Unnamed: 0,Orthogroup_x_x,Rhimi59_x,Orthogroup_y,Rhimi59_y,log2(FC)_x,log2(FC)_y,P-adj_x,P-adj_y,location,scaffold,start,end,strand,proteinId
0,OG0000048,1901119,OG0000019,1901121,-0.59312,0.270961,1.3099999999999999e-30,1.37e-05,,scaffold_6,802608,803107,-,1901119
1,OG0000048,1901136,OG0000019,1901131,0.686616,-0.213007,4.15e-11,0.000522692,,scaffold_6,807690,808055,+,1901136
2,OG0000048,1927959,OG0000019,1927962,0.592004,-0.645474,2.46e-19,9.629999999999999e-36,,scaffold_19,285702,286314,-,1927959
3,OG0000099,1874950,OG0000182,1874958,-0.871006,-0.662782,2.88e-05,0.1916987,,scaffold_1,1306981,1309414,-,1874950
4,OG0000099,1913554,OG0000182,1913543,1.769504,-0.892845,0.000107266,0.1319259,,scaffold_10,159395,161542,+,1913554
5,OG0000116,1919097,OG0000096,1919106,0.973317,-0.238753,0.004343547,0.6997195,,scaffold_12,349982,351877,-,1919097
6,OG0000116,1921031,OG0000096,1921029,-0.978686,-1.868666,0.02627882,3.4e-07,,scaffold_13,144489,146698,+,1921031
7,OG0000226,1888468,OG0000765,1888471,-0.24426,1.015805,0.2987229,0.05815634,,scaffold_3,2057380,2059026,-,1888468
8,OG0000226,1885225,OG0000765,1885230,0.985159,-2.321667,0.06490424,0.01926029,,scaffold_3,593744,595590,-,1885225
9,OG0001043,1885232,OG0001041,1974744,-1.333756,1.751431,1.52e-08,1.03e-07,,scaffold_3,596968,598784,-,1885232


In [31]:
# Attach the annotation row of the second gene of each pair; the annotation columns
# already present from the first merge receive the "_x" suffix and the new ones "_y".
grupos_gff = grupos_gff.merge(
    gff_Rhimi59,
    how="inner",
    left_on="Rhimi59_y",
    right_on="proteinId",
    suffixes=("_x", "_y"),
)
grupos_gff

Unnamed: 0,Orthogroup_x_x,Rhimi59_x,Orthogroup_y,Rhimi59_y,log2(FC)_x,log2(FC)_y,P-adj_x,P-adj_y,location,scaffold_x,start_x,end_x,strand_x,proteinId_x,scaffold_y,start_y,end_y,strand_y,proteinId_y
0,OG0000048,1901119,OG0000019,1901121,-0.59312,0.270961,1.3099999999999999e-30,1.37e-05,,scaffold_6,802608,803107,-,1901119,scaffold_6,803527,804101,+,1901121
1,OG0000048,1901136,OG0000019,1901131,0.686616,-0.213007,4.15e-11,0.000522692,,scaffold_6,807690,808055,+,1901136,scaffold_6,806622,807289,-,1901131
2,OG0000048,1927959,OG0000019,1927962,0.592004,-0.645474,2.46e-19,9.629999999999999e-36,,scaffold_19,285702,286314,-,1927959,scaffold_19,286694,287089,+,1927962
3,OG0000099,1874950,OG0000182,1874958,-0.871006,-0.662782,2.88e-05,0.1916987,,scaffold_1,1306981,1309414,-,1874950,scaffold_1,1309476,1310408,+,1874958
4,OG0000099,1913554,OG0000182,1913543,1.769504,-0.892845,0.000107266,0.1319259,,scaffold_10,159395,161542,+,1913554,scaffold_10,158364,159267,-,1913543
5,OG0000116,1919097,OG0000096,1919106,0.973317,-0.238753,0.004343547,0.6997195,,scaffold_12,349982,351877,-,1919097,scaffold_12,351976,353040,+,1919106
6,OG0000116,1921031,OG0000096,1921029,-0.978686,-1.868666,0.02627882,3.4e-07,,scaffold_13,144489,146698,+,1921031,scaffold_13,143391,144463,-,1921029
7,OG0000226,1888468,OG0000765,1888471,-0.24426,1.015805,0.2987229,0.05815634,,scaffold_3,2057380,2059026,-,1888468,scaffold_3,2059080,2059998,+,1888471
8,OG0000226,1885225,OG0000765,1885230,0.985159,-2.321667,0.06490424,0.01926029,,scaffold_3,593744,595590,-,1885225,scaffold_3,595988,597594,+,1885230
9,OG0001043,1885232,OG0001041,1974744,-1.333756,1.751431,1.52e-08,1.03e-07,,scaffold_3,596968,598784,-,1885232,scaffold_3,598856,600403,+,1974744


En el dataframe grupos_gff, se va a añadir en la columna location la información obtenida a partir de las columnas scaffold_x, start_x, start_y, end_x y end_y. Para cada fila se comprueba lo siguiente:
  si start_x es mayor que start_y, se añade a la columna location el valor scaffold_x:start_y-end_x
  y si start_x es menor o igual que start_y, se añade a la columna location el valor scaffold_x:start_x-end_y

In [34]:
# Define the function that fills in the 'location' column
def update_location(row):
    """Return a copy of `row` whose 'location' spans both genes of the pair.

    The value is formatted as "<scaffold_x>:<start>-<end>", where <start> is the
    smaller of start_x/start_y and <end> is the larger of end_x/end_y. For pairs
    where neither gene extends past the other this matches the rule "compare the
    starts, take the other gene's end"; taking min/max also gives the full span
    when one gene lies inside the other or both start at the same position.

    Parameters
    ----------
    row : mapping-like with a ``copy()`` method (e.g. a pandas Series or a dict)
        Must provide 'scaffold_x', 'start_x', 'end_x', 'start_y' and 'end_y'.

    Returns
    -------
    Same type as `row`
        A copy of `row` with 'location' set; the object passed in is not modified.
    """
    # Work on a copy so the object handed in by DataFrame.apply is never mutated.
    updated = row.copy()
    span_start = min(row['start_x'], row['start_y'])
    span_end = max(row['end_x'], row['end_y'])
    # NOTE(review): only scaffold_x is used; this assumes both genes of a pair
    # sit on the same scaffold — confirm against the input data.
    updated['location'] = f"{row['scaffold_x']}:{span_start}-{span_end}"
    return updated



In [37]:
# Run update_location over every row of the DataFrame (one call per row).
grupos_gff = grupos_gff.apply(update_location, axis="columns")
# Show the resulting DataFrame
print(grupos_gff)


   Orthogroup_x_x  Rhimi59_x Orthogroup_y  Rhimi59_y  log2(FC)_x  log2(FC)_y  \
0       OG0000048    1901119    OG0000019    1901121   -0.593120    0.270961   
1       OG0000048    1901136    OG0000019    1901131    0.686616   -0.213007   
2       OG0000048    1927959    OG0000019    1927962    0.592004   -0.645474   
3       OG0000099    1874950    OG0000182    1874958   -0.871006   -0.662782   
4       OG0000099    1913554    OG0000182    1913543    1.769504   -0.892845   
5       OG0000116    1919097    OG0000096    1919106    0.973317   -0.238753   
6       OG0000116    1921031    OG0000096    1921029   -0.978686   -1.868666   
7       OG0000226    1888468    OG0000765    1888471   -0.244260    1.015805   
8       OG0000226    1885225    OG0000765    1885230    0.985159   -2.321667   
9       OG0001043    1885232    OG0001041    1974744   -1.333756    1.751431   
10      OG0001043    1888477    OG0001041    1888485    0.501867   -0.975509   
11      OG0001380    1893422    OG000001

In [38]:
# Persist the annotated pairs (the index is written too, as with the default settings).
grupos_gff.to_csv(path_or_buf="grupos_gff_infection.csv")