# Metachip Metadata preparation

In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import os

In [3]:
df_mchip=pd.DataFrame(columns=["sampleID","mchip_HGT"])

### Counting the number of HGT events for each sample

In [4]:
samples=[]
HGTs=[]

# for all the data directories
for dir in os.listdir("."):
    if dir.startswith("CM_"):
        
        # for all the sample files
        for file in os.listdir(dir):
            
            #extract sample name
            sample=file.split("_")[0]
            
            #open file
            with open(os.path.join(dir,file), "r") as fn:
                c=-1
                
                #count lines
                for line in fn:
                    c+=1
                    
                #store results
                HGTs.append(c)
            samples.append(sample)

#create a temporary dataframe
df_dataset=pd.DataFrame(columns=["sampleID","mchip_HGT"])
df_dataset["sampleID"]=samples
df_dataset["mchip_HGT"]=HGTs
df_dataset.head(1)
print(df_dataset.shape)

(541, 2)


In [5]:
# Concatenate the two dataframes 
df_mchip=pd.concat([df_mchip,df_dataset],ignore_index=True)
df_mchip.tail(5)

Unnamed: 0,sampleID,mchip_HGT
536,C16-20516-TZ,1
537,C16-20602-TZ,39
538,C16-20452-TZ,21
539,C16-20432-TZ,84
540,C16-20145-TZ,21


### Load the metadata table and integrate metachip count

In [7]:
df_metadata=pd.read_csv("../mdata_GhanaTanzania_animals_humans_seq_20231205.txt", sep="\t")

In [8]:
df_metadata=pd.merge(df_metadata,df_mchip, on="sampleID", how="right")
df_metadata.head(1)
print(df_metadata.shape)

(541, 15)


### Integrate info about species richness and sequencing depth

In [9]:
df_richness=pd.read_csv("../sample_richness.txt",sep="\t")
df_richness.head(1)
print(df_richness.shape)

(558, 2)


In [10]:
df_depth=pd.read_csv("../ghanatanzania_stats.txt",sep="\t")
df_depth=df_depth[['file_path','n_of_reads']]

# From file path extract sample name
df_depth['file_path'] = df_depth['file_path'].str.split('/').str[9]

# Change columns name
df_depth.rename(columns={"file_path":"sampleID"}, inplace=True)
df_depth.head(1)
print(df_depth.shape)

(557, 2)


In [11]:
df_metadata=pd.merge(df_metadata,df_richness, on="sampleID", how="right")
df_metadata=pd.merge(df_metadata,df_depth, on="sampleID", how="right")
df_metadata.head(5)

Unnamed: 0,sampleID,subjectID,householdID,family_role,species,date,sex,age_days,age_months,age_years,diarrhea_last24h,location,country,Dataset,mchip_HGT,richness,n_of_reads
0,C16-20292-GH,C16-20292-GH,C16-10332-GH,mother,human,19/7/17,female,9131.0,300.0,25.0,No,Agogo,Ghana,CM_ghana,16,261.0,31579677.0
1,C16-20029-GH,C16-20029-GH,C16-10056-GH,mother,human,1/7/16,female,10226.0,336.0,28.0,No,Agogo,Ghana,CM_ghana,92,447.0,71784691.0
2,C16-20245-GH,C16-20245-GH,C16-10312-GH,sibling,human,4/7/17,female,1826.0,60.0,5.0,No,Agogo,Ghana,CM_ghana,109,330.0,28717616.0
3,C16-20111-GH,C16-20111-GH,C16-10146-GH,sibling,human,17/11/16,female,4382.0,144.0,12.0,No,Agogo,Ghana,CM_ghana,124,393.0,52670192.0
4,C16-20244-GH,C16-20244-GH,C16-10312-GH,sibling,human,4/7/17,male,2191.0,72.0,6.0,No,Agogo,Ghana,CM_ghana,44,223.0,48449322.0


### Drop the entries not analyzed by metachip and save results

In [12]:
df_metadata=df_metadata.dropna(subset="mchip_HGT").reset_index(drop=True)

# save the metadata table
df_metadata.to_csv("metachip_metadata.tsv", sep="\t", index=False)
df_metadata

Unnamed: 0,sampleID,subjectID,householdID,family_role,species,date,sex,age_days,age_months,age_years,diarrhea_last24h,location,country,Dataset,mchip_HGT,richness,n_of_reads
0,C16-20292-GH,C16-20292-GH,C16-10332-GH,mother,human,19/7/17,female,9131.0,300.0,25.0,No,Agogo,Ghana,CM_ghana,16,261.0,31579677.0
1,C16-20029-GH,C16-20029-GH,C16-10056-GH,mother,human,1/7/16,female,10226.0,336.0,28.0,No,Agogo,Ghana,CM_ghana,92,447.0,71784691.0
2,C16-20245-GH,C16-20245-GH,C16-10312-GH,sibling,human,4/7/17,female,1826.0,60.0,5.0,No,Agogo,Ghana,CM_ghana,109,330.0,28717616.0
3,C16-20111-GH,C16-20111-GH,C16-10146-GH,sibling,human,17/11/16,female,4382.0,144.0,12.0,No,Agogo,Ghana,CM_ghana,124,393.0,52670192.0
4,C16-20244-GH,C16-20244-GH,C16-10312-GH,sibling,human,4/7/17,male,2191.0,72.0,6.0,No,Agogo,Ghana,CM_ghana,44,223.0,48449322.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,C16-20365-TZ,C16-20365-TZ,C16-10177-TZ,animal,goat,14/3/17,,,,,,Korogwe,Tanzania,CM_ghanatanzania_animals,133,826.0,139300084.0
535,C16-20632-TZ,C16-20632-TZ,C16-10448-TZ,animal,cow,1/3/18,,,,,,Korogwe,Tanzania,CM_ghanatanzania_animals,0,999.0,43550266.0
536,C16-20591-TZ,C16-20591-TZ,C16-10413-TZ,animal,goat,20/1/18,,,,,,Korogwe,Tanzania,CM_ghanatanzania_animals,0,465.0,9137807.0
537,C16-20409-GH,C16-20409-GH,C16-10395-GH,animal,sheep,20/11/17,,,,,,Agogo,Ghana,CM_ghanatanzania_animals,6,129.0,88082381.0


# Filling taxonomy 

In [62]:
%%bash
dataset="CM_tanzania2"
# Process each file in the directory
for file in /home/giacomo/Thesis-Internship/metachip/${dataset}/*; do
    fn=$(basename $file .txt)
    output_file="/home/giacomo/Thesis-Internship/metachip/${dataset}/${fn}.tsv"
    echo $output_file
    # Read the header line and append the new column name
    header=$(head -n 1 $file)
    echo -e "${header}\ttax1\ttax2\tdirection_tax" > $output_file

    # Process the rest of the file
    tail -n +2 $file | while IFS= read -r line; do
        SGB1=$(echo $line | awk '{print $1}' | awk -F "_" '{print $1"__"$3"__"$5}')
        SGB2=$(echo $line | awk '{print $2}' | awk -F "_" '{print$1"__"$3"__"$5}')
        SGB1_1=$(echo $line | awk '{print $7}' | awk -F "-->" '{print $1}')
        SGB2_1=$(echo $line | awk '{print $7}' | awk -F "-->" '{print $2}' | awk -F "(" '{print $1}')
        TAX1=$(grep -w $SGB1 /home/giacomo/Thesis-Internship/metachip/taxonomy/${dataset}_taxonomy.tsv | cut -f 2)
        TAX2=$(grep -w $SGB2 /home/giacomo/Thesis-Internship/metachip/taxonomy/${dataset}_taxonomy.tsv | cut -f 2) 
        TAX1_1=$(grep -w $SGB1_1 /home/giacomo/Thesis-Internship/metachip/taxonomy/${dataset}_taxonomy.tsv | cut -f 2)
        TAX2_1=$(grep -w $SGB2_1 /home/giacomo/Thesis-Internship/metachip/taxonomy/${dataset}_taxonomy.tsv | cut -f 2)
        direction="${TAX1_1}-->${TAX2_1}"
        echo -e "${line}\t${TAX1}\t${TAX2}\t${direction}" >> $output_file
    done
done


/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20030-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20031-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20040-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20054-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20055-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20058-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20065-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20067-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20080-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20081-TZ_pcofgs_detected_HGTs.tsv
/home/giacomo/Thesis-Internship/metachip/CM_tanzania2/C16-20082-TZ_pcofgs_detect

In [None]:

SGB1=$(awk '{print $7}' | awk -F "-->" '{print $1}') 
$1"__"$3"__"$5