In [1]:
import os
import sys
import pandas as pd

In [2]:
# Import Fedorova ARSA table
path = "/Users/annamontaner/Documents/BSC3/scratch/cli79/cli79334/projects/other/AFR_ARSA_FedorovaL/01_vcf_generation"
imported = pd.read_csv(os.path.join(path,"input_data", "ARSAtableAFR.tsv"), header=None, sep="\t",usecols=range(0,15))

In [None]:
# Check rows and columns of the imported data
imported.shape[0]

In [4]:
# Copy imported dataframe
hg19 = imported.copy()

# Add header
header = [ 'ID','Simons_AFR','Simons_OTHER1','Simons_OTHER2','Simons_OTHER3','CHROM_hg19','POS_hg19','rsID','REF','ALT','1000Gp3_AFR','1000Gp3_EUR','1000Gp3_EAS','1000Gp3_AMR','1000Gp3_SAS']
hg19.columns = header

In [5]:
# Build columns required for vcf
hg19['QUAL'] = "."
hg19['FILTER'] = "NONE"
hg19['INFO'] = 'Simons_AFR='+hg19['Simons_AFR'].astype(str)+";Simons_OTHER1="+hg19['Simons_OTHER1'].astype(str)+";Simons_OTHER2="+hg19['Simons_OTHER2'].astype(str)+";Simons_OTHER3="+hg19['Simons_OTHER3'].astype(str)+";1000Gp3_AFR="+hg19['1000Gp3_AFR'].astype(str)+";1000Gp3_EUR="+hg19['1000Gp3_EUR'].astype(str)+";1000Gp3_EAS="+hg19['1000Gp3_EAS'].astype(str)+";1000Gp3_AMR="+hg19['1000Gp3_AMR'].astype(str)+";1000Gp3_SAS="+hg19['1000Gp3_SAS'].astype(str)

In [6]:
# Generate BED file for coordinate lift-over
bed_tmp_hg19 = hg19[['CHROM_hg19','POS_hg19','ID']]
bed_tmp_hg19['CHROM_hg19'] = bed_tmp_hg19['CHROM_hg19'].replace('CHR','chr', regex=True)
bed_tmp_hg19['POS0_hg19'] = bed_tmp_hg19['POS_hg19'] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bed_tmp_hg19['CHROM_hg19'] = bed_tmp_hg19['CHROM_hg19'].replace('CHR','chr', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bed_tmp_hg19['POS0_hg19'] = bed_tmp_hg19['POS_hg19'] - 1


In [7]:
bed_hg19 = bed_tmp_hg19[['CHROM_hg19','POS0_hg19','POS_hg19','ID']]

# Export hg19 BED file 
bed_hg19.to_csv(os.path.join(path, 'input_data', 'bed_hg19.tsv'), sep="\t", header=False, index=False)


This file is uploaded to the UCSC Genome LiftOver Tool

In [8]:
# Import hg38 BED file
imported_bed_hg38 = pd.read_csv(os.path.join(path, 'input_data', 'bed_hg38.tsv'),sep="\t")

# Select only variants with correct liftover
bed_hg38 = imported_bed_hg38[imported_bed_hg38['hg38_liftover']=='ok']

In [9]:
imported_bed_hg38.shape[0]

77820

In [10]:
bed_hg38.shape[0]

77796

In [13]:
# Join columns of interest in hg19 dataframe 
merged_hg38_hg19 = pd.merge(bed_hg38, hg19[['ID','REF','ALT','QUAL','FILTER','INFO']], on='ID')
merged_hg38_hg19['#CHROM'] = merged_hg38_hg19['#CHROM'].replace('chr','',regex=True)


In [14]:
# Generate ARSA AFR .tsv for hg38
ARSAtableAFR_hg38 = merged_hg38_hg19[['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO']]

In [15]:
# Sort values
ARSAtableAFR_hg38_sorted = ARSAtableAFR_hg38.sort_values(by=['#CHROM', 'POS'], key=lambda x: x.astype(int))

In [16]:
# Export data
ARSAtableAFR_hg38_sorted.to_csv(os.path.join(path, 'output_data', 'ARSAtableAFR_hg38_sorted.tsv'),sep="\t", index=False)

In [17]:
#### Generate vcf for SnpEff annotation
output_VCF = os.path.join(path,"output_data","ARSAtableAFR_hg38_sorted.vcf")
input_TSV = os.path.join(path,"output_data","ARSAtableAFR_hg38_sorted.tsv")
I = open(input_TSV, "r")
O = open(output_VCF, "w")

O.write("##fileformat=VCFv4.0\n")
O.write("##FILTER=<ID=ID,Number=1,Type=String,Description=\"dbSnp id\">\n")
O.write("##INFO=<ID=Simons_AFR,Number=A,Type=Integer,Description=\"Number of counts in African populations in the Simon's dataset\">\n")
O.write("##INFO=<ID=Simons_OTHER1,Number=A,Type=Integer,Description=\"Number of counts in other population (1) in the Simon's dataset\">\n")
O.write("##INFO=<ID=Simons_OTHER2,Number=A,Type=Integer,Description=\"Number of counts in other population (2) in the Simon's dataset\">\n")
O.write("##INFO=<ID=Simons_OTHER3,Number=A,Type=Integer,Description=\"Number of counts in other population (3) in the Simon's dataset\">\n")
O.write("##INFO=<ID=1000Gp3_AFR,Number=A,Type=Integer,Description=\"Number of counts in African populations in the 1000Genomes phase3 dataset\">\n")
O.write("##INFO=<ID=1000Gp3_EUR,Number=A,Type=Integer,Description=\"Number of counts in European populations in the 1000Genomes phase3 dataset\">\n")
O.write("##INFO=<ID=1000Gp3_EAS,Number=A,Type=Integer,Description=\"Number of counts in East Asian populations in the 1000Genomes phase3 dataset\">\n")
O.write("##INFO=<ID=1000Gp3_AMR,Number=A,Type=Integer,Description=\"Number of counts in American populations in the 1000Genomes phase3 dataset\">\n")
O.write("##INFO=<ID=1000Gp3_SAS,Number=A,Type=Integer,Description=\"Number of counts in South Asian populations in the 1000Genomes phase3 dataset\">\n")
O.write("##FORMAT=<ID=NO,Number=1,Type=String,Description=\"Not used\">\n")

for iLine in I:
	list = iLine.split("\t")
	newLine = '\t'.join(list)
	O.write(newLine)

I.close()
O.close()

In [None]:
# Copy resultant file to AFR_ARSA_FedorovaL/02_snpEff_annotation/input_data for snpEff annotation