## OUTLINE

#### PART A) RESULTS FROM THE PIPELINES
1- Reading SNP vcf filenames generated as a result of the pipeline.
<br> 2- Creating a set for each dataframe with using "CHROM","POS","REF","ALT_1" columns.

#### PART B) RESULTS OF THE LL VALIDATED VCFS
1- Reading ll validated vcfs.
<br> 2- Creating a set for each dataframe with using "CHROM","POS","REF","ALT_1" columns.

#### PART C) SET OPERATIONS
1- Creating a set operations funtions. 
<br>2- Creating a intersection array from each sets. 
<br>3- Show Intersection Results in heatmap

## IMPORT SECTION

In [78]:
import os
import allel
import numpy as np # pip3 install matplotlib
import matplotlib.pyplot as plt # pip3 install matplotlib

import random
import community # can be installed with: "pip install python-louvain"
from numpy import linalg as LA
from tqdm import tqdm
import pandas as pd
from scipy import stats
from scipy.interpolate import interp1d
import time
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors
import math
import scipy.io as sio
from skimage import data, segmentation, color
from skimage.future import graph

## PART A) RESULTS FROM THE PIPELINES

### 1- Reading SNP vcf filenames generated as a result of the pipeline.

In [79]:
snp_filenames = {}
for root, directories, filenames in os.walk('vcf/snp'):
    for fn in filenames:
        if fn == '.DS_Store':
            continue;
        else:
            snp_filenames[root[8:]] = os.path.join(root,fn)

In [80]:
snp_filenames

{'bwa/varscan': 'vcf/snp/bwa/varscan/snp_SRR7890851_bwa-SRR7890850_bwa_varscan.Somatic.vcf',
 'bwa/mutect': 'vcf/snp/bwa/mutect/snp_SRR7890851_bwa-SRR7890850_bwa_mutect.vcf',
 'bwa/octopus': 'vcf/snp/bwa/octopus/snp_SRR7890851_bwa-SRR7890850_bwa_octopus.vcf',
 'bowtie/varscan': 'vcf/snp/bowtie/varscan/snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.Somatic.vcf',
 'bowtie/mutect': 'vcf/snp/bowtie/mutect/snp_SRR7890851_bowtie-SRR7890850_bowtie_mutect.vcf',
 'bowtie/octopus': 'vcf/snp/bowtie/octopus/snp_SRR7890851_bowtie-SRR7890850_bowtie_octopus.vcf'}

### 2- Creating a set for each dataframe with using "CHROM","POS","REF","ALT_1" columns.

In [81]:
sets_dict = {}
for key in snp_filenames:
    #Creating a dataframe for each vcf.
    temp = allel.vcf_to_dataframe(snp_filenames[key])
    #Creating a set for each dataframe
    sets_dict[key]= set(temp[["CHROM","POS","REF","ALT_1"]].apply(lambda row: "_".join(map(str, row)), axis=1))

In [82]:
#Example
len(sets_dict)
#sets_dict['bowtie/varscan']

6

## PART B) RESULTS OF THE LL VALIDATED VCFS

### 1- Reading ll validated vcfs.

In [83]:
ll_filenames = {}
for root, directories, filenames in os.walk('vcf/ll_ea_validated_vcfs/ll_val_data'):
    for fn in filenames:
        if fn == '.DS_Store':
            continue;
        else:
            ll_filenames[root[37:]] = os.path.join(root,fn)

In [84]:
ll_filenames

{'novo/somaticSniper': 'vcf/ll_ea_validated_vcfs/ll_val_data/novo/somaticSniper/WES_LL_1.novo.somaticSniper.vcf',
 'novo/strelka': 'vcf/ll_ea_validated_vcfs/ll_val_data/novo/strelka/WES_LL_1.novo.strelka.vcf',
 'novo/mutect': 'vcf/ll_ea_validated_vcfs/ll_val_data/novo/mutect/WES_LL_1.novo.muTect2.vcf',
 'bwa/somaticSniper': 'vcf/ll_ea_validated_vcfs/ll_val_data/bwa/somaticSniper/WES_LL_1.bwa.somaticSniper.vcf',
 'bwa/strelka': 'vcf/ll_ea_validated_vcfs/ll_val_data/bwa/strelka/WES_LL_1.bwa.strelka.vcf',
 'bwa/mutect': 'vcf/ll_ea_validated_vcfs/ll_val_data/bwa/mutect/WES_LL_1.bwa.muTect2.vcf',
 'bowtie/somaticSniper': 'vcf/ll_ea_validated_vcfs/ll_val_data/bowtie/somaticSniper/WES_LL_1.bowtie.somaticSniper.vcf',
 'bowtie/Strelka': 'vcf/ll_ea_validated_vcfs/ll_val_data/bowtie/Strelka/WES_LL_1.bowtie.strelka.vcf',
 'bowtie/mutect': 'vcf/ll_ea_validated_vcfs/ll_val_data/bowtie/mutect/WES_LL_1.bowtie.muTect2.vcf'}

### 2- Creating a set for each dataframe with using "CHROM","POS","REF","ALT_1" columns.

In [85]:
ll_sets_dict = {}
for key in ll_filenames:
    #Creating a dataframe for each vcf.
    temp = allel.vcf_to_dataframe(ll_filenames[key])
    #Creating a set for each dataframe
    ll_sets_dict[key]= set(temp[["CHROM","POS","REF","ALT_1"]].apply(lambda row: "_".join(map(str, row)), axis=1))

In [86]:
#Example
len(ll_sets_dict)
#ll_sets_dict['bowtie/mutect']

9

## PART C) SET OPERATIONS

### 1- Creating a set operations funtions.

In [95]:
#The intersection of two sets is the set of all the common elements of both the sets. 
def intersec(s1,s2):
    return len(s1.intersection(s2))

#The union of two sets is the set of all the elements of both the sets without duplicates.
def uni(s1,s2):
    return len(s1.union(s2))

#The difference between two sets is the set of all the elements in first set that are not present in the second set. 
def dif(s1,s2):
    return len(s1.difference(s2))

#Jaccard Similarity function for two sets
def js_set(s1, s2):
    intersection = len(list(set(s1).intersection(s2)))
    union = (len(s1) + len(s2)) - intersection
    return np.float64(round((float(intersection) / union) * 100) / 100);

### 2- Creating a intersection array from each sets. 

In [121]:
#Initializing a dataframe
df = pd.DataFrame(columns=["LL_VCF", "Result_VCF", "Results"], dtype = float)
#df = pd.DataFrame(columns=["LL_VCF", "Result_VCF", "Results"], dtype = ['category', 'category', 'float'])

#Initializing a count for the rows
count = 0

#Computing each intersection of ll and the pipeline vcfs by looking at their key values. 
for key1 in ll_sets_dict: 
    for key2 in sets_dict:
        df[count] = [key1, key2, js_set(ll_sets_dict[key1],sets_dict[key2])]
        count += 1
    
#Taking transpose of the df and, getting rid of the first three rows then renaming axis again.
#Consider chaging this part.
df_t = df.transpose().iloc[3: , :].set_axis(['LL_VCF', 'Result_VCF', 'Results'], axis=1, inplace=False)

#Normalizing the results values with MinMaxScaler [0,1]
#from sklearn.preprocessing import MinMaxScaler
#df['Results'] = MinMaxScaler().fit_transform(np.array(df['Results']).reshape(-1,1))

#np.float64(df['Results'])
df_t

Unnamed: 0,LL_VCF,Result_VCF,Results
0,novo/somaticSniper,bwa/varscan,0.37
1,novo/somaticSniper,bwa/mutect,0.25
2,novo/somaticSniper,bwa/octopus,0.01
3,novo/somaticSniper,bowtie/varscan,0.27
4,novo/somaticSniper,bowtie/mutect,0.28
5,novo/somaticSniper,bowtie/octopus,0.01
6,novo/strelka,bwa/varscan,0.02
7,novo/strelka,bwa/mutect,0.03
8,novo/strelka,bwa/octopus,0.05
9,novo/strelka,bowtie/varscan,0.02


### 3- Show Intersection Results in heatmap

In [123]:
df_h = df_t.pivot(columns=["LL_VCF", "Result_VCF"],values="Results")
df_h

LL_VCF,novo/somaticSniper,novo/somaticSniper,novo/somaticSniper,novo/somaticSniper,novo/somaticSniper,novo/somaticSniper,novo/strelka,novo/strelka,novo/strelka,novo/strelka,...,bowtie/Strelka,bowtie/Strelka,bowtie/Strelka,bowtie/Strelka,bowtie/mutect,bowtie/mutect,bowtie/mutect,bowtie/mutect,bowtie/mutect,bowtie/mutect
Result_VCF,bwa/varscan,bwa/mutect,bwa/octopus,bowtie/varscan,bowtie/mutect,bowtie/octopus,bwa/varscan,bwa/mutect,bwa/octopus,bowtie/varscan,...,bwa/octopus,bowtie/varscan,bowtie/mutect,bowtie/octopus,bwa/varscan,bwa/mutect,bwa/octopus,bowtie/varscan,bowtie/mutect,bowtie/octopus
0,0.37,,,,,,,,,,...,,,,,,,,,,
1,,0.25,,,,,,,,,...,,,,,,,,,,
2,,,0.01,,,,,,,,...,,,,,,,,,,
3,,,,0.27,,,,,,,...,,,,,,,,,,
4,,,,,0.28,,,,,,...,,,,,,,,,,
5,,,,,,0.01,,,,,...,,,,,,,,,,
6,,,,,,,0.02,,,,...,,,,,,,,,,
7,,,,,,,,0.03,,,...,,,,,,,,,,
8,,,,,,,,,0.05,,...,,,,,,,,,,
9,,,,,,,,,,0.02,...,,,,,,,,,,


In [124]:
ax = sns.heatmap(df_h, annot=True,cmap="YlGnBu")

TypeError: Cannot cast array data from dtype('O') to dtype('int64') according to the rule 'safe'

AttributeError: 'NoneType' object has no attribute 'reshape'

<Figure size 432x288 with 2 Axes>

In [None]:
print(len(sets_dict['bwa/varscan'].difference(sets_dict['bowtie/varscan'])))

 ### Printing all filenames in the snp directory.

In [38]:
snp_filenames = []
for File in os.listdir('vcf/snp'):
    if File == 'bwa':
            for File in os.listdir('vcf/snp/bwa/varscan'):
                #df.append((allel.vcf_to_dataframe('vcf/snp/bwa/varscan/' + File)))
                snp_filenames.append(File)
            for File in os.listdir('vcf/snp/bwa/mutect'):
                snp_filenames.append(File)
            for File in os.listdir('vcf/snp/bwa/octopus'):   
                snp_filenames.append(File)
    elif File == 'bowtie':
            for File in os.listdir('vcf/snp/bowtie/varscan'):
                snp_filenames.append(File)
            for File in os.listdir('vcf/snp/bowtie/mutect'):
                snp_filenames.append(File)
            for File in os.listdir('vcf/snp/bowtie/octopus'):   
                snp_filenames.append(File)

In [39]:
snp_filenames

['snp_SRR7890851_bwa-SRR7890850_bwa_varscan.Somatic.hc.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_varscan.LOH.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_varscan.Germline.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_varscan.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_varscan.Germline.hc.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_varscan.LOH.hc.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_varscan.Somatic.vcf',
 'snp_SRR7890851_bwa-SRR7890850_bwa_mutect.vcf',
 '.DS_Store',
 'snp_SRR7890851_bwa-SRR7890850_bwa_octopus.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.LOH.hc.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.Germline.hc.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.LOH.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.Somatic.hc.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.Somatic.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_varscan.Germline.vcf',
 'snp_SRR7890851_bowtie-SRR7890850_bowtie_m

In [None]:
fig = px.scatter(df, x= 'CHROM', y = 'POS')
fig.show()

In [None]:
for i in range(len(df.columns)):
    this_column = df.columns[i]
    for key1 in key1_t:
        for key2 in key2_t:
            df[this_column] = [key1, key2]
            #print(key1)

In [None]:
intersection_array = {} 
for key1 in sets_dict:
    for key2 in ll_sets_dict:
        if key1 == key2:
            intersection_array[key1] = intersec(sets_dict[key1], ll_sets_dict[key2])