In [1]:
from pathlib import Path
import pandas as pd

In [13]:
# Load the NCBI tracking table, reading only the columns used further down.
ncbi_columns = ['SourceSeq', 'SourceCountry', 'Clonal_complex', 'gyrA']
ncbi_metadata = pd.read_csv(
    '/media/NGS/Data_Analysis/20191114_NCBI_Neisseria_meningitidis/Neisseria_meningitidis_tracking_data.csv',
    usecols=ncbi_columns,
)

# Keep rows whose Clonal_complex text contains '44'; rows with a missing
# Clonal_complex are treated as non-matching (na=False) and dropped.
ncbi_is_cc44 = ncbi_metadata['Clonal_complex'].str.contains('44', na=False)
ncbi_metadata = ncbi_metadata.loc[ncbi_is_cc44]

In [14]:
# Load the CDC isolate sheet, keep rows whose Clonal_complex text contains
# '44', and give the result the same column names as the NCBI table.
cdc_raw = pd.read_excel(
    '/home/chen1i6c04/notebook/20191128_CDC_MiSeq/Neisseria_meningitidis/Nm_153_metadata.xlsx',
    usecols=['Key', 'Clonal_complex', 'gyrA'],
)
cdc_is_cc44 = cdc_raw['Clonal_complex'].str.contains('44', na=False)
cdc_metadata = (
    cdc_raw.loc[cdc_is_cc44]
    .rename(columns={'Key': 'SourceSeq'})
    .assign(SourceCountry='Taiwan')  # every row of this sheet is labelled Taiwan
)

In [15]:
# Stack the NCBI and CDC rows into a single table. ignore_index=True renumbers
# the rows 0..n-1; without it each input keeps its own row labels and the
# combined frame can end up with repeated index values.
metadata = pd.concat([ncbi_metadata, cdc_metadata], ignore_index=True)

In [16]:
# Directories that are scanned for per-isolate profile files, one constant per
# data source so each entry is self-describing.
ncbi_sra_profile_dir = '/media/NGS/Data_Analysis/20191114_NCBI_Neisseria_meningitidis/SRA/Profile/core-genome'
ncbi_assembly_profile_dir = '/media/NGS/Data_Analysis/20191114_NCBI_Neisseria_meningitidis/Assembly/Profile/core-genome'
cdc_profile_dir = '/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Neisseria_meningitidis/Profile'

dirpaths = (ncbi_sra_profile_dir, ncbi_assembly_profile_dir, cdc_profile_dir)

In [17]:
# Accessions that have a metadata row; a set keeps the membership test cheap.
accs = set(metadata['SourceSeq'])

# Collect one profile table per file whose name (without its last extension)
# is a known accession. Path.iterdir() yields entries in arbitrary order, so
# each listing is sorted to keep the column order of the combined profile the
# same from run to run.
pfs = []
for dirpath in dirpaths:
    for profile_file in sorted(Path(dirpath).iterdir()):
        if profile_file.stem in accs:
            # Tab-separated table; its first column becomes the row index.
            pfs.append(pd.read_csv(profile_file, sep='\t', index_col=0))

In [18]:
# Place the per-file tables side by side, aligned on their row index.
profile = pd.concat(pfs, axis='columns')
# Show (rows, columns) of the combined table as a quick sanity check.
profile.shape

(1241, 968)

In [33]:
# profile.T.to_csv('CC44_clonal_profile.txt', sep='\t')

In [8]:
import sys
# Add ../../src (relative to the working directory) to the module search path.
sys.path.append('../../src')

# Project helpers; presumably tree.py lives in ../../src — confirm.
from tree import calculate_distance, Dendrogram, to_bns

## Generate GrapeTree profile

In [25]:
# Convert the combined profile with the project helper to_bns. The variable
# name suggests the result is an integer-coded table with a 'Key' column
# (renamed in the next cell) — TODO confirm against src/tree.py.
integer_profile = to_bns(profile)

In [26]:
# Relabel the 'Key' column as '#Strain' (presumably the identifier header
# GrapeTree expects — confirm). Re-running is harmless: once renamed, there is
# no 'Key' column left to change.
integer_profile = integer_profile.rename({'Key': '#Strain'}, axis='columns')

In [27]:
# Save the profile as a tab-separated file without the row index.
grapetree_profile_path = 'CC44_grapetree_profile.txt'
integer_profile.to_csv(grapetree_profile_path, index=False, sep='\t')

In [28]:
# Copy of the metadata table with the accession column relabelled 'ID'
# (presumably the key GrapeTree matches against the profile — confirm).
grapetree_metadata = metadata.rename({'SourceSeq': 'ID'}, axis='columns')

In [29]:
# Save the metadata as a tab-separated file without the row index.
grapetree_metadata_path = 'CC44_grapetree_metadata.txt'
grapetree_metadata.to_csv(grapetree_metadata_path, index=False, sep='\t')

## Generate Newick file

In [19]:
# Distances computed from the combined profile by the project helper.
# NOTE(review): metric and output layout are defined in src/tree.py — confirm.
pdist = calculate_distance(profile)

In [20]:
# Build the project's Dendrogram from the distances. 'average' is presumably
# the linkage method (average linkage / UPGMA) — confirm in src/tree.py.
dendrogram = Dendrogram(pdist, 'average')

In [21]:
# Run the clustering step; presumably this must happen before the tree can be
# exported — confirm in src/tree.py.
dendrogram.cluster()

In [22]:
# Export the tree to 'CC44_clonal.newick' in the working directory.
dendrogram.to_newick('CC44_clonal.newick')

In [None]:
import random

In [None]:
# Dedicated, seeded generator: the sequence of colour channels drawn through
# rng() is the same after every kernel restart, and the global `random` state
# is left untouched.
COLOR_SEED = 44
_color_random = random.Random(COLOR_SEED)


def rng():
    """Return a pseudo-random colour channel value in the inclusive range 0-255."""
    return _color_random.randint(0, 255)

In [None]:
# One '#RRGGBB' colour per distinct, non-missing country. The countries are
# sorted so they are visited in a stable order instead of set iteration order,
# which can differ between interpreter sessions.
colors = {
    country: '#%02X%02X%02X' % (rng(), rng(), rng())
    for country in sorted(set(metadata['SourceCountry'].dropna()))
}

In [None]:
# Header keywords (presumably for an iTOL TREE_COLORS annotation file), each
# padded with two empty fields.
# NOTE(review): the next cell rebinds `data` to an empty list before adding
# rows, so these header rows never reach `colors_styles`.
data = [(keyword, '', '') for keyword in ('TREE_COLORS', 'SEPARATOR TAB', 'DATA')]

In [None]:
# Rows of `metadata` with a country recorded.
# NOTE(review): not referenced again in the cells visible here.
countries_color = metadata.loc[metadata['SourceCountry'].notna()]

# One (accession, 'range', colour, country) row per isolate whose country has
# a colour. Isolates with a missing country are skipped, because `colors` is
# built from the non-missing countries only.
data = [
    (accession, 'range', colors[country], country)
    for accession, country in zip(metadata['SourceSeq'], metadata['SourceCountry'])
    if country in colors
]

colors_styles = pd.DataFrame(data)

In [None]:
# Write the colour assignments as an iTOL TREE_COLORS annotation file.
# - `colors_styles` holds only the per-isolate rows, so the three header lines
#   (TREE_COLORS / SEPARATOR TAB / DATA) are written here, without padding.
# - The file carries the CC44 prefix like every other output of this notebook;
#   'CC4821' names a different clonal complex than the one filtered above.
# newline='' leaves line endings to the CSV writer, as pandas recommends when
# it is handed an already-open file object.
with open('CC44_colors_styles.txt', 'w', newline='') as handle:
    handle.write('TREE_COLORS\nSEPARATOR TAB\nDATA\n')
    colors_styles.to_csv(handle, sep='\t', index=False, header=False)

In [None]:
# Accessions of isolates that have any gyrA value recorded, one per line.
# NOTE(review): this writes to the same path as the next cell, which then
# overwrites it with the T91I-only list — confirm this cell is still wanted.
gyra_present = metadata['gyrA'].notna()
metadata.loc[gyra_present, 'SourceSeq'].to_csv('gyrA_T91I.csv', index=False, header=False)

In [None]:
# Accessions of isolates whose gyrA value is exactly 'T91I', one per line,
# written without a header or index.
gyra_is_t91i = metadata['gyrA'] == 'T91I'
metadata.loc[gyra_is_t91i, 'SourceSeq'].to_csv('gyrA_T91I.csv', index=False, header=False)