In [1]:
import seaborn as sns
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [12, 10]

## Blocks 

In [3]:
# Load the tab-separated block table.
block = pd.read_table("inversion_block_info_all.txt", sep='\t')

# Split the correctly parsed rows by taxon accession, ordered by start coordinate.
ATCC_block = block[block.taxa=='NZ_CP009072'].sort_values(by='start').reset_index(drop=True)
BW25113_block = block[block.taxa=='NZ_CP009273'].sort_values(by='start').reset_index(drop=True)
K12DH_block = block[block.taxa=='NC_010473'].sort_values(by='start').reset_index(drop=True)
K12MG_block = block[block.taxa=='U00096'].sort_values(by='start').reset_index(drop=True)

# Repair rows that were not parsed properly.
# NOTE(review): this assumes the four valid accessions are the first four
# distinct `taxa` values in file order; everything after them is treated as a
# mis-parsed row -- confirm against the input file.
fix = block.taxa.unique()[4:]
# Select all affected rows at once (a copy, so the edits below never touch
# `block`) instead of growing a frame with the removed DataFrame.append.
misparsed = block[block.taxa.isin(fix)].copy()

# In these rows the `block` field has the accession glued to it; split on 'NZ'
# to recover the block id (left part) and the accession remainder (right part).
misparsed['block2'] = misparsed['block'].apply(lambda x : x.split('NZ')[1])
misparsed['block'] = misparsed['block'].apply(lambda x : x.split('NZ')[0])

# Every remaining value sits one column to the left of where it belongs:
# drop the old `inversion` column and relabel the others accordingly.
misparsed = (
    misparsed
    .drop(columns=['inversion'])
    .rename(columns={
        'taxa': 'start',
        'start': 'end',
        'end': 'rev_comp',
        'rev_comp': 'inversion',
        'block2': 'taxa',
    })
    # Restore the 'NZ' prefix that the split removed from the accession.
    .assign(taxa=lambda d: d['taxa'].apply(lambda x : 'NZ'+x))
    [['block', 'taxa', 'start', 'end', 'rev_comp', 'inversion']]
)

# Add the repaired rows to the matching per-strain tables.
ATCC_block = pd.concat([ATCC_block, misparsed.loc[misparsed['taxa']=="NZ_CP009072"]])
BW25113_block = pd.concat([BW25113_block, misparsed.loc[misparsed['taxa']=="NZ_CP009273"]])

# The repaired `start` values come from a column that also held accession
# strings, so convert them to numbers before re-sorting.
ATCC_block['start'] = pd.to_numeric(ATCC_block['start'])
ATCC_block = ATCC_block.sort_values(by='start').reset_index(drop=True)

BW25113_block['start'] = pd.to_numeric(BW25113_block['start'])
BW25113_block = BW25113_block.sort_values(by='start').reset_index(drop=True)

In [4]:
# Tag each per-strain table (in place) with its short strain label, then stack
# the four tables into one frame.
strain_frames = {
    'ATCC': ATCC_block,
    'BW25113': BW25113_block,
    'K12DH': K12DH_block,
    'K12MG': K12MG_block,
}
for strain_label, strain_frame in strain_frames.items():
    strain_frame['strain'] = strain_label
block_info = pd.concat(list(strain_frames.values()))

In [5]:
block_info.columns

Index(['block', 'taxa', 'start', 'end', 'rev_comp', 'inversion', 'strain'], dtype='object')

In [6]:
# Sum of rev_comp per (block, strain). The aggregator is named by string:
# passing the builtin `sum` is deprecated in recent pandas.
inver = block_info.groupby(['block','strain']).aggregate({'rev_comp':'sum'})

# 'count' = total rev_comp over all strains of the same block.
inver['count'] = inver.groupby('block')['rev_comp'].transform('sum')

In [7]:
# rev_comp totals per strain, broken down by the per-block 'count'.
# aggfunc is given as the string 'sum'; passing np.sum is deprecated in recent pandas.
table = pd.pivot_table(inver, index=['strain','count'], aggfunc='sum')
table

Unnamed: 0_level_0,Unnamed: 1_level_0,rev_comp
strain,count,Unnamed: 2_level_1
ATCC,0,0
ATCC,1,705
ATCC,2,4
ATCC,3,2
BW25113,0,0
BW25113,1,0
BW25113,2,4
BW25113,3,2
K12DH,0,0
K12DH,1,0


In [8]:
# NOTE(review): `gene_num` is only created in a later cell (from `df2`), so on
# a fresh kernel this cell raises NameError. The same three lines appear again
# further down, after `gene_num` exists; this early copy should be moved or removed.
rev_comp = gene_num.groupby(['block','strain']).aggregate({'rev_comp':sum})
rev_comp['count'] = rev_comp.groupby(['block'])['rev_comp'].transform('sum')
pd.pivot_table(rev_comp, index=['strain','count'], aggfunc=np.sum)

NameError: name 'gene_num' is not defined

In [None]:
table['rev_comp'].sum()

In [None]:
block_info['rev_comp'].sum()

## With gene expression

In [None]:
df = pd.read_csv("Sample_final_df.csv")

In [None]:
# Use gene_id as the gene name, falling back to locus_tag where gene_id is missing.
missing_gene_id = df['gene_id'].isnull()
df['gene_name'] = np.where(missing_gene_id, df['locus_tag'], df['gene_id'])

In [None]:
df2 = df.iloc[:,0:2]

In [None]:
df.columns

In [None]:
# Columns kept for the block / gene analysis below.
analysis_columns = ['block','start','end','rev_comp','inversion','strain',
                    'gene_name','gbk_start','gbk_midpoint','gbk_end','norm_exp']
# Take an explicit copy so adding a column cannot trigger SettingWithCopyWarning
# and never writes back into `df`.
df2 = df.loc[:, analysis_columns].copy()
# Block length = end - start.
df2['block_length'] = df2['end'] - df2['start']

In [None]:
# Row counts: all rows, rows with inversion == 1, rows with rev_comp == 1.
n_genes = len(df2)
n_in_inverted_blocks = len(df2[df2['inversion'] == 1])
n_rev_comp = len(df2[df2['rev_comp'] == 1])
print('number of genes:', n_genes,
      '\n genes inside inverted blocks:', n_in_inverted_blocks,
      '\n genes inside inverted block by strains:', n_rev_comp)

In [None]:
df2.head()

In [None]:
# Number of (non-null) gene names per block, largest first.
genes_per_block = pd.pivot_table(df2, index=['block'], aggfunc={'gene_name': 'count'})
table = genes_per_block.sort_values(by='gene_name', ascending=False)
# Distribution of genes per block (related to the length of the block).
table[['gene_name']].describe()

In [None]:
# Same gene-per-block count, restricted to rows with inversion == 1.
inverted_rows = df2[df2['inversion'] == 1]
genes_per_inverted_block = pd.pivot_table(
    inverted_rows, index=['block'], aggfunc={'gene_name': 'count'}
)
table1 = genes_per_inverted_block.sort_values(by='gene_name', ascending=False)
# Distribution of genes per inverted block (related to the length of the block).
table1[['gene_name']].describe()

In [None]:
# Gene count per (block, strain, rev_comp), ordered by block descending.
genes_per_block_strain = pd.pivot_table(
    df2,
    index=['block', 'strain', 'rev_comp'],
    aggfunc={'gene_name': 'count'},
)
table2 = genes_per_block_strain.sort_values(by='block', ascending=False)
table2.head(n=20)

## Number of strains in each block

In [None]:
# How many blocks are present in 1, 2, 3, ... distinct strains.
strains_per_block = df2.groupby('block')['strain'].nunique()
strains_per_block.value_counts()

In [None]:
# 628 blocks in total. The four addends are presumably the value counts from
# the cell above -- verify if the data changes.
# Select 'block' by name: `df2.nunique()[0]` used an integer key as a position
# on a label-indexed Series, which is deprecated and breaks on column reordering.
df2['block'].nunique() == 465+107+31+25 == 628

### Number of genes

In [None]:
df2.head()

In [None]:
# One row per block: distinct strains, gene count, and the block's first
# inversion flag and block length.
temp = df2.groupby(['block']).aggregate({'strain':'nunique','gene_name':'count', 'inversion':'first',
                                        'block_length':'first'})
# Summarise blocks by how many strains they occur in. Aggregators are named by
# string because passing the builtin `sum` is deprecated in recent pandas.
temp.groupby(['strain']).aggregate({'strain':'count','gene_name':'sum', 'inversion':'sum', 'block_length':'mean'})

In [None]:
## remove block w single strain 
## 2 & 3 strains 

In [None]:
# Per strain-count group: number of blocks, mean genes per block, and summed
# inversion flags ('sum' as a string; the builtin `sum` is deprecated here).
temp.groupby(['strain']).aggregate({'strain':'count','gene_name':'mean', 'inversion':'sum'})

In [None]:
#subset the blocks and look at average space 

In [None]:
temp.head()

In [None]:
# One row per (block, strain): the first start/end/rev_comp/inversion values
# and the number of gene names, plus the block span (end - start).
per_block_strain = df2.groupby(['block', 'strain'])
gene_num = (
    per_block_strain
    .aggregate({
        'start': 'first',
        'end': 'first',
        'rev_comp': 'first',
        'inversion': 'first',
        'gene_name': 'count',
    })
    .reset_index()
    .assign(block_space=lambda d: d['end'] - d['start'])
)

In [None]:
sns.set(style="whitegrid")
# Gene count against block span for every (block, strain), coloured by strain.
# TODO: change to a line graph with confidence intervals.
sns.scatterplot(
    data=gene_num,
    x="gene_name",
    y="block_space",
    hue="strain",
    alpha=0.8,
    linewidth=0,
)

In [None]:
# Line-plot version of the gene count vs. block span relationship, one line per strain.
sns.lineplot(
    data=gene_num,
    x="gene_name",
    y="block_space",
    hue="strain",
    alpha=0.5,
)

In [None]:
gene_num['block_space'].describe()

In [None]:
# Per-strain subsets of the (block, strain) summary.
ATCC = gene_num[gene_num['strain']=='ATCC']
BW = gene_num[gene_num['strain']=='BW25113']
K12DH = gene_num[gene_num['strain']=='K12DH']
K12MG = gene_num[gene_num['strain']=='K12MG']

# Density of block span per strain. `sns.distplot` is deprecated (and removed
# in newer seaborn); `kdeplot` draws the same curve as `distplot(hist=False)`.
for strain_label, strain_rows in [('ATCC', ATCC), ('BW25113', BW),
                                  ('K12DH', K12DH), ('K12MG', K12MG)]:
    sns.kdeplot(strain_rows['block_space'], label=strain_label)
# kdeplot only registers the labels; draw the legend explicitly.
plt.legend();

### Sort gene by order 

In [None]:
#sort by midpoint 
df3 = df2.sort_values(by=['block','strain','gbk_midpoint']).reset_index(drop=True)
#put all genes into one row
df3['genes']= df3.groupby(['block','strain']).transform(lambda x:','.join(x))
#groupby block and strain
df3 = df3.groupby(['block','strain']).aggregate({'genes':'first','rev_comp':'first'}).reset_index()
#split genes into multiple colns
genes = pd.DataFrame(df3['genes'].str.split(',',expand=True).values)
#combines 
genes_sort = pd.concat([df3,genes],axis=1)

del genes_sort['genes']

In [None]:
genes_sort.head(n=20)

### blocks

In [None]:
gene_num.head()

In [None]:
#histogram length of blocks: inverted vs non inverted
#table # of inversion & rev_comp within strain
#expression: inverted vs non-inverted && rev-comp
#spatial position of the strain

In [None]:
# norm_exp per strain, split by the inversion flag.
sns.boxplot(
    data=df2,
    x="strain",
    y="norm_exp",
    hue="inversion",
)

In [None]:
# norm_exp per strain, split by the rev_comp flag.
sns.boxplot(
    data=df2,
    x="strain",
    y="norm_exp",
    hue="rev_comp",
)

In [None]:
# Density of norm_exp. `sns.distplot` is deprecated (removed in newer seaborn);
# `kdeplot` is the replacement for `distplot(hist=False)`.
# NOTE(review): `df2` is not filtered by strain here, so the 'K12MG' label may
# be a copy-paste leftover -- confirm the intended label.
sns.kdeplot(df2['norm_exp'], label='K12MG')
plt.legend();

In [None]:
df2.groupby(['strain','inversion'])['norm_exp'].describe()

In [None]:
df2.groupby(['strain','rev_comp'])['norm_exp'].describe()

In [None]:
## about blocks

In [None]:
# Sum of rev_comp per (block, strain), plus the per-block total over strains.
# Aggregators are given as strings: the builtin `sum` and `np.sum` are
# deprecated as groupby / pivot_table aggregators in recent pandas.
rev_comp = gene_num.groupby(['block','strain']).aggregate({'rev_comp':'sum'})
rev_comp['count'] = rev_comp.groupby(['block'])['rev_comp'].transform('sum')
pd.pivot_table(rev_comp, index=['strain','count'], aggfunc='sum')

In [None]:
df['block'].nunique()

In [None]:
gene_num['rev_comp'].sum()

In [None]:
# norm_exp for rev_comp == 0 vs. rev_comp == 1, all strains pooled.
sns.boxplot(
    data=df2,
    x="rev_comp",
    y="norm_exp",
)

## inversion position

In [None]:
df2.head()

In [None]:
# gbk_midpoint of every row per strain, coloured by the inversion flag.
sns.scatterplot(
    data=df2,
    x="gbk_midpoint",
    y="strain",
    hue="inversion",
    alpha=0.8,
    linewidth=0,
)

In [None]:
# gbk_midpoint of every row per strain, coloured by the rev_comp flag.
sns.scatterplot(
    data=df2,
    x="gbk_midpoint",
    y="strain",
    hue="rev_comp",
    alpha=0.8,
    linewidth=0,
)

In [None]:
# Render floats with the '{:f}' format (fixed-point, six decimals).
# This is a global pandas display option, so it also applies to every later cell.
pd.options.display.float_format = '{:f}'.format
df2['gbk_midpoint'].describe()

In [None]:
# Rows whose gbk_midpoint is at or below 2,500,000.
in_front_half = df2['gbk_midpoint'] <= 2500000
front = df2[in_front_half]

In [None]:
# Rows whose gbk_midpoint is above 2,500,000.
in_back_half = df2['gbk_midpoint'] > 2500000
back = df2[in_back_half]

In [None]:
# Inversion flag by position for the `front` subset.
# TODO: draw the points as open circles.
sns.scatterplot(
    data=front,
    x="gbk_midpoint",
    y="strain",
    hue="inversion",
    alpha=0.2,
    linewidth=0,
)

In [None]:
# Inversion flag by position for the `back` subset.
sns.scatterplot(
    data=back,
    x="gbk_midpoint",
    y="strain",
    hue="inversion",
    alpha=0.8,
    linewidth=0,
)