In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
%run -i '/home/ivan/scripts/test/DataScienceTools/DStools.py'

def plotColorScatter_x(DataFrame ,xvalue = '0',yvalue = '1', sizevalue = 'size', outputFilePath='/abc/test.html',plotWidth = 750, plotHeight = 750, readList = ['1','2'],titleName='tSNE', colorColumn="Category", colorPattern=viridis):
    """Draw a bokeh scatter plot coloured by a categorical column and save/show it.

    Parameters
    ----------
    DataFrame : table holding the columns named by xvalue, yvalue, colorColumn
        and every entry of readList; it is wrapped in a ColumnDataSource.
    xvalue, yvalue : str
        Column names plotted on the x and y axis (also used, lower-cased, as axis labels).
    sizevalue : str or number
        Column name or constant passed to the glyph as marker size.
    outputFilePath : str
        Target file. A path ending in 'png' is written with export_png;
        anything else is written with output_file and opened with show.
    plotWidth, plotHeight : int
        Figure size in pixels.
    readList : list
        Column names displayed in the hover tooltip (the list is only read, never mutated).
    titleName : str
        Figure title.
    colorColumn : str
        Categorical column that drives the colour mapping and the legend.
    colorPattern : callable
        Palette factory called with the number of factors; only used when there
        are more factors than colours in the fixed palette below.
    """
    def _padded(lo, hi, fraction=0.1):
        # Push each bound away from the data by a fraction of its magnitude.
        # Multiplying by 1.1 directly (the previous behaviour) moves a positive
        # minimum or a negative maximum *into* the data and clips points.
        return lo - abs(lo)*fraction, hi + abs(hi)*fraction

    # The populations (and their plotting order / colours) are fixed for this notebook.
    factors = ['CD69-DP','CD69+DP','CD4+CD8low','TCR+DP','CD4SP_immature','CD4SP_mature','CD8SP']
    fixed_palette = ['#f03b20','#feb24c','#ffeda0','#636363','#a1d99b','#31a354','#3182bd']
    if len(factors) <= len(fixed_palette):
        palette = fixed_palette
    else:
        palette = colorPattern(len(factors))
    color_map = factor_cmap(colorColumn,factors=factors,palette=palette)

    # One tooltip row per requested column: "<lower-cased name>: @<column>".
    hover = HoverTool()
    tooltip_rows = []
    for ele in readList:
        ele = str(ele)
        tooltip_rows.append("""<br><font face="Arial" size="4">%s: @%s</font>""" % (ele.lower(), ele))
    hover.tooltips = "".join(tooltip_rows)

    tools = [hover,WheelZoomTool(),PanTool(),BoxZoomTool(),ResetTool(),SaveTool()]
    source = ColumnDataSource(DataFrame)
    p = figure(plot_width = plotWidth, plot_height = plotHeight, tools=tools,title=titleName,toolbar_location='right',x_axis_label=xvalue.lower(),y_axis_label=yvalue.lower(),background_fill_color='white',title_location = 'above')
    p.title.text_font_size='15pt'
    p.title.align = 'center'
    p.xaxis.axis_label_text_font_size='12pt'
    p.yaxis.axis_label_text_font_size='12pt'
    p.x_range = Range1d(*_padded(DataFrame[xvalue].min(), DataFrame[xvalue].max()))
    p.y_range = Range1d(*_padded(DataFrame[yvalue].min(), DataFrame[yvalue].max()))
    # NOTE(review): `legend=` is the keyword of the bokeh version this notebook was
    # written for; newer bokeh releases spell it `legend_field=` - confirm before upgrading.
    p.circle(x = xvalue,y = yvalue,size=sizevalue,source=source,color=color_map,legend=colorColumn)
    p.legend.location = "top_left"
    p.toolbar.active_scroll=p.select_one(WheelZoomTool)

    if outputFilePath.endswith('png'):
        export_png(p, filename=outputFilePath)
    else:
        # Only register an HTML output target when we actually write HTML.
        output_file(outputFilePath)
        show(p)

In [2]:
#read the gene expression data (one column per gene; column names are upper-cased so that
#every gene list in this notebook can be matched case-insensitively)
logcountsFull = pd.read_csv('/home/ivan/Desktop/Project2/MyData/forPython/logNormalWithTCR.csv', sep=',')
logcountsFull.columns = [column_name.upper() for column_name in logcountsFull.columns]
#alphabetically sorted list of every column name in the expression table
full_gene = sorted(logcountsFull.columns.tolist())
#generate the M-set: hand-picked genes, upper-cased to match the expression table columns
#(the hand-written list contains a few repeated symbols; they are collapsed below)
manual_genes=['Bcl2l11','Bcl2','Bcl6','Sla','Trat1','Tespa1','Arpp21','Lck','Themis','Rorc','Rag1','Rag2','Dntt','Nr4a1',
             'Nr4a3','Egr1','Egr2','Stat1','Runx1','Runx3','Myc','Myb','Mcm3','Mcm6','Bzw2','Slc29a1','Arap2','Tuba4a','Arhgef6'
              ,'Cd7','Actn2','Plac8','Cd72','Atp8b4','Cst7','Glipr2','Stx2','Eng','Cd160','Lcn4','Tspan9',
             'Itgae','Ctsw','Nkg7','Ccr9','Cd8b1','Cd8a','Foxp3', 'Igfbp4','Tspan32','Cd4','Cd40lg','Itgb3'
             ,'Tnfrsf4','Tmem154','Zbtb7b','Tmem64','Il6ra','Tspan32','Ccr7','Ckb','Mapk11','Thy1','Lgals1','Cnp',
             'Ppm1h','Dnajc15','Epsti1','Slamf6','Smc4','Themis','Ifngr1','Ighm','Cd2','Itgb2','Eno1','Tubb5',
             'Cytip','Rgs10','Kcnn4','Samhd1','Tuba4a','Arhgef6','Ddx24','Cd69','Cd28','Cd5','Tox','Itm2a',
             'Nfatc3','Zfp36','Jun','Gata3']
#keep only genes present in the expression table; sorted() makes the order identical on
#every run (iterating a set of strings gives an order that changes between kernel sessions)
manual_genes = sorted(set(gene.upper() for gene in manual_genes).intersection(full_gene))

#assign the population information to the expression data
factors = pd.read_csv('/home/ivan/Desktop/Project2/MyData/forPython/factors_TCR.csv', sep=',')
#rows are paired by index: the row labels of the expression table and the factor table must agree
gene_df_full = pd.merge(logcountsFull[full_gene],factors['group'],left_index=True,right_index=True)
#drop the 'lowTCR_Rest' cells; .copy() gives an independent frame so the column
#assignments below (and in later cells) do not write into a slice of the merged table
gene_df_full = gene_df_full[gene_df_full['group']!='lowTCR_Rest'].copy()
#rename the sorting-gate labels to population names; labels not listed here are left unchanged
group_labels = {
    'WT1': 'CD69-DP',
    'WT2': 'CD69+DP',
    'WT3': 'CD4+CD8low',
    'WT4': 'CD4SP',
    'WT5': 'CD8SP',
    'highTCR_Rest': 'TCR+DP',
}
gene_df_full['group'] = gene_df_full['group'].replace(group_labels)

#generate the F-set: gene symbols from column 'x' of the Seurat DE-gene file, with every
#'.' turned back into '-' (NOTE(review): presumably undoing name mangling on export - confirm)
seurat_degenes = [
    symbol.replace('.','-')
    for symbol in pd.read_csv('/home/ivan/Desktop/Project2/MyData/pipeline/seurat_degene2.txt', sep='\t')['x'].tolist()
]

#generate the T-set: transcription factors collected from every comparison file in tfs_path
tfs_path = '/home/ivan/Desktop/Project2/RawData/DE_TFs/'
#sorted() because os.listdir returns entries in arbitrary order
files = sorted(tfs_path+f for f in os.listdir(tfs_path) if f.endswith('.csv') and not f.startswith('.'))
tf_frames = []
for file in files:
    temp_df = pd.read_csv(file, sep=',')
    #the two compared groups are encoded in the file name as "..._<A>WT..._vs_<B>WT..."
    name_match = re.match(r".+_(.+)WT.+_vs_(.+)WT.+",file)
    if name_match is None:
        raise ValueError('Unexpected DE_TFs file name (cannot extract the compared groups): '+file)
    groups = name_match.groups()
    temp_df['category']=groups[0]+'*'+groups[1]
    tf_frames.append(temp_df)
#concatenate once instead of growing a frame with DataFrame.append inside the loop
#(pd.concat raises ValueError when no matching file was found)
tfs_df = pd.concat(tf_frames, sort=False)
#missing symbols are dropped before upper-casing (NaN has no .upper())
tf_genes = [symbol.upper() for symbol in tfs_df['mgi_symbol'].dropna().unique().tolist()]
#keep only genes present in the expression table, in a run-independent order
tf_genes = sorted(set(tf_genes).intersection(full_gene))

#for plotting heatmaps which is not included in the thesis
#a gene counts as expressed in a cell when its value exceeds 0.1
filter_Cd8a=gene_df_full['CD8A']>0.1
filter_Cd8b1=gene_df_full['CD8B1']>0.1
filter_Cd4=gene_df_full['CD4']>0.1
#label every cell by its CD4 / CD8A expression status (CD8B1 is computed but not used for the label)
cd4cd8_labels = [
    (True, True, '4+8+'),
    (True, False, '4+8-'),
    (False, True, '4-8+'),
    (False, False, '4-8-'),
]
for cd4_expressed, cd8_expressed, status_label in cd4cd8_labels:
    status_mask = (filter_Cd4 == cd4_expressed) & (filter_Cd8a == cd8_expressed)
    gene_df_full.loc[status_mask,'cd4cd8']=status_label

#code used once to draw 100 distinct random genes from full_gene (kept commented out for provenance):
# rand_gene=[]
# i=0
# while i<100:
#     rand_gene.append(full_gene[random.randint(0,len(full_gene)-1)])
#     i=len(set(rand_gene))
# rand_gene=list(set(rand_gene))

#the random gene set (100 genes) produced by the commented-out sampling code above;
#hard-coded so that every run evaluates the same random baseline.
random_genes = [
    'TEPSIN', 'MAN2A1', 'RASSF1', 'TRAF1', 'DYRK3',
    'TMEM221', 'IFT74', 'SCAPER', 'NATD1', 'SLC25A14',
    'CRYL1', 'C330007P06RIK', 'KLHL36', 'SPINDOC', 'CUL1',
    'NCAPD3', 'GNB4', 'ZFP446', 'LIN9', 'PDZK1IP1',
    'MAPK7', 'HMGCS1', 'RMND5B', 'TBCEL', 'KLHDC8B',
    'KRT83', 'LEO1', 'ATP1A1', 'CHCHD4', 'SLAMF7',
    'NMT1', 'MPC2', 'MRPL51', 'PTK2', 'NDUFS3',
    'SNRPC', 'RPS10', 'HS3ST3B1', 'RPL35', 'MCMBP',
    'PQBP1', 'D11WSU47E', 'ZYG11B', 'TEX264', 'DHRS1',
    'HNRNPH2', 'FAM58B', 'NUP54', 'TOMM40', 'PCBP1',
    'UMPS', '4632411P08RIK', 'DCLRE1C', 'OSBPL3', 'DENND6B',
    'JKAMP', 'PHF1', 'GOLGA2', 'TGFBRAP1', 'MDM2',
    'KDM5A', 'SIVA1', 'CHMP6', 'SLC7A11', 'GATAD2B',
    'ANAPC7', 'POLG', 'PLOD1', 'SYPL', 'AKAP1',
    'KMT2A', 'UAP1L1', 'NUB1', 'RSAD1', 'NAT1',
    'ARRDC4', 'FAM241A', 'CRTAM', 'SRFBP1', 'MLLT1',
    'TALDO1', 'MDM1', 'TPRA1', 'SPCS3', 'PSAT1',
    'EMC7', 'ANKLE2', 'WFS1', 'SMOX', 'MIEF2',
    'RDH12', 'TRRAP', 'UBTF', 'EEF2', 'RAPGEF2',
    'TSPYL3', 'GREB1', 'GSTK1', 'BC017158', 'FARSA',
]

In [None]:
#calculate the degene scores by using z-test and KS-test based on the original dataset.
#TCR+DP cells are excluded and the row index is exposed as the 'cell' id column.
gene_dfc=gene_df_full[gene_df_full['group']!='TCR+DP'].copy()
gene_dfc['cell']=gene_dfc.index
drode_de_gene_df = Drode_DE_gene_detection(gene_dfc,full_gene,feature='gene',id_col='cell',group_col='group',is_unify=True)
#cache the result on disk; the next cell reads it back so this cell need not be re-run.
#Only the path is passed: every other argument previously spelled out was the pandas default,
#and two of them (tupleize_cols, line_terminator) are rejected by newer pandas releases.
drode_de_gene_df.to_csv("/home/ivan/Desktop/Project2/MyData/pipeline/DE_gene_fullWT4.csv")

In [3]:
#read the result generated by the previous cell, so that the previous cell does not need to be re-run in every trial.
drode_de_gene_df=pd.read_csv('/home/ivan/Desktop/Project2/MyData/pipeline/DE_gene_fullWT4.csv', sep=',')
# every distinct gene that appears in the score table
full_de_gene = drode_de_gene_df['gene'].unique().tolist()
# NOTE(review): select_de_gene is defined outside this notebook; presumably it picks genes by the
# 'd0prob' score with num=20 controlling how many are taken - confirm against DStools.py
drode_de_gene = select_de_gene(drode_de_gene_df,score_col='d0prob',feature='gene',num=20)


In [4]:
# work on a copy of the labelled expression table; the original row index is kept as the
# 'cell' column and the frame is given a fresh 0..n-1 index for the clustering helper
gene_dfc=gene_df_full.copy()
gene_dfc['cell']=gene_dfc.index
gene_dfc=gene_dfc.reset_index(drop=True)

#separate the CD4SP cells into immature and mature cells

# NOTE(review): cluster_cells is defined outside this notebook; the code below treats its result
# as an ordered list of cell ids usable as row labels - confirm against DStools.py
lst = cluster_cells(gene_dfc,drode_de_gene,group='CD4SP',group_col='group',id_col='cell')
# NOTE(review): the split position 23 is hard-coded: the first 23 entries plus the very last entry
# are labelled immature, everything in between mature - presumably read off the clustering result
small_lst=lst[0:23]+[lst[len(lst)-1]]
large_lst=lst[23:len(lst)-1]
# index by cell id again so the id lists can be used directly with .loc
gene_dfc.index=gene_dfc['cell']
# build the refined labels in a scratch column, then overwrite 'group' with them
gene_dfc['WT4_sub']= gene_dfc['group']
gene_dfc.loc[small_lst,'WT4_sub']='CD4SP_immature'
gene_dfc.loc[large_lst,'WT4_sub']='CD4SP_mature'
gene_dfc['group']=gene_dfc['WT4_sub']
del gene_dfc['WT4_sub']
# full table with the CD4SP split applied (all populations kept)
gene_df_full_CD4sep=gene_dfc.copy()
# gene_dfc itself continues without the immature CD4SP and TCR+DP cells
gene_dfc=gene_dfc[(gene_dfc['group']!='CD4SP_immature') & (gene_dfc['group']!='TCR+DP')]
print(gene_df_full_CD4sep['group'].unique())

#generate the newgroup column used to order the populations in plot.
#each population name is translated to its integer plotting position
population_position = {
    'CD69-DP': 0,
    'CD69+DP': 1,
    'TCR+DP': 2,
    'CD4+CD8low': 3,
    'CD4SP_immature': 4,
    'CD4SP_mature': 5,
    'CD8SP': 6,
}
#a population missing from the table above becomes NaN and makes astype(int) fail
gene_df_full_CD4sep['newGroup'] = gene_df_full_CD4sep['group'].map(population_position).astype(int)

Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c35a6ef0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__


['CD69-DP' 'CD69+DP' 'TCR+DP' 'CD8SP' 'CD4SP_mature' 'CD4+CD8low'
 'CD4SP_immature']


In [5]:
#describe mature CD4SP cells using marker genes.
#(summary statistics of the marker columns, restricted to rows labelled 'CD4SP_mature')
gene_df_full_CD4sep.loc[
    gene_df_full_CD4sep['group']=='CD4SP_mature',
    ['ITM2A','CD24A','CD69','CCR9','STAT1','CCR7','H2-Q7']
].describe()

Unnamed: 0,ITM2A,CD24A,CD69,CCR9,STAT1,CCR7,H2-Q7
count,68.0,68.0,68.0,68.0,68.0,68.0,68.0
mean,1.274892,0.079028,0.481406,0.129812,2.331812,2.14628,1.352096
std,1.51226,0.457508,1.081525,0.604874,1.644345,1.544076,1.271924
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.240058,0.0,0.0
50%,0.018738,0.0,0.0,0.0,2.912103,2.845633,1.5239
75%,2.78915,0.0,0.027966,0.0,3.550524,3.442311,2.450006
max,4.227945,2.75532,4.173711,3.371386,4.960322,4.210364,3.512027


In [6]:
#describe immature CD4SP cells using marker genes.
#(summary statistics of the marker columns, restricted to rows labelled 'CD4SP_immature')
gene_df_full_CD4sep.loc[
    gene_df_full_CD4sep['group']=='CD4SP_immature',
    ['ITM2A','CD24A','CD69','CCR9','STAT1','CCR7','H2-Q7']
].describe()

Unnamed: 0,ITM2A,CD24A,CD69,CCR9,STAT1,CCR7,H2-Q7
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,3.156941,0.108886,1.073761,0.81114,0.799935,1.478131,0.085873
std,1.997587,0.533429,1.468996,1.255549,1.431968,1.732499,0.42069
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.002615,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.788217,0.0,0.0,0.0,0.0,0.026791,0.0
75%,4.411402,0.0,2.414005,1.868125,1.020027,3.296202,0.0
max,5.514202,2.613257,4.078529,3.369131,4.994376,4.131553,2.060953


In [None]:
#calculate the degene scores by using z-test and KS-test based on the CD4SP separated dataset.
drode_de_gene_df = Drode_DE_gene_detection(gene_dfc,full_gene,feature='gene',id_col='cell',group_col='group',is_unify=False)
#cache the result on disk; the next cell reads it back so this cell need not be re-run.
#Only the path is passed: every other argument previously spelled out was the pandas default,
#and two of them (tupleize_cols, line_terminator) are rejected by newer pandas releases.
drode_de_gene_df.to_csv("/home/ivan/Desktop/Project2/MyData/pipeline/DE_gene_matureWT4_ununifyied.csv")

In [8]:
#read the result generated by the previous cell, so that the previous cell does not need to be re-run in every trial.
drode_de_gene_df=pd.read_csv('/home/ivan/Desktop/Project2/MyData/pipeline/DE_gene_matureWT4_ununifyied.csv', sep=',')
full_de_gene = drode_de_gene_df['gene'].unique().tolist()
#generate the Z-set (comparisons that involve TCR+DP on either side are dropped first)
drode_de_gene_df=drode_de_gene_df[(drode_de_gene_df['group_1']!='TCR+DP') & (drode_de_gene_df['group_2']!='TCR+DP')].copy()
drode_de_gene = select_de_gene(drode_de_gene_df,score_col='d0prob',feature='gene',num=21)
#generate the K-set
ks_de_gene = select_de_gene(drode_de_gene_df,score_col='d1prob',feature='gene',num=27)
#save the Z-set for Slingshot
#Only the path is passed: every other argument previously spelled out was the pandas default,
#and two of them (tupleize_cols, line_terminator) are rejected by newer pandas releases.
pd.DataFrame(drode_de_gene).to_csv("/home/ivan/Desktop/Project2/MyData/pipeline/degenes_drode.csv")


102

In [9]:
#compare the gene sets: Z-set vs F-set via findDistinct, and the genes shared by the Z-set and the K-set
# NOTE(review): findDistinct is defined outside this notebook; element [1] of its result is kept -
# confirm which side of the comparison that element holds
distinct_degenes = findDistinct(drode_de_gene,seurat_degenes)[1]
# genes present in both the Z-set (drode_de_gene) and the K-set (ks_de_gene)
intersect_degene_ks=set(drode_de_gene).intersection(ks_de_gene)
# displayed as the cell output: number of shared genes
len(intersect_degene_ks)
# gene_df_full_CD4sep['cd4cd8']



17

In [None]:
#This is for gene expression heat map generation, which is not included in the thesis.
# Fit a 5-component PCA on the F-set columns; only the fitted loadings (pca.components_) are
# used below, the transformed data returned by fit_transform is discarded.
pca=PCA(n_components=5)
pca.fit_transform(gene_df_full_CD4sep[seurat_degenes])

# Loadings table: one row per F-set gene, one column per principal component.
temp_df = pd.DataFrame(pca.components_).T
temp_df.columns= ['pc1', 'pc2','pc3','pc4','pc5']
temp_df.index = seurat_degenes

temp_df['gene']=temp_df.index
# For every component: store the absolute loading, re-sort the whole table by it (descending)
# and build a 'rank-pcN' label of the form "<1001+position>_<pcN>_<pos|neg>".
# After the loop the rows are left in pc5_abs order, which is the row order gene_cate_df starts from.
for i in ['pc1', 'pc2','pc3','pc4','pc5']:
    temp_df[i+'_abs']=abs(temp_df[i])
    temp_df = temp_df.sort_values(by=i+'_abs',ascending=False)
    temp_df = temp_df.reset_index(drop=True)
    temp_df['rank-'+i]=temp_df.index+1001
    temp_df.loc[temp_df[i]>0,'status']='pos'
    temp_df.loc[temp_df[i]<0,'status']='neg'
    temp_df['rank-'+i]=temp_df['rank-'+i].astype(str)+'_'+i+'_'+temp_df['status']
    del temp_df['status']
# Only the loadings and their absolute values are carried forward (the rank-* columns are not).
gene_cate_df = temp_df[['gene','pc1', 'pc2','pc3','pc4','pc5','pc1_abs', 'pc2_abs','pc3_abs','pc4_abs','pc5_abs']].copy()
gene_cate_df.index=gene_cate_df['gene']
del gene_cate_df['gene']
# 'max' = largest absolute loading of each gene across the five components
# (taken from the 'max' row of describe() on the transposed table).
gene_cate_df['max'] = np.array(gene_cate_df[['pc1_abs', 'pc2_abs','pc3_abs','pc4_abs','pc5_abs']].T.describe().T['max'].tolist())
marker_filter = pd.notnull(gene_cate_df['max'])
# Assign each gene to the component (and sign) where its absolute loading is largest.
# The assignments run pc1 -> pc5, so on an exact tie the later component overwrites the earlier one.
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc1'])) & marker_filter & (gene_cate_df['pc1']>=0),'markers']='pc1_pos'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc1'])) & marker_filter & (gene_cate_df['pc1']<0),'markers']='pc1_neg'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc2'])) & marker_filter & (gene_cate_df['pc2']>=0),'markers']='pc2_pos'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc2'])) & marker_filter & (gene_cate_df['pc2']<0),'markers']='pc2_neg'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc3'])) & marker_filter & (gene_cate_df['pc3']>=0),'markers']='pc3_pos'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc3'])) & marker_filter & (gene_cate_df['pc3']<0),'markers']='pc3_neg'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc4'])) & marker_filter & (gene_cate_df['pc4']>=0),'markers']='pc4_pos'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc4'])) & marker_filter & (gene_cate_df['pc4']<0),'markers']='pc4_neg'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc5'])) & marker_filter & (gene_cate_df['pc5']>=0),'markers']='pc5_pos'
gene_cate_df.loc[(gene_cate_df['max']==abs(gene_cate_df['pc5'])) & marker_filter & (gene_cate_df['pc5']<0),'markers']='pc5_neg'
gene_cate_df=gene_cate_df.sort_values(by=['markers','max'],ascending=[True,True])
# Build two feature lists by prepending one category at a time (so the categories end up in
# reverse order of cate_lst): pc_features holds the genes only, pc_cate_features additionally
# carries the category name in front of each gene group.
pc_cate_features=[]
pc_features=[]
cate_lst = ['pc1_pos','pc1_neg','pc2_pos','pc2_neg','pc3_pos','pc3_neg','pc4_pos','pc4_neg','pc5_pos','pc5_neg']
for i in cate_lst:
    filter_i = gene_cate_df['markers']==i
    g_lst = gene_cate_df[filter_i].index.tolist()
    pc_features = g_lst+pc_features
    g_lst = [i]+g_lst
    pc_cate_features= g_lst + pc_cate_features
real_fake_features = pc_cate_features
real_features = pc_features
# silence the decompression-bomb warning before the large (x_size=25000, y_size=6500) image is produced
Image.warnings.simplefilter('ignore', Image.DecompressionBombWarning)
# NOTE(review): generate_GE_heatmap_bilevel is defined outside this notebook - confirm how it uses
# real_features vs real_fake_features and the two category columns
generate_GE_heatmap_bilevel(gene_df_full_CD4sep,real_features,real_fake_features,first_cate='newGroup',second_cate='cd4cd8',sort_col='CD4',is_scale=False,path_file='/home/ivan/Desktop/Project2/MyData/pipeline/HPf-set.png',x_size=25000,y_size=6500)

In [11]:
#Plot the line chart for RUEs and PELs based on 'ITM2A','PIK3CD','RGS10','SLC16A5','STAT1'
# NOTE(review): de_gene_description is defined outside this notebook; it is given the score rows of
# the five genes and returns two tables that are indexed by population below - confirm their contents
gene_expression_level_df,gene_expression_rate_df = de_gene_description(drode_de_gene_df[drode_de_gene_df['gene'].isin(['ITM2A','PIK3CD','RGS10','SLC16A5','STAT1'])])
# reorder the rows into the population order used on the x axis
# NOTE(review): the row label here is 'CD4SP', not 'CD4SP_mature' - confirm this matches the labels in the tables
gene_expression_level_df = gene_expression_level_df.loc[['CD69-DP','CD69+DP','CD4+CD8low','CD4SP','CD8SP']]
gene_expression_rate_df = gene_expression_rate_df.loc[['CD69-DP','CD69+DP','CD4+CD8low','CD4SP','CD8SP']]
# levels go on the primary axis ('PELs'), rates on the secondary axis ('1-RUEs')
plot_line_charts(gene_expression_level_df,df2=gene_expression_rate_df,width=1400,height=900,titleName='Gene Expression Rates and Levels',xlabel='subpopulations',ylabel='PELs',ylabel_sec='1-RUEs',outputFilePath = "/home/ivan/Desktop/Project2/MyData/pipeline/rate_and_level.html")




Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c2ae28d0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__


In [12]:
#pca plots: one scatter plot for every pair among the first three principal components
principalDf,pca_description= pca_transformation(gene_df_full_CD4sep,drode_de_gene)
#(1,2), (1,3), (2,3)
pc_pairs = [(first, second) for first in range(1,3) for second in range(first+1,4)]
for i, j in pc_pairs:
    x_pc = 'pc'+str(i)
    y_pc = 'pc'+str(j)
    plotColorScatter_x(
        principalDf,
        xvalue = x_pc,
        yvalue = y_pc,
        sizevalue = 10,
        outputFilePath = '/home/ivan/Desktop/Project2/MyData/pipeline/'+'seurat-'+'pca'+str(i)+'-'+str(j)+'.html',
        plotWidth = 1000,
        plotHeight = 1000,
        readList = ['cell','group',x_pc,y_pc],
        titleName = x_pc+' vs '+y_pc,
        colorColumn = "group")

Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c34c2080>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c3493400>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c349a710>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__


In [13]:
#plot the scatter plot for PCs: each principal component against the population, with the
#cells of one population spread horizontally inside their own lane so they do not overlap
principalDf,pca_description= pca_transformation(gene_df_full_CD4sep,drode_de_gene)
#dedicated, seeded generator: the horizontal jitter is purely cosmetic, so it is made
#repeatable without touching numpy's global random state
jitter_rng = np.random.RandomState(0)
#population k (1-based, in this order) is placed in the x interval (k-0.8, k]
subpopulation_order = ['CD69-DP','CD69+DP','CD4+CD8low','CD4SP_immature','CD4SP_mature','CD8SP','TCR+DP']
for lane, population in enumerate(subpopulation_order, start=1):
    in_population = principalDf['group']==population
    jitter = jitter_rng.random_sample(int(in_population.sum()))*0.8
    principalDf.loc[in_population,'Subpopulations']=lane-jitter
principalDf=principalDf.reset_index(drop=True)
for i in ['pc1','pc2','pc3','pc4']:
    plotColorScatter_x(principalDf ,xvalue = 'Subpopulations',yvalue = i, sizevalue = 5, outputFilePath='/home/ivan/Desktop/Project2/MyData/pipeline/'+i+'.html',plotWidth = 1000, plotHeight = 750, readList = ['group'],titleName=i, colorColumn="group", colorPattern=viridis)


Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c2af3a58>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb138286a90>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c27b0048>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c2aea5c0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__


In [14]:
#showing the DE genes which contribute to the PC1, PC2, PC3 respectively, this part is not included in the thesis
#for each component: take the first 20 rows of pca_description ordered by its 'rank-pcN' column
for pc_name in ['pc1','pc2','pc3']:
    rank_column = 'rank-'+pc_name
    top_genes = pca_description.sort_values(by=rank_column)[0:20].copy()
    h_bar_plot(
        top_genes,
        'gene',
        pc_name,
        rank_column,
        bottom_up=False,
        height=600,
        title='Gene Contribution for '+pc_name.upper(),
        file_path='/home/ivan/Desktop/Project2/MyData/pipeline/'+pc_name+'_genes.html')

Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c34a7160>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c2adc278>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb0c2aead30>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__


In [None]:
#evaluate gene sets based on classification models
# gene set under evaluation: here the hard-coded random gene set (duplicates removed via set)
gene_for_transformation = list(set(random_genes))
principalDf,pca_description= pca_transformation(gene_df_full_CD4sep,gene_for_transformation)
# the models are fitted on the first three PCs, without the TCR+DP and immature CD4SP cells
# NOTE(review): DT_RF_models is defined outside this notebook - confirm the meaning of its options
# (isDT=False, iteration=20, testSize=0.2, numberOfTrees=50, RF_maxdepth=3, ...)
_ = DT_RF_models(principalDf[(principalDf['group']!='TCR+DP') & (principalDf['group']!='CD4SP_immature')]
             ,['pc1','pc2','pc3'],'/home/ivan/Desktop/Project2/MyData/pipeline/'
             ,isDT = False,iteration=20,testSize =0.2,readList = ['cell','group'], 
             label = 'group',DTdenotion='test',DT_maxdepth=2,numberOfTrees = 50,
             RF_maxdepth=3,isplot=False,id_column='cell',handle_unbalance=False)

In [17]:
#BXVP models training and predicting.
xgb_path = "/home/ivan/Desktop/Project2/MyData/pipeline/xgboost/random/"
select_major = list(random_genes)
rna_df = gene_df_full_CD4sep[['cell','group']+select_major].copy()
rna_df=rna_df.reset_index(drop=True)
#the five reference populations are used for training ...
WT1245_df = rna_df[(rna_df['group']!='CD4SP_immature') & (rna_df['group']!='TCR+DP')].copy()
#... while immature CD4SP and TCR+DP cells are only ever predicted
WT2_df = rna_df[(rna_df['group']=='CD4SP_immature') | (rna_df['group']=='TCR+DP')].copy()
training_X2,testing_X2,training_Y2,testing_Y2 = cross_validation_split_with_unbalance_data(WT2_df,select_major,label='group',id_column='cell',test_size=0.2,handle_unbalance=False)
training_X2['group']=training_Y2
testing_X2['group']=testing_Y2
#columns written to disk for every iteration
result_columns = ['group','CD69-DP','CD69+DP','CD4+CD8low','CD4SP_mature','CD8SP','multi_eXGBT_pre_label', 'predict_result_DF_indices','F_label']
#60 independent train/test splits; iteration i is stored as <xgb_path><i>.csv
for i in range(0,60,1):
    training_X1,testing_X1,training_Y1,testing_Y1 = cross_validation_split_with_unbalance_data(WT1245_df,select_major,label='group',id_column='cell',test_size=0.2,handle_unbalance=False)
    training_X1['group']=training_Y1
    testing_X1['group']=testing_Y1
    #predict the held-out reference cells together with all immature CD4SP / TCR+DP cells
    #(single pd.concat instead of two chained DataFrame.append calls)
    testing_X1 = pd.concat([testing_X1, training_X2, testing_X2])
    df_result = combined_eXGBT_classifier(training_set = training_X1,numeric_features_validation = select_major,testing_set = testing_X1,label_column = 'group',max_depth=2,num_trees=50)
    full_result,clean_result = transform_predict_result_DF(predict_result_DF =df_result, label_col='group', threshold=0.1)
    #Only the path is passed: every other argument previously spelled out was the pandas default,
    #and two of them (tupleize_cols, line_terminator) are rejected by newer pandas releases.
    full_result[result_columns].to_csv(xgb_path+str(i)+".csv")

In [18]:
#trajectory plotting
xgb_path = "/home/ivan/Desktop/Project2/MyData/pipeline/xgboost/random/"
#collect the per-iteration result files; names starting with 'f' are skipped
#(NOTE(review): presumably to leave out full_result.csv written below - confirm).
#sorted() because os.listdir returns entries in arbitrary order.
files = sorted(
    xgb_path+f for f in os.listdir(xgb_path)
    if f.endswith('.csv') and not f.startswith('.') and not f.startswith('f')
)
#read everything and concatenate once (instead of DataFrame.append inside a loop);
#pd.concat raises ValueError when no result file is found
xgb_df = pd.concat([pd.read_csv(file, sep=',') for file in files], sort=False)
xgb_df['cell']=xgb_df.index
score_df= xgb_df[['cell','group','CD69-DP','CD4SP_mature','CD8SP','multi_eXGBT_pre_label', 'predict_result_DF_indices','F_label']].copy()
score_df['size']=score_df['CD69-DP']*15+3
#every cell of rna_df must have been predicted at least once
assert len(score_df['predict_result_DF_indices'].unique())==len(rna_df['cell'].unique())
#Only the path is passed: every other argument previously spelled out was the pandas default,
#and two of them (tupleize_cols, line_terminator) are rejected by newer pandas releases.
score_df.to_csv(xgb_path+"full_result.csv")
#fraction of reference-population predictions whose final label equals the true group
temp_df = score_df[(score_df['group']!='TCR+DP') & (score_df['group']!='CD4SP_immature')].copy()
temp_df['correct']=temp_df['group']==temp_df['F_label']
n_correct = int(temp_df['correct'].sum())
print('precision: '+str(n_correct/len(temp_df)))
#average the scores of each cell over all iterations
plot_df_i = score_df.groupby(by=['group','predict_result_DF_indices'],as_index=False).mean()
#shift both lineage scores by the CD69-DP score (+1) and rescale each so its maximum is 1
plot_df_i['CD4SP_mature']=plot_df_i['CD4SP_mature']-plot_df_i['CD69-DP']+1
plot_df_i['CD8SP']=plot_df_i['CD8SP']-plot_df_i['CD69-DP']+1
plot_df_i['CD8SP']=plot_df_i['CD8SP']/plot_df_i['CD8SP'].max()
plot_df_i['CD4SP_mature']=plot_df_i['CD4SP_mature']/plot_df_i['CD4SP_mature'].max()
plotColorScatter_x(plot_df_i ,xvalue = 'CD4SP_mature',yvalue = 'CD8SP', sizevalue =10, outputFilePath='/home/ivan/Desktop/Project2/MyData/pipeline/plot_df_avg.html',plotWidth = 1000, plotHeight = 1000, readList = ['group','cell','CD4SP_mature','CD8SP'],titleName='trajectories of T cell differentiation', colorColumn="group")


precision: 0.40694444444444444


Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x7fb138286fd0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/subprocess.py", line 766, in __del__
