In [2]:
library(Matrix)
library(xgboost)
library("caret")

# load data

In [1]:
########### download raw data
### Download 10x snATAC-seq raw data from NCBI Gene Expression Omnibus (GSE169453)
### Download 10x multiome raw data from NCBI Gene Expression Omnibus (GSE200044)

########### Download processed snATAC-seq and multiome data to reproduce figures
### There are several ways to download processed data.
###1, download files less than 25M from github data folder: https://github.com/gaoweiwang/Islet_snATACseq
### or, from figshare:
### multiome: https://figshare.com/articles/dataset/processed_multiome_zip/19497665
### snATACseq: https://figshare.com/articles/dataset/processed_snATACseq_zip/19497656

###2, download large processed or intermediate files:
#snATAC-seq data (http://169.228.232.194/~mmallick/o/processed_snATACseq.tar.gz)
#multiome data (http://169.228.232.194/~mmallick/o/processed_multiome.tar.gz)

########### data directories (edit both paths to point at the unpacked processed data)
# Both paths end with a trailing slash because file names are appended with paste0().
wd_multiome <- "/oasis/tscc/scratch/gaw006/multiome/processed/"
wd_snATAC <- "/oasis/tscc/scratch/gaw006/snATACseq/processed/"

In [3]:
# Clear the workspace, then re-create the data directories that the wipe removed.
rm(list = ls())
wd_snATAC <- "/oasis/tscc/scratch/gaw006/snATACseq/processed/"
wd_multiome <- "/oasis/tscc/scratch/gaw006/multiome/processed/"

# Load the sparse count matrix (Matrix Market format) and binarize it:
# every stored value greater than 0 becomes 1.
indata <- Matrix::readMM(paste0(wd_snATAC, "snATAC_500bp.mtx"))
indata@x[indata@x > 0] <- 1
dim(indata)

# Row labels (cell barcodes) and column labels (peak coordinates) are the
# first column of their respective annotation files.
cellinfo <- read.table(paste0(wd_snATAC, "snATAC_500bp.barcodes"))
peakinfo <- read.table(paste0(wd_snATAC, "snATAC_500bp.regions"))
cell_ID <- as.character(cellinfo[["V1"]])
peak_ID <- as.character(peakinfo[["V1"]])

# Rows are cells, columns are peaks.
dimnames(indata) <- list(cell_ID, peak_ID)

# Preview the top-left corner of the labelled matrix.
indata[1:4, 1:4]

4 x 4 sparse Matrix of class "dgTMatrix"
                        1:100000097-100000597 1:100001226-100001726
JYH792_AAACGAAAGCATTGGG                     .                     .
JYH792_AAACGAAAGGGTCTGA                     .                     .
JYH792_AAACGAAAGTAGACCG                     .                     .
JYH792_AAACGAACAAATAGTG                     .                     .
                        1:100009946-100010446 1:100014513-100015013
JYH792_AAACGAAAGCATTGGG                     .                     .
JYH792_AAACGAAAGGGTCTGA                     .                     .
JYH792_AAACGAAAGTAGACCG                     .                     .
JYH792_AAACGAACAAATAGTG                     .                     .

In [4]:
# Drop every peak whose ID contains "X" or "Y" (the sex-chromosome peaks),
# then report the remaining matrix size and how many peaks were removed.
is_sex_chr <- grepl("[XY]", peak_ID)
keep_gender <- !is_sex_chr
indata <- indata[, keep_gender]
dim(indata)
sum(is_sex_chr)

In [5]:
#### barcode annotation
# Per-cell table; the columns used below are `donor`, `leiden` and (later) `index`.
M <- read.csv(paste0(wd_snATAC, 'barcode_info.csv'))

M_donor <- as.character(M$donor)
M_leiden <- as.numeric(M$leiden)

# Donor-level metadata, restricted to donors that actually occur in M.
C <- read.csv(paste0(wd_snATAC, 'meta_data.csv'))
C <- C[(as.character(C$donor) %in% M_donor), ]

# Split donors by the disease status recorded in `Sample.Description.Name`.
donor_all <- as.character(C$donor)
disease_all <- as.character(C$Sample.Description.Name)
donor_ND <- donor_all[disease_all == 'Non']
donor_PD <- donor_all[disease_all == 'Pre']
donor_T2D <- donor_all[disease_all == 'T2D']

# Per-cell disease label. The vector is seeded with the donor ID, so a cell
# whose donor is in none of the three groups keeps its donor ID as "label";
# warn about that instead of letting it pass silently.
M_disease <- M_donor
M_disease[M_donor %in% donor_ND] <- 'ND'
M_disease[M_donor %in% donor_PD] <- 'PD'
M_disease[M_donor %in% donor_T2D] <- 'T2D'
donor_unlabelled <- unique(M_donor[!(M_donor %in% c(donor_ND, donor_PD, donor_T2D))])
if (length(donor_unlabelled) > 0) {
    warning("No ND/PD/T2D status found for donor(s): ",
            paste(donor_unlabelled, collapse = ", "),
            "; their cells keep the donor ID as disease label.",
            call. = FALSE)
}

# Leiden cluster IDs that make up each cell type.
alpha_ID <- c(0, 4, 8)
beta_ID <- c(1, 2, 3, 5)
delta_ID <- c(6)
gamma_ID <- c(10)
acinar_ID <- c(7)
ductal_ID <- c(9)
stellate_ID <- c(11)
endothelial_ID <- c(13)
immune_ID <- c(12)
cluster_to_celltype <- list(
    alpha = alpha_ID,
    beta = beta_ID,
    delta = delta_ID,
    gamma = gamma_ID,
    acinar = acinar_ID,
    ductal = ductal_ID,
    stellate = stellate_ID,
    endothelial = endothelial_ID,
    immune = immune_ID
)

# Per-cell cell-type label, seeded with the donor ID like M_disease above.
M_celltype <- M_donor
for (celltype_name in names(cluster_to_celltype)) {
    M_celltype[M_leiden %in% cluster_to_celltype[[celltype_name]]] <- celltype_name
}
cluster_unlabelled <- unique(M_leiden[!(M_leiden %in% unlist(cluster_to_celltype))])
if (length(cluster_unlabelled) > 0) {
    warning("No cell type assigned to leiden cluster(s): ",
            paste(cluster_unlabelled, collapse = ", "),
            "; their cells keep the donor ID as cell-type label.",
            call. = FALSE)
}

M$celltype <- M_celltype
M$disease <- M_disease
M[1:3,]

index,donor,duplicated_reads,frac_duplicated_reads,frac_mito_reads,frac_promoters_used,frac_reads_in_peaks,frac_reads_in_promoters,log10_n_counts,log10_n_peaks,⋯,reads_in_promoters,total_sequenced_reads,tss_used,unique_mito_reads,unique_usable_reads,leiden,n_count_500bp,n_peak_500bp,celltype,disease
JYH792_AAACGAAAGCATTGGG,JYH792,1805,0.1117855,0.0066071554,0.09164685,0.4976297,0.2753131,4.158,3.832892,⋯,3891,16147,1773,94,14133,4,7277,3541,alpha,ND
JYH792_AAACGAAAGGGTCTGA,JYH792,1904,0.123404,0.0007437709,0.06606017,0.4494976,0.2059546,4.133858,3.802979,⋯,2767,15429,1278,10,13435,3,6125,3008,beta,ND
JYH792_AAACGAAAGTAGACCG,JYH792,2062,0.1829474,0.0297566372,0.0361315,0.3530954,0.1631513,3.944828,3.632963,⋯,1431,11271,699,269,8771,4,3309,1712,alpha,ND


In [6]:
# Keep only the cells that have an annotation row in M.
indata <- indata[(rownames(indata) %in% as.character(M$index)), ]
# Every step below indexes `indata` and `M` with the same logical masks, so the
# two must list the same cells in the same order. Enforce that here: an
# element-wise `==` would recycle on a length mismatch and only print a value.
stopifnot(identical(rownames(indata), as.character(M$index)))
dim(indata)

########################################
######### using beta cell as example
keep_cell <- (M$celltype == 'beta')
# Peak-by-cell-type table; peaks with a value > 0 in the `beta` column are kept.
P <- read.csv(paste0(wd_snATAC, 'peak_celltype_500_all.csv'))
P_all <- as.character(P$X)
P_beta <- P_all[as.numeric(P$beta) > 0]
length(P_beta)
keep_peak <- (colnames(indata) %in% P_beta)

# Beta cells x beta peaks, with the matching annotation rows.
data_use <- indata[keep_cell, keep_peak]
M_use <- M[keep_cell, ]
dim(data_use)
stopifnot(identical(rownames(data_use), as.character(M_use$index)))

In [7]:
# Totals over the binarized matrix: per column (peak) and per row (cell),
# followed by the smallest value of each.
col_sum <- Matrix::colSums(data_use)
row_sum <- Matrix::rowSums(data_use)
min(row_sum)
min(col_sum)


In [8]:
## QC filter on the binarized matrix: keep cells whose row total is above 1000
## and peaks whose column total is above 100 (both comparisons are strict).
keep_peak <- (col_sum > 100)
keep_cell <- (row_sum > 1000)

# Apply the same cell mask to the matrix and to its annotation table.
M_use <- M_use[keep_cell, ]
data_use <- data_use[keep_cell, keep_peak]
dim(data_use)
all(rownames(data_use) == as.character(M_use$index))

# Model training and testing (each donor)

In [11]:
## every donor to hold out in turn, ordered ND, then PD, then T2D
donor_all <- c(donor_ND, donor_PD, donor_T2D) #########
donor_all

In [14]:
## Leave-one-donor-out prediction of disease state.
## Per-cell class probabilities; -1 marks cells that have not been predicted.
n_cell <- dim(M_use)[1]
temp_ND <- rep(-1, n_cell)
temp_PD <- rep(-1, n_cell)
temp_T2D <- rep(-1, n_cell)

## Integer class labels (ND = 0, PD = 1, T2D = 2). They do not depend on the
## held-out donor, so build them once. Any other label is rejected rather than
## being silently encoded as class 0 (ND).
train_disease <- as.character(M_use$disease)
train_d <- match(train_disease, c('ND', 'PD', 'T2D')) - 1
if (anyNA(train_d)) {
    stop("Unexpected disease label(s) in M_use$disease: ",
         paste(unique(train_disease[is.na(train_d)]), collapse = ", "),
         call. = FALSE)
}
cell_donor <- as.character(M_use$donor)

## test one donor each time, and use all remaining donors to train the model
for (i in seq_along(donor_all)) {
    keep_train <- (cell_donor != donor_all[i])
    if (all(keep_train)) {
        # This donor has no cells left after filtering: nothing to predict.
        message("Skipping donor ", donor_all[i], ": no cells in data_use")
        next
    }
    # drop = FALSE keeps a one-cell test set as a 1-row matrix, not a vector.
    train.x <- data_use[keep_train, , drop = FALSE]
    test.x <- data_use[!keep_train, , drop = FALSE]

    train.y <- train_d[keep_train]
    test.y <- train_d[!keep_train]

    train.x1 <- as(train.x, "dgCMatrix")
    bst <- xgboost(data = train.x1, label = train.y, max.depth = 60, eta = 0.2, nthread = 40, nrounds = 80, objective = "multi:softprob", num_class = 3)
    test.x1 <- as(test.x, "dgCMatrix")
    # `pred` is a flat vector; reshape it row-wise to one row per test cell
    # and one column per class (ND, PD, T2D).
    pred <- predict(bst, test.x1)
    pre_M <- matrix(pred, length(pred) / 3, ncol = 3, byrow = TRUE)

    temp_ND[!keep_train] <- pre_M[, 1]
    temp_PD[!keep_train] <- pre_M[, 2]
    temp_T2D[!keep_train] <- pre_M[, 3]

    # Updated every iteration so the intermediate result can be checkpointed.
    M_use$pre_ND <- temp_ND #######
    M_use$pre_PD <- temp_PD
    M_use$pre_T2D <- temp_T2D

    ## save results
    #write.csv(M_use,paste0(wd_snATAC,'AI_beta_3group.csv')) #######

}

[1]	train-merror:0.071781 
[2]	train-merror:0.043275 
[3]	train-merror:0.030988 
[4]	train-merror:0.024199 
[5]	train-merror:0.019461 
[6]	train-merror:0.014769 
[7]	train-merror:0.011810 
[8]	train-merror:0.009000 
[9]	train-merror:0.006982 
[10]	train-merror:0.005373 
[11]	train-merror:0.004409 
[12]	train-merror:0.003536 
[13]	train-merror:0.002709 
[14]	train-merror:0.001848 
[15]	train-merror:0.001428 
[16]	train-merror:0.000997 
[17]	train-merror:0.000771 
[18]	train-merror:0.000578 
[19]	train-merror:0.000397 
[20]	train-merror:0.000283 
[21]	train-merror:0.000249 
[22]	train-merror:0.000193 
[23]	train-merror:0.000113 
[24]	train-merror:0.000091 
[25]	train-merror:0.000023 
[26]	train-merror:0.000011 
[27]	train-merror:0.000000 
[28]	train-merror:0.000000 
[29]	train-merror:0.000000 
[30]	train-merror:0.000000 
[31]	train-merror:0.000000 
[32]	train-merror:0.000000 
[33]	train-merror:0.000000 
[34]	train-merror:0.000000 
[35]	train-merror:0.000000 
[36]	train-merror:0.000000 
[