# Midterm

Paper: PMID 27667667
Data link (available in paper): https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-5061

From E-MTAB-5061.idf.txt:
Format of the datafile ‘pancreas_refseq_rpkms_counts_3514sc.txt’: 
The file contains both the normalized rpkm values and the raw read counts 
for each sample. Columns correspond to samples and rows to genes. 
The first line of the file (starting with: #samples) contains the sample IDs 
to be used as column labels for both the rpkm and counts.
The columns of the rpkm and the counts have the same order with the sample IDs.
Columns 1:3514 correspond to rpkm values, Columns 3515:7028 correspond to read counts.
Rows 1:26179 correspond to data for RefSeq genes, Rows 26180:26271 correspond 
to data for the 92 external RNA spike-in controls (ERCCs), 
Row 26272 (last) contains data for ‘eGFP’.

In [109]:
library(dplyr)
library(Seurat)
library(patchwork)

Loading required package: SeuratObject

Loading required package: sp


Attaching package: ‘SeuratObject’


The following objects are masked from ‘package:base’:

    intersect, t




## Getting data

In [1]:
cell.barcodes <- scan(text = readLines("../data/E-MTAB-5061/pancreas_refseq_rpkms_counts_3514sc.txt", 1), 
                      what = "", quiet = TRUE)[-1] # 3514 cell barcodes
head(cell.barcodes)

In [2]:
count = read.table('../data/E-MTAB-5061/pancreas_refseq_rpkms_counts_3514sc.txt',sep='\t')
head(count)

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V7021,V7022,V7023,V7024,V7025,V7026,V7027,V7028,V7029,V7030
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,SGIP1,NM_032291,0,0,0,0,0,0.0,0,0.0,⋯,0,0,0,7,0,4,0,0,0,0
2,AZIN2,NM_052998+NM_001293562,0,0,0,0,0,41.51929,0,0.0,⋯,0,0,0,16,0,0,0,0,0,4
3,CLIC4,NM_013943,0,0,0,0,0,0.0,0,0.0,⋯,2,20,64,18,0,0,0,0,0,1
4,AGBL4,NM_032785,0,0,0,0,0,0.0,0,0.0,⋯,0,0,0,0,0,0,5,0,0,0
5,NECAP2,NM_001145277+NM_001145278+NM_018090,0,0,0,0,0,13.99272,0,18.92435,⋯,0,0,0,14,0,1,47,7,0,6
6,SLC45A1,NM_001080397,0,0,0,0,0,149.80023,0,78.07885,⋯,0,0,0,0,0,0,1,0,0,1


In [3]:
gene.meta=count[,1:2] # First two columns are gene symbols and IDs
head(gene.meta)

Unnamed: 0_level_0,V1,V2
Unnamed: 0_level_1,<chr>,<chr>
1,SGIP1,NM_032291
2,AZIN2,NM_052998+NM_001293562
3,CLIC4,NM_013943
4,AGBL4,NM_032785
5,NECAP2,NM_001145277+NM_001145278+NM_018090
6,SLC45A1,NM_001080397


Remove metadata from the `count`.

In [4]:
count=count[,-(1:2)]
head(count)

Unnamed: 0_level_0,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,⋯,V7021,V7022,V7023,V7024,V7025,V7026,V7027,V7028,V7029,V7030
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,0,0,0,0,0,0.0,0,0.0,0.0,0.0,⋯,0,0,0,7,0,4,0,0,0,0
2,0,0,0,0,0,41.51929,0,0.0,0.0,0.0,⋯,0,0,0,16,0,0,0,0,0,4
3,0,0,0,0,0,0.0,0,0.0,5.463685,0.2308227,⋯,2,20,64,18,0,0,0,0,0,1
4,0,0,0,0,0,0.0,0,0.0,0.0,0.0,⋯,0,0,0,0,0,0,5,0,0,0
5,0,0,0,0,0,13.99272,0,18.92435,44.923718,0.0,⋯,0,0,0,14,0,1,47,7,0,6
6,0,0,0,0,0,149.80023,0,78.07885,0.0,0.0,⋯,0,0,0,0,0,0,1,0,0,1


We only care about read counts, not rpkm values.

In [5]:
count=as.matrix(count[,3515:7028]) # Columns 3515:7028 correspond to read counts.
head(count, 3)

## Labeling rows and columns 

We add column names and row names with genes and samples.

In [6]:
rownames(count)=gene.meta[,1]
colnames(count)=cell.barcodes
head(count)

Unnamed: 0,HP1502401_N13,HP1502401_D14,HP1502401_F14,HP1502401_J13,HP1502401_B13,HP1502401_H13,HP1502401_J14,HP1502401_B14,HP1502401_A14,HP1502401_C14,⋯,HP1525301T2D_O10,HP1526901T2D_H2,HP1526901T2D_I16,HP1526901T2D_F7,HP1526901T2D_I23,HP1525301T2D_K3,HP1525301T2D_J10,HP1526901T2D_N8,HP1526901T2D_O11,HP1526901T2D_A8
SGIP1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,7,0,4,0,0,0,0
AZIN2,0,0,0,0,0,3,0,0,0,0,⋯,0,0,0,16,0,0,0,0,0,4
CLIC4,0,0,0,0,0,0,0,0,14,1,⋯,2,20,64,18,0,0,0,0,0,1
AGBL4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,5,0,0,0
NECAP2,0,0,0,0,0,1,0,10,53,0,⋯,0,0,0,14,0,1,47,7,0,6
SLC45A1,0,0,0,0,0,13,0,52,0,0,⋯,0,0,0,0,0,0,1,0,0,1


In [7]:
rm(gene.meta, cell.barcodes)

In [27]:
dim(t(count))

In [26]:
dim(cell.meta)

## Ordering cells to match `cell.meta`

In [8]:
cell.meta = read.csv('../data/E-MTAB-5061/E-MTAB-5061.sdrf.txt',sep='\t')
head(cell.meta)

Unnamed: 0_level_0,Source.Name,Characteristics..organism.,Characteristics..individual.,Characteristics..sex.,Characteristics..age.,Unit..time.unit.,Characteristics..body.mass.index.,Characteristics..organism.status.,Characteristics..clinical.information.,Characteristics..organism.part.,⋯,Comment.ENA_EXPERIMENT.,Scan.Name,Comment.SUBMITTED_FILE_NAME.,Comment.ENA_RUN.,Comment.FASTQ_URI.,Protocol.REF.3,Derived.Array.Data.File,Comment..Derived.ArrayExpress.FTP.file.,FactorValue..single.cell.identifier.,Factor.Value..disease.
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,AZ_A1,Homo sapiens,H1,male,43,year,30.8,post-mortem,HbA1c 5.0%,pancreas,⋯,ERX1700346,AZ_A1.fastq.gz,AZ_A1.fastq.gz,ERR1630013,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/003/ERR1630013/ERR1630013.fastq.gz,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-5061/E-MTAB-5061.processed.1.zip,AZ_A1,normal
2,AZ_A3,Homo sapiens,H1,male,43,year,30.8,post-mortem,HbA1c 5.0%,pancreas,⋯,ERX1700351,AZ_A3.fastq.gz,AZ_A3.fastq.gz,ERR1630018,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/008/ERR1630018/ERR1630018.fastq.gz,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-5061/E-MTAB-5061.processed.1.zip,AZ_A3,normal
3,AZ_A4,Homo sapiens,H1,male,43,year,30.8,post-mortem,HbA1c 5.0%,pancreas,⋯,ERX1700352,AZ_A4.fastq.gz,AZ_A4.fastq.gz,ERR1630019,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/009/ERR1630019/ERR1630019.fastq.gz,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-5061/E-MTAB-5061.processed.1.zip,AZ_A4,normal
4,AZ_B6,Homo sapiens,H1,male,43,year,30.8,post-mortem,HbA1c 5.0%,pancreas,⋯,ERX1700366,AZ_B6.fastq.gz,AZ_B6.fastq.gz,ERR1630033,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/003/ERR1630033/ERR1630033.fastq.gz,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-5061/E-MTAB-5061.processed.1.zip,AZ_B6,normal
5,AZ_B7,Homo sapiens,H1,male,43,year,30.8,post-mortem,HbA1c 5.0%,pancreas,⋯,ERX1700367,AZ_B7.fastq.gz,AZ_B7.fastq.gz,ERR1630034,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/004/ERR1630034/ERR1630034.fastq.gz,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-5061/E-MTAB-5061.processed.1.zip,AZ_B7,normal
6,AZ_C12,Homo sapiens,H1,male,43,year,30.8,post-mortem,HbA1c 5.0%,pancreas,⋯,ERX1700373,AZ_C12.fastq.gz,AZ_C12.fastq.gz,ERR1630040,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/000/ERR1630040/ERR1630040.fastq.gz,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-5061/E-MTAB-5061.processed.1.zip,AZ_C12,normal


In [10]:
table(cell.meta$Characteristics..individual.) # Six healthy individuals, four T2D patients


  H1   H2   H3   H4   H5   H6 T2D1 T2D2 T2D3 T2D4 
  96  352  383  383  383  383  383  383  384  384 

In [11]:
cell.meta=cell.meta[,1:5] # Remove unnecessary meta info for the cells
head(cell.meta)

In [60]:
df = t(count)
df = df[cell.meta$Source.Name,]
head(df)

Unnamed: 0,SGIP1,AZIN2,CLIC4,AGBL4,NECAP2,SLC45A1,TGFBR3,DBT,RFWD2,C1orf21,⋯,ERCC_1.83105469:mix1_3.66210938:mix2,ERCC_0.91552734:mix1_1.83105469:mix2,ERCC_0.91552734:mix1_1.83105469:mix2.1,ERCC_0.45776367:mix1_0.91552734:mix2,ERCC_0.22888184:mix1_0.45776367:mix2,ERCC_0.22888184:mix1_0.45776367:mix2.1,ERCC_0.11444092:mix1_0.22888184:mix2,ERCC_0.05722046:mix1_0.11444092:mix2,ERCC_0.01430512:mix1_0.02861023:mix2,eGFP
AZ_A1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AZ_A3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
AZ_A4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AZ_B6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
AZ_B7,0,0,0,0,0,0,0,0,0,0,⋯,0,1,0,0,0,0,0,0,0,0
AZ_C12,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [61]:
sum(cell.meta$Source.Name == rownames(df))

In [62]:
dim(cell.meta)

## Choosing only healthy individuals

Let's find some `Source.Name` that match healthy individuals:

In [63]:
head(cell.meta)

Unnamed: 0_level_0,Source.Name,Characteristics..organism.,Characteristics..individual.,Characteristics..sex.,Characteristics..age.
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>
1,AZ_A1,Homo sapiens,H1,male,43
2,AZ_A3,Homo sapiens,H1,male,43
3,AZ_A4,Homo sapiens,H1,male,43
4,AZ_B6,Homo sapiens,H1,male,43
5,AZ_B7,Homo sapiens,H1,male,43
6,AZ_C12,Homo sapiens,H1,male,43


In [86]:
healthy_cell.meta <- cell.meta %>%
    filter(Characteristics..individual. == c('H1') | 
           Characteristics..individual. == c('H2') | 
           Characteristics..individual. == c('H3') | 
           Characteristics..individual. == c('H4') | 
           Characteristics..individual. == c('H5') | 
           Characteristics..individual. == c('H6'))

In [87]:
head(healthy_cell.meta)

Unnamed: 0_level_0,Source.Name,Characteristics..organism.,Characteristics..individual.,Characteristics..sex.,Characteristics..age.
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>
1,AZ_A1,Homo sapiens,H1,male,43
2,AZ_A3,Homo sapiens,H1,male,43
3,AZ_A4,Homo sapiens,H1,male,43
4,AZ_B6,Homo sapiens,H1,male,43
5,AZ_B7,Homo sapiens,H1,male,43
6,AZ_C12,Homo sapiens,H1,male,43


Now we only choose cells from healthy individuals.

In [96]:
table(cell.meta$Characteristics..individual.)


  H1   H2   H3   H4   H5   H6 T2D1 T2D2 T2D3 T2D4 
  96  352  383  383  383  383  383  383  384  384 

In [92]:
96+352+383+383+383+383

Looking at the table, we know there should be 1980 individuals.

In [93]:
df = df[healthy_cell.meta$Source.Name,]
dim(df)

## Quality control

In [122]:
head(df)

Unnamed: 0,SGIP1,AZIN2,CLIC4,AGBL4,NECAP2,SLC45A1,TGFBR3,DBT,RFWD2,C1orf21,⋯,ERCC_1.83105469:mix1_3.66210938:mix2,ERCC_0.91552734:mix1_1.83105469:mix2,ERCC_0.91552734:mix1_1.83105469:mix2.1,ERCC_0.45776367:mix1_0.91552734:mix2,ERCC_0.22888184:mix1_0.45776367:mix2,ERCC_0.22888184:mix1_0.45776367:mix2.1,ERCC_0.11444092:mix1_0.22888184:mix2,ERCC_0.05722046:mix1_0.11444092:mix2,ERCC_0.01430512:mix1_0.02861023:mix2,eGFP
AZ_A1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AZ_A3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
AZ_A4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AZ_B6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
AZ_B7,0,0,0,0,0,0,0,0,0,0,⋯,0,1,0,0,0,0,0,0,0,0
AZ_C12,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


Remove duplicated genes.

In [160]:
sum(duplicated(rownames(df)))

In [161]:
sum(duplicated(colnames(df)))

In [163]:
so <- CreateSeuratObject(counts = t(df), project = "midterm", min.cells = 3, min.features = 200)
so

“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“Data is of class matrix. Coercing to dgCMatrix.”


ERROR: Error in validObject(.Object): invalid class “LogMap” object: Duplicate rownames not allowed


## Getting baron data

In [94]:
# Hint:
# 1. Need to reorder the cells to make them match between count and cell.meta
# 2. Need to select the healthy individuals (and remove the T2D patients)

###########################################################################
# Paper: PMID 27667667
# Data link (available in paper):
# https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-5061
###########################################################################

baron1=read.csv('../data/GSE84133_RAW/GSM2230757_human1_umifm_counts.csv.gz')
baron2=read.csv('../data/GSE84133_RAW/GSM2230758_human2_umifm_counts.csv.gz')
baron3=read.csv('../data/GSE84133_RAW/GSM2230759_human3_umifm_counts.csv.gz')
baron4=read.csv('../data/GSE84133_RAW/GSM2230760_human4_umifm_counts.csv.gz')

In [95]:
head(baron1)

Unnamed: 0_level_0,X,barcode,assigned_cluster,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,⋯,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,human1_lib1.final_cell_0001,GATGACGGAC-GGTGGGAT,acinar,0,4,0,0,0,0,0,⋯,0,0,0,0,0,0,2,0,0,1
2,human1_lib1.final_cell_0002,GAGCGTTGCT-ACCTTCTT,acinar,0,0,0,0,0,0,0,⋯,0,0,0,0,0,1,4,0,1,0
3,human1_lib1.final_cell_0003,CTTACGGG-CCATTACT,acinar,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,human1_lib1.final_cell_0004,GATGTACACG-TTAAACTG,acinar,0,0,0,0,0,0,0,⋯,1,0,0,0,0,1,3,1,0,0
5,human1_lib1.final_cell_0005,GAGATTGCGA-GTCGTCGT,acinar,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,1,0,0,1
6,human1_lib1.final_cell_0006,AATCCCACG-ATTCGACG,acinar,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,1,1,1,0


In [97]:
# Check that the columns are the same before combining rows
all(colnames(baron1)==colnames(baron2))
all(colnames(baron1)==colnames(baron3))
all(colnames(baron1)==colnames(baron4))

In [100]:
baron=rbind(baron1, baron2, baron3, baron4)
rm(baron1,baron2,baron3,baron4)
head(baron)

ERROR: Error in eval(expr, envir, enclos): object 'baron1' not found


In [101]:
cell.meta=baron[,1:3]
colnames(cell.meta)[1]='cell'
head(cell.meta)

Unnamed: 0_level_0,cell,barcode,assigned_cluster
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,human1_lib1.final_cell_0001,GATGACGGAC-GGTGGGAT,acinar
2,human1_lib1.final_cell_0002,GAGCGTTGCT-ACCTTCTT,acinar
3,human1_lib1.final_cell_0003,CTTACGGG-CCATTACT,acinar
4,human1_lib1.final_cell_0004,GATGTACACG-TTAAACTG,acinar
5,human1_lib1.final_cell_0005,GAGATTGCGA-GTCGTCGT,acinar
6,human1_lib1.final_cell_0006,AATCCCACG-ATTCGACG,acinar


In [105]:
cell.meta$individual=substr(cell.meta$cell, start=1, stop=6)
head(cell.meta, 3)

Unnamed: 0_level_0,cell,barcode,assigned_cluster,individual
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,human1_lib1.final_cell_0001,GATGACGGAC-GGTGGGAT,acinar,human1
2,human1_lib1.final_cell_0002,GAGCGTTGCT-ACCTTCTT,acinar,human1
3,human1_lib1.final_cell_0003,CTTACGGG-CCATTACT,acinar,human1


In [106]:
table(cell.meta$individual)


human1 human2 human3 human4 
  1937   1724   3605   1303 

In [107]:
count=as.matrix(baron[,-(1:3)])
rm(baron)
head(count)

A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,AAAS,AACS,AACSP1,⋯,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
0,4,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,2,0,0,1
0,0,0,0,0,0,0,0,2,0,⋯,0,0,0,0,0,1,4,0,1,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,1,0,0,⋯,1,0,0,0,0,1,3,1,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,1,0,0,1
0,1,0,0,0,0,0,0,2,0,⋯,0,0,0,0,0,0,1,1,1,0


In [108]:
rownames(count)=cell.meta$cell
rownames(cell.meta)=cell.meta$cell
colnames(cell.meta)[3]='celltype'
head(count, 3)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,AAAS,AACS,AACSP1,⋯,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
human1_lib1.final_cell_0001,0,4,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,2,0,0,1
human1_lib1.final_cell_0002,0,0,0,0,0,0,0,0,2,0,⋯,0,0,0,0,0,1,4,0,1,0
human1_lib1.final_cell_0003,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
