In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as plt
import csv
import pickle #to save notebook at sessions

from sklearn.preprocessing import RobustScaler

#set path for pickles to be saved in
pickle_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/python pickles/'

In [3]:
print(pd.__version__)

1.4.4


# What is going on (on raw counts)
1. Import glycogene csv of human glycogenes converted to mice homologs (because our data is from mice tumor models. Human glycogenes are from glyco.me (Glycopacity). 
2. Load barcodes, features, and matrix of LN and TILs and convert to dataframe
3. Import transformed identity csv with info on which cells are associated with which cell type
4. Filter cells from original dataframe to produce smaller dataframe of just TILs in LN/TIL population from transformed identity csv
5. Filter genes from filtered dataframe (from 4) to only include glycogenes from (1)
6. Convert filtered dataframe back into a sparse matrix for export into .mtx file

## 1. Import human to mouse glycogene csv.
Converted human glycogenes to their mouse homologs via https://www.informatics.jax.org/batch/summary

In [4]:
# Load mouse glycogenes 

glyco_fp = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/human to mouse glycogene converted.csv'
glycogene_df = pd.read_csv(glyco_fp)

#make mouse glycogene list
m_glyco = glycogene_df[['Mouse']]
mouse_glyco_long = list(m_glyco.dropna(axis=0)['Mouse']) #removed 3 genes that didn't have mice homologs

#export just the mouse glycogenes as csv
m_glyco.to_csv('Mouse glycogenes.csv')

In [5]:
# remove duplicates
mouse_glycogenes = set(mouse_glyco_long)
len(mouse_glyco_long)

264

In [6]:
len(mouse_glycogenes)

245

## 2. Load barcodes, features, and matrix of LN and TILs and convert to dataframe


In [5]:
## TAKES LONG TIME TO RUN!! ## 
# Load the LN SUGAR-seq matrix from file
sparse_LN = sp.io.mmread(
    '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/LN data/matrix.mtx')
# Convert to CSR format
sparse_LN = sp.sparse.csr_matrix(sparse_LN)

In [6]:
## TAKES LONG TIME TO RUN!! ## 
# Load the TIL SUGAR-seq matrix from file
sparse_TIL = sp.io.mmread(
    '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/TIL data/matrix.mtx')
# Convert to CSR format
sparse_TIL = sp.sparse.csr_matrix(sparse_TIL)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/TIL data/matrix.mtx'

In [None]:
# dimension check of sparse matrix
print('LN matrix dimesions are', np.shape(sparse_LN)) #31059 = rows , 21343=columns
print('TIL matrix dimesions are', np.shape(sparse_TIL))

# convert sparse matrices to array to put into dataframe
LN_matrix = sparse_LN.toarray()
TIL_matrix = sparse_TIL.toarray()

### Set up data frame for TIL

In [None]:
### TIL data ##  
# barcodes = row names
barcodes = pd.read_csv(
    '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/TIL data/barcodes copy.tsv', 
    header=None)
barcodes = barcodes.rename(columns={0: 'barcode'})
TIL_barcodes = list(barcodes['barcode']) # of columns =  21343

# features = column names
feats = pd.read_csv(
    '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/TIL data/features.tsv', 
    header = None)
with open('/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/TIL data/features.tsv', 
          'r') as infile:
    reader = csv.reader(infile, delimiter='\t')
    feat_ls = []
    for row in reader:
        feat_ls.append(row)
TIL_features = [i[1] for i in feat_ls] # of rows = 31059

# Convert the dense matrix to a pandas DataFrame
TIL_df = pd.DataFrame(data=TIL_matrix, index=TIL_features, columns=TIL_barcodes)

# Remove rows for HTO1, HTO2, PD1, and TIM3 (ADT and HTO assays)
TIL_df_new = TIL_df.drop(['HTO1', 'HTO2','PD1', 'TIM3'])

### set up dataframe for LN

In [None]:
## LN data ## 
# barcodes = row names
barcodes = pd.read_csv(
    '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/LN data/barcodes copy.tsv', 
    header=None)
barcodes = barcodes.rename(columns={0: 'barcode'})
LN_barcodes = list(barcodes['barcode']) # of columns =  21343

# features = column names
feats = pd.read_csv(
    '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/LN data/features.tsv', header = None)
with open('/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/raw data/LN data/features.tsv', 
          'r') as infile:
    reader = csv.reader(infile, delimiter='\t')
    feat_ls = []
    for row in reader:
        feat_ls.append(row)
LN_features = [i[1] for i in feat_ls] # of rows = 31059

# Convert the dense matrix to a pandas DataFrame with columns as each barcode and rows as each gene
LN_df = pd.DataFrame(data=LN_matrix, index=LN_features, columns=LN_barcodes)

# Remove rows for HTO1, HTO2, PD1, and TIM3 (ADT and HTO assays)
LN_df_new = LN_df.drop(['hashtag1', 'hashtag2','hashtag3', 'PD1_hash', 'TIM3_hash'])

In [None]:
with open(pickle_path + 'TIL df set up.pkl', 'wb') as f:
    pickle.dump(TIL_df_new, f)
    
with open(pickle_path + 'LN df set up.pkl', 'wb') as f:
    pickle.dump(LN_df_new, f)

f.close()

## 3. Add cell type info from csv exported from ProjecTIL cell type annotation

In [5]:
with open(pickle_path + 'TIL df set up.pkl', 'rb') as f:
    TIL_df_new = pickle.load(f)

with open(pickle_path + 'LN df set up.pkl', 'rb') as f:
    LN_df_new = pickle.load(f)

In [6]:
#check to see St6galnac2 -->  non-identical columns but named the same 
n = TIL_df_new.transpose()
n = n.filter(regex='^St6galnac2$')
names = ['1','2']
n.columns = names
n
print(n['1'].equals(n['2']))

False


In [7]:
path_TIL='/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/TIL output/ProjecTIL analysis/TIL_transformed_identity.csv'
path_LN='/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/LN output/ProjecTIL analysis/LN_transformed_identity.csv'

# import ProjecTIL analysis output to add cell type annotation to raw glycogene data frame 
TIL_transformed_ident = pd.read_csv(path_TIL)
LN_transformed_ident = pd.read_csv(path_LN)

#rename column of ProjecTIL df for easy identification of barcodes
TIL_transformed_ident = TIL_transformed_ident.rename(columns={'Unnamed: 0':'Barcodes'}) 
LN_transformed_ident = LN_transformed_ident.rename(columns={'Unnamed: 0':'Barcodes'}) 

#Transpose matrix to match final TIL/LN matrix format
TIL_transformed_ident = TIL_transformed_ident.transpose()
LN_transformed_ident = LN_transformed_ident.transpose()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/TIL output/ProjecTIL analysis/TIL_transformed_identity.csv'

In [10]:
#RUn only once, else it keeps replacing index with top row. if you run too much, rerun previous cell first!! 
'''
make the column names the barcodes, which is how the glycogene 
dataframe (the one we want to combine this one with) is formatted as, to concatenate
'''
TIL_transformed_ident.set_axis(TIL_transformed_ident.iloc[0], axis=1, inplace=True)
TIL_transformed_ident = TIL_transformed_ident[1:] 

LN_transformed_ident.set_axis(LN_transformed_ident.iloc[0], axis=1, inplace=True)
LN_transformed_ident = LN_transformed_ident[1:]

In [None]:
#save as binarized pickle file 
with open(pickle_path + 'TIL_transformed_ident.pkl', 'wb') as f:
    pickle.dump(TIL_transformed_ident, f)
    
with open(pickle_path + 'LN_transformed_ident.pkl', 'wb') as f:
    pickle.dump(LN_transformed_ident, f)

f.close()

### Remove non-TILs from dataframe

In [12]:
# # filter original matrix by barcodes of cells identified as TILs via ProjecTILs #

# extract list of barcodes that have a T-cell annotation
TIL_barcodes = list(TIL_transformed_ident.columns)
LN_barcodes = list(LN_transformed_ident.columns)


filtered_TIL_df = TIL_df_new[TIL_barcodes]
filtered_LN_df = LN_df_new[LN_barcodes]

print('Original dataframe dimensions are ', TIL_df_new.shape, 
      'and the filtered dataframe dimensions are', filtered_TIL_df.shape)
print('Original dataframe dimensions are ', LN_df_new.shape, 
      'and the filtered dataframe dimensions are', filtered_LN_df.shape)

Original dataframe dimensions are  (31054, 19912) and the filtered dataframe dimensions are (31054, 19645)
Original dataframe dimensions are  (31054, 21343) and the filtered dataframe dimensions are (31054, 21296)


In [None]:
#save as binarized pickle file 
with open(pickle_path + 'filtered_TIL_df.pkl', 'wb') as f:
    pickle.dump(filtered_TIL_df, f)
    
with open(pickle_path + 'filtered_LN_df.pkl', 'wb') as f:
    pickle.dump(filtered_LN_df, f)

f.close()

In [None]:
##TAKES LONG TIME TO RUN##
'''
Add cell type information via concatenation
Note: the L-Pha information from this csv is normalized, and since we're using raw data we'll use the raw biotin info
'''
TIL_type = TIL_transformed_ident.loc[['Type']]
LN_type = LN_transformed_ident.loc[['Type']]

#makes new dataframe that adds T-cell type annotations to raw gene expression dataframe 
TIL_type_df = pd.concat([filtered_TIL_df, TIL_type])
LN_type_df = pd.concat([filtered_LN_df, LN_type])

In [None]:
with open(pickle_path + 'TIL_type_df.pkl', 'wb') as f:
    pickle.dump(TIL_type_df, f)
    
with open(pickle_path + 'LN_type_df.pkl', 'wb') as f:
    pickle.dump(LN_type_df, f)

f.close()

## 4. DO normalization and glycoscoring of T-cells via full transcriptome 

In [2]:
with open(pickle_path + 'TIL_type_df.pkl', 'rb') as f:
    TIL_type_df = pickle.load(f)

with open(pickle_path + 'LN_type_df.pkl', 'rb') as f:
    LN_type_df = pickle.load(f)

In [3]:
TIL_type_df

Unnamed: 0,AAACCTGAGATGGCGT-1,AAACCTGAGCGAGAAA-1,AAACCTGAGCTGGAAC-1,AAACCTGAGGATTCGG-1,AAACCTGAGTCTCAAC-1,AAACCTGCAAATACAG-1,AAACCTGCAAGCCCAC-1,AAACCTGCAATAGAGT-1,AAACCTGCAGATCCAT-1,AAACCTGCAGATGGGT-1,...,TTTGTCATCAACACGT-1,TTTGTCATCAAGGTAA-1,TTTGTCATCAGAAATG-1,TTTGTCATCAGGCAAG-1,TTTGTCATCAGTGTTG-1,TTTGTCATCATGTCTT-1,TTTGTCATCCAATGGT-1,TTTGTCATCGGATGGA-1,TTTGTCATCTACTTAC-1,TTTGTCATCTAGCACA-1
Xkr4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm1992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm37381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rp1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sox17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CAAA01118383.1,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,2,0,0,0,0
Vmn2r122,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CAAA01147332.1,1,1,0,0,0,3,4,0,0,0,...,0,2,2,0,0,0,0,0,1,0
Biotin,1052,799,231,507,197,440,2265,227,221,1014,...,1185,1078,329,260,204,221,173,247,174,361


In [5]:
toy = TIL_type_df[:-2]
cols_to_standardize = toy.columns
scaler = RobustScaler()

In [None]:
col1 = toy[1:]
col1
std = col1.columns
col1[std]=scaler.fit_transform(col1[std])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col1[std]=scaler.fit_transform(col1[std])


In [None]:
col1

In [None]:
toy[cols_to_standardize] = scaler.fit_transform(toy[cols_to_standardize])

In [None]:
# calculate the number of columns in each part
num_cols = toy.shape[1]
cols_per_part = int(np.ceil(num_cols / 4))

# select the columns for each part
part1 = toy.iloc[:, :cols_per_part]
part2 = toy.iloc[:, cols_per_part:2*cols_per_part]
part3 = toy.iloc[:, 2*cols_per_part:3*cols_per_part]
part4 = toy.iloc[:, 3*cols_per_part:]

In [None]:
scores_to_append = TIL_type_df.iloc[-2:]

norm_df = pd.concat([toy_df, scores_to_append])
norm_df = norm_df.transpose()

In [7]:
'''
Perform robust scaler normalization

This defines a function that normalizes expression across the different genes for each cell.

input:
- df with last 2 rows that are type, biotin
- rows = genes, columns = cells

output:
- scaled df with same dimensions
- df with columns as genes and rows as cells
'''

def robust_normalization(df):
    ## isolate just gene columns and standardize 
    toy_df = df.iloc[:-2]
    cols_to_standardize = toy_df.columns
    scaler = RobustScaler()
    toy_df[cols_to_standardize] = scaler.fit_transform(toy_df[cols_to_standardize])
    
    ## extract biotin, t-cell type, and L-PHA data from original dataframe and append to transformed data
    scores_to_append = df.iloc[-2:]

    norm_df = pd.concat([toy_df, scores_to_append])
    norm_df = norm_df.transpose()
   
    return norm_df  

In [7]:
'''
Now want to add Glycosylation (PHA-L) column that assigns:
Yes (1) to top 25%, 
No (0) to bottom 25%

categorize_lectin 
inputs:
- data_all: a dataframe that has
    - genes in the columns headers
    - barcodes as row indices
    - a column with biotin values to use as cutoffs for top 25% and bottom 25%
- quantile_high: upper quantile threshold (75% to represent top 25% of data below which 75% of the data falls)
- quantile_low: lower quantile threshold (25% to represent bottom 25% of data below which 25% of data falls)
- ref_col_loc: index of column in data_all that is used for categorization cutoffs

outputs:
- cutoff values for high and low biotin
- list of 3 arrays representing indices of rows that meet the high biotin cutoff, 
low biotin cutoff, and combined high and low cutoff
- number of rows that meet the high category cutoff and the low category cutoff.
'''

# Function: determine PHA-L read cut-offs for binary classification FROM BOJAR LAB
def categorize_lectin(data_all, quantile_high, quantile_low, ref_col_loc):
    cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()

    print(f"Cut-off for PHA-L high: {cutoff[0]}; Cut-off for PHA-L low: {cutoff[1]}")

    high_indices = np.array(data_all.loc[:,ref_col_loc]>=cutoff[0])
    low_indices = np.array(data_all.loc[:,ref_col_loc]<cutoff[1])
    high_low_indices = np.logical_or(high_indices, low_indices)

    high_count = high_indices.sum()
    low_count = low_indices.sum()

    return cutoff, [high_indices, low_indices, high_low_indices], [high_count, low_count]


'''
Function assigns binary glycoscore to input df

input:
- c = dataframe containing normalized counts of gene expression per cell
- Need last 2 columns to be 'Type' and 'Biotin' col
- genes on column, cell barcodes on row 

output:
- dataframe containing new column for L-PHA score 
- genes on column, barcodes on row 
'''

def glycoscore(c):
    #sort dataframe by 'Biotin values'
    c = c.sort_values(by='Biotin', ascending=False)
    # Parameters for categorize lectin function
    quantile_high, quantile_low = 0.75, 0.25
    ref_col = 'Biotin' #last column of dataframes contain biotin info

    #split df into quartiles 
    cutoff, indices, count = categorize_lectin(c, quantile_high, quantile_low, ref_col)

    # Assign 1 to top 25% and 0 to bottom 25%
    c.loc[indices[0], "PHA-L"] = 1
    c.loc[indices[1], "PHA-L"] = 0

    # Drop the middle two quartiles
    c = c.loc[indices[2], :]
    return c

In [None]:
TIL = TIL_type_df.copy()
LN = LN_type_df.copy()

In [None]:
n = TIL.transpose(a)
n = n.filter(regex='^St6galnac2$')
n

In [None]:
'''
Normalize gene counts across all genes for each T-cell

recall robust normalization will spit out TRANSPOSED dataframe
TAKES FOREVER
'''
normTIL = robust_normalization(TIL)
normLN = robust_normalization(LN)

In [None]:
# Rename Biotin_hash column in LN dataframe to 'Biotin' for easy glycoscoring via glycoscore function
pre_scoredLN = pre_scoredLN.rename(columns={'Biotin_hash': 'Biotin'})

In [None]:
# Assign glycoscores to each cell
TIL_glyconorm = = glycoscore(pre_scoredTIL)
LN_glyconorm = glycoscore(pre_scoredLN)

In [None]:
# save pickle of df
with open(pickle_path + 'full_TIL_glyconorm.pkl', 'wb') as f:
    pickle.dump(TIL_glyconorm, f)

## 5. Filter matrix to only include glycogenes

In [None]:
with open(pickle_path + 'full_TIL_glyconorm.pkl', 'rb') as f:
    full_TIL_glyconorm = pickle.load(f)

with open(pickle_path + 'full_LN_glyconorm.pkl', 'rb') as f:
    full_LN_glyconorm = pickle.load(f)

In [None]:
# FILTER the original matrix by the mouse glycogenes but keep biotin and type info
print('Total number of glycogenes looked for:', len(mouse_glycogenes))

#Look for glycogenes withixn TIL_type_df
TILglycogenes_found = [i for i in mouse_glycogenes if i in TIL_type_df.index]
TILglycogenes_notfound = [i for i in mouse_glycogenes if i not in TIL_type_df.index]

glycoTIL_df = TIL_type_df.loc[TILglycogenes_found + ['Biotin', 'Type']] #filter by glycogenes while keeping type&biotin
print('Number of glycogenes found in TILs:', len(glycoTIL_df)-2) #-2 accounts for extra type and biotin rows


#Look for glycogenes within LN_type_df
LNglycogenes_found = [i for i in mouse_glycogenes if i in LN_type_df.index]
LNglycogenes_notfound = [i for i in mouse_glycogenes if i not in LN_type_df.index]

glycoLN_df = LN_type_df.loc[LNglycogenes_found + ['Biotin_hash', 'Type']] #filter by glycogenes while keeping type&biotin
print('Number of glycogenes found in LNs:', len(glycoLN_df) - 2)

In [61]:
# n = glycoTIL_df.transpose()
# n = n.filter(regex='^St6galnac2$')
# n

Unnamed: 0,St6galnac2,St6galnac2.1
AAACCTGAGATGGCGT-1,0,0
AAACCTGAGCGAGAAA-1,0,0
AAACCTGAGCTGGAAC-1,0,0
AAACCTGAGGATTCGG-1,0,0
AAACCTGAGTCTCAAC-1,0,0
...,...,...
TTTGTCATCATGTCTT-1,0,0
TTTGTCATCCAATGGT-1,0,0
TTTGTCATCGGATGGA-1,0,0
TTTGTCATCTACTTAC-1,0,0


In [89]:
with open(pickle_path + 'glycoTIL_df.pkl', 'wb') as f:
    pickle.dump(glycoTIL_df, f)
    
with open(pickle_path + 'glycoLN_df.pkl', 'wb') as f:
    pickle.dump(glycoLN_df, f)

f.close()

In [None]:
'''
Couldn't find all the genes from glycogene list in dataset, see if we're missing any homologs etc...
THe same glycogenes that weren't found in TILs were also not found in LN
'''
TILnot = [i for i in mouse_glycogenes not in glycoTIL_df.index]
TILnot

# 6 Assign glycoscore

In [98]:
'''
RECALL:
glycoscore requires genes in columns and cells in rows
'''
glycoTIL_normscored = glycoscore(glycoTIL_df)
glycoLN_normscored = glycoscore(glycoLN_df)

Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()


Cut-off for PHA-L high: 738; Cut-off for PHA-L low: 237
Cut-off for PHA-L high: 276; Cut-off for PHA-L low: 127


In [111]:
normTIL

Unnamed: 0,B4galnt2,B4galnt4,Timm44,Dpy19l2,Bcap31,Mgat1,Hs3st4,Galnt3,B3galt2,A4gnt,...,Gcnt1,Mgat4d,Fut1,Alg1,Galnt13,B4galt4,Pdcd6,Mfng,Biotin,Type
AAACCTGAGATGGCGT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1052,Treg
AAACCTGAGCGAGAAA-1,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,799,Th1
AAACCTGAGCTGGAAC-1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,231,Treg
AAACCTGAGGATTCGG-1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,507,Th1
AAACCTGAGTCTCAAC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,197,CD8_EarlyActiv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCATGTCTT-1,0.0,1.0,2.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,221,CD8_EffectorMemory
TTTGTCATCCAATGGT-1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173,CD8_EffectorMemory
TTTGTCATCGGATGGA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247,CD8_NaiveLike
TTTGTCATCTACTTAC-1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,174,Th1


In [100]:
# save pickle of robust normalized, glycoscored df of just T-cells and 244 glycogenes (housekeeping included)
with open(pickle_path + 'glycoTIL_normscored.pkl', 'wb') as f:
    pickle.dump(glycoTIL_normscored, f)
    
with open(pickle_path + 'glycoLN_normscored.pkl', 'wb') as f:
    pickle.dump(glycoLN_normscored, f)

f.close()

# # ## load updated df from pickle
# pickle_in = open(pickle_path +"glycoTIL_normscored.pkl","rb")
# glycoTIL_normscored = pickle.load(pickle_in)

# pickle_in = open(pickle_path +"glycoLN_normscored.pkl","rb")
# glycoLN_normscored = pickle.load(pickle_in)

# 7. Split normalized df  into T-cell subsets

In [114]:
'''
SPLIT data from normLN and normTIL dataframes by t-cell subtype
store the dataframes into dictionary with t-cell type name as the key 

normLN and normTIL have cells in rows and genes + type + biotin in columns
'''

# Get list of T-cell subtypes for which to make a dataframe for TILs
tcell_subtypes = normTIL['Type'].unique()

#make dataframe names for later access
df_names = [i+'_df' for i in list(tcell_subtypes)]

# Make copy of original df just in case
splitTIL = normTIL.copy()
splitLN = normLN.copy()


TILtcell_dfs_sub = {}
LNtcell_dfs_sub = {}

# Make separate dataframe containing data for each t-cell subtype
for cell_type, name in zip(tcell_subtypes, df_names):
    TILtcell_dfs_sub[name] = splitTIL[splitTIL['Type'] == cell_type]
    LNtcell_dfs_sub[name] = splitLN[splitLN['Type'] == cell_type]

In [116]:
# save pickle 
with open(pickle_path + 'TILtcell_dfs_sub.pkl', 'wb') as f:
    pickle.dump(TILtcell_dfs_sub, f)

with open(pickle_path + 'LNtcell_dfs_sub.pkl', 'wb') as f:
    pickle.dump(LNtcell_dfs_sub, f)

# f.close()

# '''load dictionary containing dataframes of non-normalized glycogene dataframes of just T-cells'''
# pickle_in = open(pickle_path + "TILtcell_dfs_sub.pkl","rb")
# TILtcell_dfs_sub = pickle.load(pickle_in)
# pickle_in = open(pickle_path + "LNtcell_dfs_sub.pkl","rb")
# LNtcell_dfs_sub = pickle.load(pickle_in)

In [117]:
#extracting all dataframes from df dictionary
CD8_NaiveLike_df= TILtcell_dfs_sub['CD8_NaiveLike_df']
CD8_EffectorMemory_df = TILtcell_dfs_sub['CD8_EffectorMemory_df']
Th1_df = TILtcell_dfs_sub['Th1_df']
CD8_EarlyActiv_df = TILtcell_dfs_sub['CD8_EarlyActiv_df']
Treg_df = TILtcell_dfs_sub['Treg_df']
CD8_Tex_df = TILtcell_dfs_sub['CD8_Tex_df']
CD4_NaiveLike_df = TILtcell_dfs_sub['CD4_NaiveLike_df']
Tfh_df = TILtcell_dfs_sub['Tfh_df']
CD8_Tpex_df = TILtcell_dfs_sub['CD8_Tpex_df']

## 7B. Add glycoscores to each subtype df

In [121]:
TILglyconorm_sub_df = {}
LNglyconorm_sub_df = {}

# Make separate dataframe containing data for each t-cell subtype
for name in df_names:
    TILscored = glycoscore(TILtcell_dfs_sub[name])
    TILglyconorm_sub_df[name] = TILscored
    LNscored = glycoscore(LNtcell_dfs_sub[name])
    LNglyconorm_sub_df[name] = LNscored

Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolis

Cut-off for PHA-L high: 1426; Cut-off for PHA-L low: 519
Cut-off for PHA-L high: 2011; Cut-off for PHA-L low: 579
Cut-off for PHA-L high: 416; Cut-off for PHA-L low: 187
Cut-off for PHA-L high: 330; Cut-off for PHA-L low: 121
Cut-off for PHA-L high: 506; Cut-off for PHA-L low: 233
Cut-off for PHA-L high: 349; Cut-off for PHA-L low: 150
Cut-off for PHA-L high: 403; Cut-off for PHA-L low: 205


Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolis

Cut-off for PHA-L high: 253; Cut-off for PHA-L low: 126
Cut-off for PHA-L high: 1240; Cut-off for PHA-L low: 465
Cut-off for PHA-L high: 1691; Cut-off for PHA-L low: 624
Cut-off for PHA-L high: 648; Cut-off for PHA-L low: 231
Cut-off for PHA-L high: 400; Cut-off for PHA-L low: 133
Cut-off for PHA-L high: 873; Cut-off for PHA-L low: 297
Cut-off for PHA-L high: 937; Cut-off for PHA-L low: 343
Cut-off for PHA-L high: 325; Cut-off for PHA-L low: 162
Cut-off for PHA-L high: 182; Cut-off for PHA-L low: 105
Cut-off for PHA-L high: 500; Cut-off for PHA-L low: 215
Cut-off for PHA-L high: 379; Cut-off for PHA-L low: 182


Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolist()
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  cutoff = np.quantile(data_all.loc[:,ref_col_loc], [quantile_high, quantile_low], interpolation="nearest").tolis

In [131]:
'''
save pickle of dictionary containing dataframe for each t-cell subtype normalized and scored
- with genes in cols, cells in rows
'''
with open(pickle_path + 'TILglyconorm_sub_df.pkl', 'wb') as f:
    pickle.dump(TILglyconorm_sub_df, f)

with open(pickle_path + 'LNglyconorm_sub_df.pkl', 'wb') as f:
    pickle.dump(LNglyconorm_sub_df, f)

f.close()

# '''load dictionary containing dataframes of non-normalized glycogene dataframes of just T-cells'''
# pickle_in = open(pickle_path + "TILglyconorm_sub_df.pkl","rb")
# TILglyconorm_sub_df = pickle.load(pickle_in)
# pickle_in = open(pickle_path + "LNglyconorm_sub_df.pkl","rb")
# LNglyconorm_sub_df = pickle.load(pickle_in)

### ASIDE: Make a concatenated dataframe that combines LN and TIL t-cells

In [69]:
#only run once!!
glycoTIL = glycosorted_TIL.copy()
glycoLN = glycosorted_LN.copy()

#Add info about where t-cells are from (LN or TIL) just in case as column
glycoTIL['Location'] = ['TIL' for i in range(len(glycoTIL))]
glycoLN['Location'] = ['LN' for i in range(len(glycoLN))]

#add location info to barcode identifier
glycoTIL.index = [f"{x}_TIL" for x in glycoTIL.index]
glycoLN.index = [f"{x}_LN" for x in glycoLN.index]

In [74]:
combo_raw = pd.concat([glycoTIL, glycoLN])
print('Initial dimensions of TIL:', glycoTIL.shape)
print('Initial dimensions of LN:', glycoLN.shape)
print('Combined dimensions:', combo_raw.shape)

Initial dimensions of TIL: (9824, 265)
Initial dimensions of LN: (10595, 265)
Combined dimensions: (20419, 265)


In [3]:
combo_raw

NameError: name 'combo_raw' is not defined

In [6]:
# # save pickle 
# with open(pickle_path + 'combo_raw.pkl', 'wb') as f:
#     pickle.dump(combo_raw, f)


# f.close()

# open via: 
#load updated df from pickle
pickle_in = open(pickle_path +"combo_raw.pkl","rb")
combo_raw = pickle.load(pickle_in)


In [9]:
combo_raw[combo_raw['Type'] == 'Tfh']

Unnamed: 0,Ahsa1,Api5,Atp6v1e1,Bcap31,Cops6,Csnk2b,Eif3i,Eif4g2,Gdi2,Hnrnpf,...,Galnt13,Galnt15,Galnt14,Dse,Dsel,Glce,Type,Biotin norm.,PHA-L,Location
AATCCAGCATATGCTG-1_TIL,2,0,0,0,0,2,3,1,2,5,...,0,0,0,0,0,0,Tfh,13596,1.0,TIL
GCCTCTATCTGGTTCC-1_TIL,2,1,0,12,0,1,2,7,1,8,...,0,0,0,2,0,0,Tfh,4131,1.0,TIL
GACGTTACACGCCAGT-1_TIL,4,2,1,12,9,6,16,8,3,3,...,0,0,0,1,0,0,Tfh,2879,1.0,TIL
GGACAGACAGCTGTAT-1_TIL,0,0,0,12,0,0,0,1,1,4,...,0,0,0,1,0,0,Tfh,1781,1.0,TIL
TGACTTTTCACATAGC-1_TIL,8,3,5,7,3,3,2,23,5,27,...,0,0,0,1,1,1,Tfh,1234,1.0,TIL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATTGGACGTAAACACA-1_LN,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,Tfh,119,0.0,LN
CGGACGTAGTGGAGTC-1_LN,1,1,0,1,2,2,2,4,3,2,...,0,0,0,0,0,0,Tfh,112,0.0,LN
CTACATTCATTTCAGG-1_LN,0,1,0,0,2,0,0,4,1,4,...,0,0,0,0,0,0,Tfh,102,0.0,LN
TTGCGTCCAGTCAGCC-1_LN,0,1,0,0,0,1,2,1,1,5,...,0,0,0,0,0,0,Tfh,76,0.0,LN


### SPlit by subtype

In [46]:
'''
Now want to split up glycosorted dataframe into sub-dataframes for
each of the different T-cell subtypes. TO do this, this code block takes in the glycosorted
dataframe data as the input and spits out a dictionaryt containing:
- t-cell type as key 
- matrix containing raw expression data for dataframe as values

'''
# Get list of T-cell subtypes for which to make a dataframe for TILs
tcell_subtypes = glycosorted_TIL['Type'].unique()

#make dataframe names for later access
df_names = [i+'_df' for i in list(tcell_subtypes)]

# Make copy of original df just in case
split_df = glycosorted_TIL.copy()

TILtcell_dfs = {}

# Make separate dataframe containing data for each t-cell subtype
for cell_type, name in zip(tcell_subtypes, df_names):
    TILtcell_dfs[name] = split_df[split_df['Type'] == cell_type]
    
###----------------------------------------------------------------------
# Get list of T-cell subtypes for which to make a dataframe for LNs
tcell_subtypes = glycosorted_LN['Type'].unique()

#make dataframe names for later access
df_names = [i+'_df' for i in list(tcell_subtypes)]


# Make copy of original df just in case
split_df = glycosorted_LN.copy()

LNtcell_dfs = {}
# Make separate dataframe containing data for each t-cell subtype
for cell_type, name in zip(tcell_subtypes, df_names):
    LNtcell_dfs[name] = split_df[split_df['Type'] == cell_type]

In [47]:
# save pickle 
with open(pickle_path + 'TILtcell_dfs.pkl', 'wb') as f:
    pickle.dump(TILtcell_dfs, f)
    
with open(pickle_path + 'LNtcell_dfs.pkl', 'wb') as f:
    pickle.dump(LNtcell_dfs, f)

f.close()

# open via: 
# #load updated df from pickle
# pickle_in = open(pickle_path + "TILtcell_dfs.pkl","rb")
# TILtcell_dfs = pickle.load(pickle_in)

# pickle_in = open(pickle_path + "LNtcell_dfs.pkl","rb")
# LNtcell_dfs = pickle.load(pickle_in)