In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import csv
import pickle #to save notebook at sessions

from sklearn.preprocessing import RobustScaler

# Directory intended for saved pickles.
# NOTE(review): absolute, machine-specific path; no cell visible here reads it.
pickle_path = '/Users/erikazhang/Dropbox (MIT)/20.440 Biological Networks/project/python pickles/'


# Load the mouse glycogene collection that later cells use to filter the expression data.
with open('mouse_glycogenes.pkl', 'rb') as glycogene_file:
    mouse_glycogenes = pickle.load(glycogene_file)

In [3]:
# Record the pandas release this notebook was executed with.
pandas_version = pd.__version__
print(pandas_version)

2.0.1


## Defining key functions

In [None]:
# Function: determine PHA-L read cut-offs for binary classification FROM BOJAR LAB
def categorize_lectin(data_all, quantile_high, quantile_low, ref_col_loc):
    '''
    Compute high/low cut-offs on a reference column and flag the rows beyond them.

    Used to build the Glycosylation (PHA-L) label, which assigns:
    Yes (1) to top 25%,
    No (0) to bottom 25%

    inputs:
    - data_all: a dataframe that is expected to have
        - genes in the column headers
        - barcodes as row indices
        - a column with biotin values to use as cutoffs for top 25% and bottom 25%
      (only the reference column is read by this function)
    - quantile_high: upper quantile threshold (0.75 selects the top 25% of the data)
    - quantile_low: lower quantile threshold (0.25 selects the bottom 25% of the data)
    - ref_col_loc: label of the column in data_all used for the cutoffs
      (it is looked up with .loc, i.e. by label, not by position)

    outputs:
    - cutoff: [high cutoff, low cutoff] as a plain list
    - list of 3 boolean arrays, one entry per row of data_all, marking rows that
      meet the high cutoff, the low cutoff, and either of the two
    - [number of rows meeting the high cutoff, number of rows meeting the low cutoff]

    The high cutoff is inclusive (>=) while the low cutoff is exclusive (<).
    '''
    reference = data_all.loc[:, ref_col_loc]

    # `method=` replaces the `interpolation=` keyword, deprecated since NumPy 1.22.
    # "nearest" makes each cutoff an actual observed value of the reference column.
    cutoff = np.quantile(reference, [quantile_high, quantile_low], method="nearest").tolist()

    print(f"Cut-off for PHA-L high: {cutoff[0]}; Cut-off for PHA-L low: {cutoff[1]}")

    high_indices = np.array(reference >= cutoff[0])
    low_indices = np.array(reference < cutoff[1])
    high_low_indices = np.logical_or(high_indices, low_indices)

    high_count = high_indices.sum()
    low_count = low_indices.sum()

    return cutoff, [high_indices, low_indices, high_low_indices], [high_count, low_count]


def glycoscore(c):
    '''
    Assign a binary glycoscore ("PHA-L") to the cells of the input df.

    input:
    - c = dataframe containing normalized counts of gene expression per cell
    - genes on column, cell barcodes on row
    - must contain a 'Biotin' column (the original notes ask for 'Type' and
      'Biotin' to be the last 2 columns; only 'Biotin' is read here, by name)

    output:
    - dataframe sorted by descending 'Biotin' with a new "PHA-L" column:
      1 for the top quartile, 0 for the bottom quartile
    - rows in the middle two quartiles are dropped
    - genes on column, barcodes on row
    '''
    # Quartile thresholds and the reference column handed to categorize_lectin
    upper_q, lower_q = 0.75, 0.25
    biotin_col = 'Biotin'

    # Work on a copy ordered from highest to lowest biotin signal
    ranked = c.sort_values(by=biotin_col, ascending=False)

    # Only the boolean row masks are needed; cut-offs and counts are discarded
    _, (is_high, is_low, is_extreme), _ = categorize_lectin(ranked, upper_q, lower_q, biotin_col)

    # Label the top quartile 1 and the bottom quartile 0
    ranked.loc[is_high, "PHA-L"] = 1
    ranked.loc[is_low, "PHA-L"] = 0

    # Keep only the labelled (top or bottom quartile) cells
    return ranked.loc[is_extreme, :]

# This script randomly splits full T cell dataset into three random equally sized dataframes by cells to generate three independent models for statistical analysis for i) full transcriptome and ii) glycogene set. 

These dataframes are ALREADY NORMALIZED across the full transcriptome; no further normalization is required.

## 1. Split full transcriptome
### TILs

In [1]:
#import dataframe that has already been normalized across full transcriptome for each T cell
# The context manager closes the file handle once the pickle has been read
# (the bare open() used previously left it open).
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted pickles.
# NOTE(review): normTIL_full is not referenced by the later cells visible here, which use
# TIL_type_df_t instead - confirm how TIL_type_df_t is derived.
with open("normTIL_full.pkl", "rb") as pickle_in:
    normTIL_full = pickle.load(pickle_in)

NameError: name 'pickle' is not defined

In [None]:
# Drop repeated column labels, keeping the first occurrence of each.
# NOTE(review): TIL_type_df_t is not created in any cell visible here - confirm where it comes from.
is_repeated_column = TIL_type_df_t.columns.duplicated()
TIL_type_df_t = TIL_type_df_t.loc[:, ~is_repeated_column]

In [None]:
# Rows must be cells, columns must be genes.

# Randomly permute the rows; the fixed seed makes the permutation repeatable
TIL_type_shuf = TIL_type_df_t.sample(frac=1, random_state=42)

# Base size of each third and the number of rows left over
n_rows = len(TIL_type_shuf)
n_rows_3, remainder = divmod(n_rows, 3)

# Left-over rows go to the first split, then to the second
n_rows_1 = n_rows_3 + (1 if remainder >= 1 else 0)
n_rows_2 = n_rows_3 + (1 if remainder == 2 else 0)

# Cut the shuffled dataframe at the two boundaries
first_cut = n_rows_1
second_cut = n_rows_1 + n_rows_2
TIL_type_df_1 = TIL_type_shuf.iloc[:first_cut]
TIL_type_df_2 = TIL_type_shuf.iloc[first_cut:second_cut]
TIL_type_df_3 = TIL_type_shuf.iloc[second_cut:]

# Report the number of rows in each of the three dataframes
for split_df in (TIL_type_df_1, TIL_type_df_2, TIL_type_df_3):
    print(len(split_df))

In [None]:
# Persist each TIL split to its own pickle file
for pkl_name, split_df in (
    ('TIL_type_df_1.pkl', TIL_type_df_1),
    ('TIL_type_df_2.pkl', TIL_type_df_2),
    ('TIL_type_df_3.pkl', TIL_type_df_3),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(split_df, f)

### LNs

In [None]:
# Load the LN dataframe from its pickle. The context manager closes the file
# handle once the pickle has been read (the bare open() used previously left it open).
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted pickles.
# NOTE(review): normLN_full is not referenced by the later cells visible here, which use
# LN_type_df_t instead - confirm how LN_type_df_t is derived.
with open("normLN_full.pkl", "rb") as pickle_in:
    normLN_full = pickle.load(pickle_in)

In [None]:
# Drop repeated column labels, keeping the first occurrence of each.
# NOTE(review): LN_type_df_t is not created in any cell visible here - confirm where it comes from.
is_repeated_column = LN_type_df_t.columns.duplicated()
LN_type_df_t = LN_type_df_t.loc[:, ~is_repeated_column]

In [None]:
# Randomly permute the rows; the fixed seed makes the permutation repeatable
LN_type_shuf = LN_type_df_t.sample(frac=1, random_state=42)

# Base size of each third and the number of rows left over
n_rows = len(LN_type_shuf)
n_rows_3, remainder = divmod(n_rows, 3)

# Left-over rows go to the first split, then to the second
n_rows_1 = n_rows_3 + (1 if remainder >= 1 else 0)
n_rows_2 = n_rows_3 + (1 if remainder == 2 else 0)

# Cut the shuffled dataframe at the two boundaries
first_cut = n_rows_1
second_cut = n_rows_1 + n_rows_2
LN_type_df_1 = LN_type_shuf.iloc[:first_cut]
LN_type_df_2 = LN_type_shuf.iloc[first_cut:second_cut]
LN_type_df_3 = LN_type_shuf.iloc[second_cut:]

# Report the number of rows in each of the three dataframes
for split_df in (LN_type_df_1, LN_type_df_2, LN_type_df_3):
    print(len(split_df))

In [None]:
# Persist each LN split to its own pickle file
for pkl_name, split_df in (
    ('LN_type_df_1.pkl', LN_type_df_1),
    ('LN_type_df_2.pkl', LN_type_df_2),
    ('LN_type_df_3.pkl', LN_type_df_3),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(split_df, f)

## Glycoscore each subset of data for full transcriptome 

### for TILs

In [None]:
# Assign glycoscores to each cell.
# INPUT df must have genes, biotin score, and type in columns and cells in rows

# Reload the first TIL split from disk
with open('TIL_type_df_1.pkl', 'rb') as split_file:
    TIL_type_df_1 = pickle.load(split_file)

# Label top/bottom biotin quartiles and drop the middle ones
TIL_glyconorm_1 = glycoscore(TIL_type_df_1)

# Save the glycoscored split
with open('TIL_glyconorm_1.pkl', 'wb') as scored_file:
    pickle.dump(TIL_glyconorm_1, scored_file)

In [None]:
# Reload the second TIL split from disk
with open('TIL_type_df_2.pkl', 'rb') as split_file:
    TIL_type_df_2 = pickle.load(split_file)

# Label top/bottom biotin quartiles and drop the middle ones
TIL_glyconorm_2 = glycoscore(TIL_type_df_2)

# Save the glycoscored split
with open('TIL_glyconorm_2.pkl', 'wb') as scored_file:
    pickle.dump(TIL_glyconorm_2, scored_file)

In [None]:
# Reload the third TIL split from disk
with open('TIL_type_df_3.pkl', 'rb') as split_file:
    TIL_type_df_3 = pickle.load(split_file)

# Label top/bottom biotin quartiles and drop the middle ones
TIL_glyconorm_3 = glycoscore(TIL_type_df_3)

# Save the glycoscored split
with open('TIL_glyconorm_3.pkl', 'wb') as scored_file:
    pickle.dump(TIL_glyconorm_3, scored_file)

In [None]:
# Bundle the three glycoscored TIL dataframes in one dictionary keyed by split number
TILglyconorm_split = {
    '1': TIL_glyconorm_1,
    '2': TIL_glyconorm_2,
    '3': TIL_glyconorm_3,
}

# Save the bundle as a single pickle
with open('TILglyconorm_split.pkl', 'wb') as bundle_file:
    pickle.dump(TILglyconorm_split, bundle_file)

In [None]:
# Read the bundled TIL splits back from disk.
# The file must be opened in 'rb' mode: opening it with 'wb' truncates the pickle
# that was just written and makes pickle.load fail because the handle is write-only.
with open('TILglyconorm_split.pkl', 'rb') as f:
    TILglyconorm_split = pickle.load(f)
TILglyconorm_split

### for LNs

In [None]:
# Reload the first LN split from disk
with open('LN_type_df_1.pkl', 'rb') as split_file:
    LN_type_df_1 = pickle.load(split_file)

# Label top/bottom biotin quartiles and drop the middle ones
LN_glyconorm_1 = glycoscore(LN_type_df_1)

# Save the glycoscored split
with open('LN_glyconorm_1.pkl', 'wb') as scored_file:
    pickle.dump(LN_glyconorm_1, scored_file)

In [None]:
# Reload the second LN split from disk
with open('LN_type_df_2.pkl', 'rb') as split_file:
    LN_type_df_2 = pickle.load(split_file)

# Label top/bottom biotin quartiles and drop the middle ones
LN_glyconorm_2 = glycoscore(LN_type_df_2)

# Save the glycoscored split
with open('LN_glyconorm_2.pkl', 'wb') as scored_file:
    pickle.dump(LN_glyconorm_2, scored_file)

In [None]:
# Reload the third LN split from disk.
# The split is read from 'LN_type_df_3.pkl', the file written by the LN splitting
# step and the same naming used for splits 1 and 2; 'normLN_full_3.pkl' is not
# produced anywhere in the cells visible here.
with open('LN_type_df_3.pkl', 'rb') as f:
    LN_type_df_3 = pickle.load(f)

# Label top/bottom biotin quartiles and drop the middle ones
LN_glyconorm_3 = glycoscore(LN_type_df_3)

# Save the glycoscored split
with open('LN_glyconorm_3.pkl', 'wb') as f:
    pickle.dump(LN_glyconorm_3, f)

In [None]:
# Bundle the three glycoscored LN dataframes in one dictionary keyed by split number
LNglyconorm_split = {
    '1': LN_glyconorm_1,
    '2': LN_glyconorm_2,
    '3': LN_glyconorm_3,
}

# Save the bundle as a single pickle
with open('LNglyconorm_split.pkl', 'wb') as bundle_file:
    pickle.dump(LNglyconorm_split, bundle_file)


#### LNglyconorm_split and TILglyconorm_split
are dictionaries that each contain three dataframes (one per split). The data were normalized across the
full dataset before splitting, and each split was then glycoscored.


# 2. Filter split datasets to only include glycogenes (incl. housekeeping)

In [10]:
# These dataframes have genes+biotin+type+glycoscore in columns and cells in rows
# these dataframes ARE normalized already (normalized once before splitting)

# Glycoscored TIL splits
with open('TILglyconorm_split.pkl', 'rb') as til_file:
    TILglyconorm_split = pickle.load(til_file)

# Glycoscored LN splits
with open('LNglyconorm_split.pkl', 'rb') as ln_file:
    LNglyconorm_split = pickle.load(ln_file)

In [21]:
'''
filter the glycoscored TIL splits by the mouse glycogenes but keep biotin, type and PHA-L info
NOTE: each split is transposed first so that genes are in rows and cells in columns

output: list of dfs that have been filtered for glycogenes
'''
print('Total number of glycogenes looked for:', len(mouse_glycogenes))

#TIL
# Start from the glycoscored splits: they are the frames that carry the 'PHA-L'
# column. glycoscore() returns a new dataframe, so the raw TIL_type_df_* splits
# never receive that column and selecting 'PHA-L' from them raises a KeyError
# on a fresh kernel.
TIL1 = TILglyconorm_split['1'].transpose()
TIL2 = TILglyconorm_split['2'].transpose()
TIL3 = TILglyconorm_split['3'].transpose()
TIL_dfs = [TIL1, TIL2, TIL3]
glycoTIL_dfs = []

for df in TIL_dfs:
    # Glycogenes present / absent in this split's row index
    found = [i for i in mouse_glycogenes if i in df.index]
    notfound = [i for i in mouse_glycogenes if i not in df.index]
    # Keep the glycogene rows plus the three annotation rows
    glyco_df = df.loc[found + ['Biotin', 'Type', 'PHA-L']]
    print('Number of glycogenes found in TILs:', len(glyco_df)-3) #-3 accounts for extra type, biotin, score rows
    glycoTIL_dfs.append(glyco_df)

Total number of glycogenes looked for: 245
Number of glycogenes found in TILs: 240
Number of glycogenes found in TILs: 240
Number of glycogenes found in TILs: 240


In [25]:
#LN
# Start from the glycoscored splits: they are the frames that carry the 'PHA-L'
# column. glycoscore() returns a new dataframe, so the raw LN_type_df_* splits
# never receive that column and selecting 'PHA-L' from them raises a KeyError
# on a fresh kernel.
# The rename is kept as a safeguard; it changes nothing when no 'Biotin_hash'
# column is present.
LN1 = LNglyconorm_split['1'].rename(columns = {'Biotin_hash':'Biotin'}).transpose()
LN2 = LNglyconorm_split['2'].rename(columns = {'Biotin_hash':'Biotin'}).transpose()
LN3 = LNglyconorm_split['3'].rename(columns = {'Biotin_hash':'Biotin'}).transpose()
LN_dfs = [LN1, LN2, LN3]
glycoLN_dfs = []

for df in LN_dfs:
    # Glycogenes present / absent in this split's row index
    found = [i for i in mouse_glycogenes if i in df.index]
    notfound = [i for i in mouse_glycogenes if i not in df.index]
    # Keep the glycogene rows plus the three annotation rows
    glyco_df = df.loc[found + ['Biotin', 'Type', 'PHA-L']]
    print('Number of glycogenes found in LNs:', len(glyco_df)-3) #-3 accounts for extra type, biotin, score rows
    glycoLN_dfs.append(glyco_df)

Number of glycogenes found in LNs: 240
Number of glycogenes found in LNs: 240
Number of glycogenes found in LNs: 240


In [None]:
# Key the glycogene-filtered TIL dataframes by split number
glycoTIL_dict = {
    '1': glycoTIL_dfs[0],
    '2': glycoTIL_dfs[1],
    '3': glycoTIL_dfs[2],
}

# Key the glycogene-filtered LN dataframes by split number
glycoLN_dict = {
    '1': glycoLN_dfs[0],
    '2': glycoLN_dfs[1],
    '3': glycoLN_dfs[2],
}

In [None]:
# Save the glycogene-filtered, glycoscored splits.
# The dictionaries built in the previous cell are glycoTIL_dict / glycoLN_dict;
# the names glycoTIL_normscored_split / glycoLN_normscored_split are never
# defined in the cells visible here and raised a NameError.
with open('glycoTIL_normscored_split.pkl', 'wb') as f:
    pickle.dump(glycoTIL_dict, f)
with open('glycoLN_normscored_split.pkl', 'wb') as f:
    pickle.dump(glycoLN_dict, f)