In [41]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import linregress

In [62]:
# Read the tab-separated file 'GSE139495_1.tsv' into data3, then preview up to 15 rows.
data3 = pd.read_table('scRNA_Data/GSE139495_1.tsv', sep='\t')
data3.head(15)

Unnamed: 0,cells,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,alra_snn_res.0.5,seurat_clusters,samples
0,ACAGCTAAGGGTTCCC,Unfiltered_control,7,4,0.0,9,9,Unfiltered_control
1,ACATACGTCATTTGGG,Unfiltered_control,7,7,0.03601,9,9,Unfiltered_control
2,ACATCAGAGTCGTACT,Unfiltered_control,11,6,0.0,9,9,Unfiltered_control
3,AGAGTGGGTGCAACTT,Unfiltered_control,7,4,0.150754,9,9,Unfiltered_control
4,AGGCCACGTTCGCGAC,Unfiltered_control,9,2,0.0,9,9,Unfiltered_control
5,AGTGGGACATCCGGGT,Unfiltered_control,9,4,0.0,9,9,Unfiltered_control
6,ATCACGAGTCCAGTTA,Unfiltered_control,8,8,0.0,9,9,Unfiltered_control
7,ATCATGGGTCCGACGT,3,9,8,0.0,9,9,Filtered_samples
8,ATTACTCCAGATCTGT,Unfiltered_control,12,5,0.0,9,9,Unfiltered_control
9,CAAGATCGTCCATCCT,3,13,5,0.0,9,9,Filtered_samples


In [40]:
# Show the column labels of data3.
# NOTE(review): the eight labels in the recorded output are only what this cell
# shows when it runs before data3 is reduced to ['cells', 'orig.ident'] in a later cell.
data3.columns

Index(['cells', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt',
       'alra_snn_res.0.5', 'seurat_clusters', 'samples'],
      dtype='object')

In [58]:
# List the distinct labels present in the 'orig.ident' column.
# NOTE(review): the recorded output holds strings only (dtype=object) and, besides
# the numeric-looking labels, includes 'Unfiltered_control', '8,9,10' and
# 'Failed_capture' — presumably these need separate handling when splitting by patient; confirm.
data3['orig.ident'].unique()

array(['Unfiltered_control', '3', '8,9,10', '2', '1', '4', '6', '7', '5',
       '11', 'Failed_capture'], dtype=object)

In [66]:
# Narrow data3 to the 'cells' and 'orig.ident' columns (all rows kept), then preview it.
data3 = data3.loc[:, ['cells', 'orig.ident']]
data3.head()

Unnamed: 0,cells,orig.ident
0,ACAGCTAAGGGTTCCC,Unfiltered_control
1,ACATACGTCATTTGGG,Unfiltered_control
2,ACATCAGAGTCGTACT,Unfiltered_control
3,AGAGTGGGTGCAACTT,Unfiltered_control
4,AGGCCACGTTCGCGAC,Unfiltered_control


In [78]:
# Build data3_renamed with 'orig.ident' relabelled as 'patient_number';
# rename returns a new frame, so data3 keeps its original column name.
data3_renamed = data3.rename({'orig.ident': 'patient_number'}, axis='columns')
data3_renamed

Unnamed: 0,cells,patient_number
0,ACAGCTAAGGGTTCCC,Unfiltered_control
1,ACATACGTCATTTGGG,Unfiltered_control
2,ACATCAGAGTCGTACT,Unfiltered_control
3,AGAGTGGGTGCAACTT,Unfiltered_control
4,AGGCCACGTTCGCGAC,Unfiltered_control
...,...,...
12325,TGCTACCTCTCGCATC,8910
12326,TGGCTGGTCGGTGTTA,Unfiltered_control
12327,TTAGGACTCCACGACG,3
12328,TTGGAACGTCCCTTGT,Unfiltered_control


In [None]:
# 1. Separate data3_renamed into 2 different dfs: one for Unfiltered_control, the other for metastatic patients (1 through 11)
# 2. We will exclude weird values in data4. (Values with - and .)(Optional)
# 3. We will assign patients (control and metastatic) to cells in data4.
# 4. We will train our datasets 
# 5. Elbow method (Deidra)
# 6. We will make our clusters 
# 7. We will assign names to the clusters (immune cell signature genes or profile) (Nurmaa)
# 8. We will compare control and metastatic clusters to see if there are any differences in immune cell profiles (clusters)
# 9. Which gene is highest in the metastatic CTCs (Drivers of metastasis, so targeting these genes may reduce metastasis)
# 10. Which gene is lowest in the metastatic CTCs (Suppressed genes in metastasis, increasing those genes may reduce metastasis and improve patient survival)

In [80]:
# Read the tab-separated file 'GSE139495_2.tsv' into data4, then preview its first rows.
data4 = pd.read_table('scRNA_Data/GSE139495_2.tsv', sep='\t')
data4.head()

Unnamed: 0,ACAGCTAAGGGTTCCC,ACATACGTCATTTGGG,ACATCAGAGTCGTACT,AGAGTGGGTGCAACTT,AGGCCACGTTCGCGAC,AGTGGGACATCCGGGT,ATCACGAGTCCAGTTA,ATCATGGGTCCGACGT,ATTACTCCAGATCTGT,CAAGATCGTCCATCCT,...,TCGGGACAGGACTGGT,TCTATTGAGCCTCGTG,TCTTTCCAGTCTTGCA,TGCCAAAAGCGTCAAG,TGCGGGTTCCTCATTA,TGCTACCTCTCGCATC,TGGCTGGTCGGTGTTA,TTAGGACTCCACGACG,TTGGAACGTCCCTTGT,TTTACTGCATCCCACT
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Show the column labels of data4.
# NOTE(review): the labels in the recorded output look like the barcodes stored in
# data3['cells'] — presumably one column per cell; confirm before joining the two tables.
data4.columns

Index(['ACAGCTAAGGGTTCCC', 'ACATACGTCATTTGGG', 'ACATCAGAGTCGTACT',
       'AGAGTGGGTGCAACTT', 'AGGCCACGTTCGCGAC', 'AGTGGGACATCCGGGT',
       'ATCACGAGTCCAGTTA', 'ATCATGGGTCCGACGT', 'ATTACTCCAGATCTGT',
       'CAAGATCGTCCATCCT',
       ...
       'TCGGGACAGGACTGGT', 'TCTATTGAGCCTCGTG', 'TCTTTCCAGTCTTGCA',
       'TGCCAAAAGCGTCAAG', 'TGCGGGTTCCTCATTA', 'TGCTACCTCTCGCATC',
       'TGGCTGGTCGGTGTTA', 'TTAGGACTCCACGACG', 'TTGGAACGTCCCTTGT',
       'TTTACTGCATCCCACT'],
      dtype='object', length=12341)