In [12]:
!pip install fancyimpute
import numpy as np
import pandas as pd
from fancyimpute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold



In [4]:
'''
pre-processes melanoma data

'''

'\npre-processes melanoma data\n\n'

In [5]:
# read in data

raw_data = pd.read_csv('/total_PD_1.csv')
raw_data.tail()

  raw_data = pd.read_csv('/total_PD_1.csv')


Unnamed: 0.1,Unnamed: 0,GSM2445716,GSM2445717,GSM2445718,GSM2445719,GSM2445720,GSM2445721,GSM2445722,GSM2445723,GSM2445724,...,SRR5088909,SRR5088911,SRR5088913,SRR5088914,SRR5088916,SRR5088920,SRR5088922,SRR5088924,SRR5088926,SRR5088929
56265,snoZ6,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56266,snosnR66,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56267,uc_338,,,,,,,,,,...,0.003664563208162,0.031978130771298,0.0857459707118245,0.04071053428664,0.0512846604842121,0.158733159882816,0.18278951659712,0.0657273971922831,0.0777414803400673,0.0
56268,yR211F11.2,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0355281824962792,0.0
56269,Response,N,N,N,N,N,N,N,N,N,...,N,R,N,N,N,N,N,R,N,R


In [6]:
# some re-formatting

# rename column
raw_data = raw_data.rename(columns={raw_data.columns[0]: 'patient'})
print(raw_data.shape)

#drop GSE931 as it is already log2 transformed
raw_data_drop = raw_data.drop(raw_data.columns[raw_data.columns.str.contains("GSM24457")], axis=1)
print(raw_data_drop.shape)

# transpose so columns are genes and rows are patients
t_data = raw_data_drop.set_index('patient').T

# t_data = raw_data.set_index('patient').T
print(t_data.shape)

# drop rows(patient) with resopnse = UNK
t_data = t_data[t_data["Response"] != "UNK"]
print(t_data.shape)

# split data
labels = t_data['Response']
clean_data = t_data.drop(columns = ['Response'])


(56270, 158)
(56270, 133)
(132, 56270)
(125, 56270)


In [9]:
# drop any genes with > 90% missing data

clean_data = clean_data.dropna(thresh = clean_data.shape[0]*0.9, axis=1)

# drop genes with >80% 0 input
zero_percent = (clean_data == 0). mean(axis = 0)
clean_data = clean_data.loc[:,zero_percent<0.8]
print(clean_data.shape)

(125, 36109)


In [25]:
# log2 transformation

# note that samples from the Melanoma dataset GSE931 are already log2 transformed

# add 1 to all values for log2 transformation

clean_data = clean_data.applymap(lambda x: x+1 if isinstance(x, (int, float)) else x)

# subset GSE931 samples from rest
# gse931 = clean_data[clean_data.index.str.contains('GSM24457')]

# to_transform = clean_data[~clean_data.index.str.contains("GSM24457")]

# log2 transform
to_transform = clean_data
log2_data = to_transform.applymap(lambda x: np.log2(x) if isinstance(x, (int, float)) else x)

# re-join data

# log2_data = pd.concat([gse931, log2_data], axis = 0)
log2_data.head()


patient,5S_rRNA,7SK,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,...,snoU2-30,snoU2_19,snoU83B,snoZ196,snoZ278,snoZ40,snoZ6,snosnR66,uc_338,yR211F11.2
SRR7344546,1.0,1.696987,1.091735,1.358947,1.026103,7.397094,1.677985,1.025746,1.066346,1.0,...,3.13092825610652,3.28747466891185,0.0,0.0,0.0,0.0,0.0,0.0,0.416319623212903,0.0
SRR7344554,1.065841,1.833735,1.035857,1.444616,1.007553,7.803274,2.856599,1.114802,1.061148,1.069391,...,13.6818810287859,1.57521656581416,0.0,0.0,0.0,0.0,0.0,0.0,0.364929374639851,0.0
SRR7344556,1.083375,1.073505,1.0,1.0,1.006349,4.680682,1.857244,1.008343,1.161227,1.05853,...,4.23564377253204,0.264727735783252,0.0,0.0,0.0,0.0,0.0,0.0,0.487394747280299,0.0
SRR7344564,1.140068,1.015369,1.172214,2.500735,1.053827,7.155594,1.414766,1.027947,1.308682,1.022212,...,0.226744499164846,0.0,0.360729885034982,2.85341392207446,0.0,0.0,0.0,0.0,0.0747493692947006,0.0
SRR7344565,1.069427,1.027033,1.125603,2.172815,1.049669,6.694065,1.229754,1.537043,1.104657,1.038999,...,0.0,0.437981509864241,0.0,1.57476497928716,0.0,0.0,0.0,0.0,0.0583043344721167,0.0


In [11]:
# z-score transformation (for PCA)

scaler = StandardScaler()
scaled_data = scaler.fit_transform(log2_data)

scaled_data = pd.DataFrame(scaled_data)
scaled_data.head()
print(scaled_data.shape)

(125, 36109)


In [27]:
# Filter top 100 genes by variance (for AI embedding)

gene_variances = log2_data.var(axis=0)

# create dataframe with gene variance
genes_variances_df = pd.DataFrame({
    'Gene': gene_variances.index,
    'Variance': gene_variances.values
})

# order genes in dataframe in descending order
genes_ordered_by_variance = genes_variances_df.sort_values(by='Variance', ascending=False)

# Top 100 genes
top100byvariance = genes_ordered_by_variance.head(100)
top100genes = top100byvariance.iloc[:, 0]
top100genes = top100genes.tolist()

# Extract these top 100 genes from original log2 transformed data
top100byvariance_data = log2_data.filter(items=top100genes)
print(top100byvariance_data.head)

<bound method NDFrame.head of patient      SNORA73B    SCARNA7      MT-TC      MT-TP   SCARNA5     MT-CO3  \
SRR7344546   9.731824   9.646297   6.229876   3.765837  5.757327   6.117420   
SRR7344554  11.598076  10.676854  10.213501   9.473020  8.031799   9.861927   
SRR7344556   6.025015   7.264096   9.361687   6.330630  5.481020  10.068468   
SRR7344564   3.739789   1.067773   8.573673   7.000488  1.000000  11.113846   
SRR7344565   4.208573   1.226365   8.601717   6.813453  1.071461  10.609543   
...               ...        ...        ...        ...       ...        ...   
SRR5088920   2.674031   1.000000  10.598941  10.858740  1.115100  13.427227   
SRR5088922   2.481037   1.000000  10.380735  11.593178  1.000000  13.260368   
SRR5088924   1.848072   1.000000   9.185120   9.154407  1.059689  12.450893   
SRR5088926   2.415050   1.000000   9.941028   9.934247  1.000000  14.031499   
SRR5088929   1.728713   1.331727   9.767131  10.993193  1.388574  13.889848   

patient        MT-ND3

  gene_variances = log2_data.var(axis=0)


In [28]:
# export '/content/total_PD_1.csv'

# imputed log2 transformed data
# imputed_data.to_csv('imputed_data_v3.csv')

#log2 transformed
log2_data.to_csv('log2_data_v3.csv')

# normalized version of above^ (for PCA)
scaled_data.to_csv('scaled_data_v3.csv')

# Top 100 genes filtered by variance (for AI embedding)
top100byvariance_data.to_csv('top100byvariance.csv')

# true labels (response y/n)
labels.to_csv('labels_v3.csv')