In [24]:
import pandas as pd
import os

### EPIC

In [25]:
# GC: load the feature matrix, attach the class labels, and export to CSV.
print('********* GC *********')
# The TSV is read with its first column as the index and then transposed, so the
# original column headers become the row index of `gc`.
# NOTE: the name `gc` shadows the stdlib `gc` module; it is kept because later
# cells refer to this dataframe by that name.
gc = pd.read_csv('../data/old/matrix.tsv', sep='\t', index_col=0).T
gc = gc[gc.index != 'Sample name']
gc = gc.sort_index()
print(f'Dataset shape: {gc.shape}')
print(f'Number of features: {gc.shape[1]}')
print(f'Number of missing values: {gc.isnull().sum().sum()}')
gc_labels = pd.read_csv('../data/old/batches.tsv', sep='\t', index_col=0)
gc_labels = gc_labels[gc_labels.index != 'Sample name']
print(gc_labels['Factor1'].value_counts())
print('*****************************')

# Create GC-MS dataset
# `gc` was sorted by its index above while `gc_labels` keeps the file order, so a
# purely positional assignment could attach labels to the wrong rows. Look the
# labels up by index instead, and fail loudly if a row has no label.
unlabelled = gc.index.difference(gc_labels.index)
if len(unlabelled) > 0:
    raise ValueError(
        f'{len(unlabelled)} row(s) of the GC matrix have no entry in batches.tsv: '
        f'{list(unlabelled[:5])}'
    )
gc['Factor1'] = gc_labels.loc[gc.index, 'Factor1'].values
# index=False: the row index is not written to the file.
gc.to_csv('../data/old/gc_mc.csv', index=False)
print()

********* GC *********
Dataset shape: (78, 60)
Number of features: 60
Number of missing values: 0
CASE       39
CONTROL    39
Name: Factor1, dtype: int64
*****************************



In [27]:
# LC-MS (+): load the table, print a short summary, and export it to CSV.
print('********* LC-MS (+) *********')
# Use the 'Samples' column as the index and order the rows by it.
df_np = (
    pd.read_csv('../data/old/EPIC_norm_pos.txt', sep='\t')
    .set_index('Samples')
    .sort_index()
)
print(f'Dataset shape: {df_np.shape}')
# Column count minus one — presumably to exclude the 'group' label column.
print(f'Number of features: {len(df_np.columns) - 1}')
print(f'Number of missing values: {df_np.isna().sum().sum()}')
print(df_np['group'].value_counts())
print('*****************************')

# index=False: the 'Samples' index is not written to the file.
df_np.to_csv(path_or_buf='../data/old/lc_ms_pos.csv', index=False)
print()

********* LC-MS (+) *********
Dataset shape: (78, 510)
Number of features: 509
Number of missing values: 2
Factor1:CASE       39
Factor1:CONTROL    39
Name: group, dtype: int64
*****************************



In [29]:
# CE-MS: load the table, print a short summary, and export it to CSV.
# NOTE(review): the input file is named 'EPIC_rev_pos.txt' while this cell is
# titled CE-MS — confirm that this file really holds the CE-MS data.
print('********* CE-MS *********')
# Use the 'Samples' column as the index and order the rows by it.
df_rp = (
    pd.read_csv('../data/old/EPIC_rev_pos.txt', sep='\t')
    .set_index('Samples')
    .sort_index()
)
print(f'Dataset shape: {df_rp.shape}')
# Column count minus one — presumably to exclude the 'group' label column.
print(f'Number of features: {len(df_rp.columns) - 1}')
print(f'Number of missing values: {df_rp.isna().sum().sum()}')
print(df_rp['group'].value_counts())
print('*****************************')

# index=False: the 'Samples' index is not written to the file.
df_rp.to_csv(path_or_buf='../data/old/ce_ms.csv', index=False)
print()

********* CE-MS *********
Dataset shape: (78, 330)
Number of features: 329
Number of missing values: 0
Factor1:CASE       39
Factor1:CONTROL    39
Name: group, dtype: int64
*****************************



In [32]:
# LC-MS (-): load the table, print a short summary, and export it to CSV.
print('********* LC-MS (-) *********')
# Use the 'Samples' column as the index and order the rows by it.
df_rn = (
    pd.read_csv('../data/old/EPIC_rev_neg.txt', sep='\t')
    .set_index('Samples')
    .sort_index()
)
print(f'Dataset shape: {df_rn.shape}')
# Column count minus one — presumably to exclude the 'group' label column.
print(f'Number of features: {len(df_rn.columns) - 1}')
print(f'Number of missing values: {df_rn.isna().sum().sum()}')
print(df_rn['group'].value_counts())
print('*****************************')

# index=False: the 'Samples' index is not written to the file.
df_rn.to_csv(path_or_buf='../data/old/lc_ms_neg.csv', index=False)
print()

********* LC-MS (-) *********
Dataset shape: (78, 533)
Number of features: 532
Number of missing values: 2660
Factor1:CASE       39
Factor1:CONTROL    39
Name: group, dtype: int64
*****************************



In [38]:
# Remove the 'group' label column from the three LC/CE tables before merging.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df_np = df_np.drop(columns='group', errors='ignore')
df_rp = df_rp.drop(columns='group', errors='ignore')
df_rn = df_rn.drop(columns='group', errors='ignore')
# Overwrite df_rn's index with df_np's; the assignment is positional.
# NOTE(review): this assumes both frames hold the same samples in the same row
# order — confirm against the source files.
df_rn.index = df_np.index

In [42]:
# Join the four tables side by side; pandas aligns the rows on the index.
frames = [gc, df_np, df_rn, df_rp]
df = pd.concat(frames, axis='columns')
print(df.shape)
# Class balance of the merged table ('Factor1' comes from the GC frame).
df['Factor1'].value_counts()

(78, 1431)


CASE       39
CONTROL    39
Name: Factor1, dtype: int64

In [43]:
# Persist the merged table; index=False leaves the row index out of the CSV.
df.to_csv(
    path_or_buf='../data/old/composite_dataset.csv',
    index=False,
)

### NHS

In [47]:
# Open the multi-sheet NHS workbook once and list the sheets it contains.
xlsx = pd.ExcelFile('../data/old/NHS_data.xlsx')
print(xlsx.sheet_names)

# Load the two 'Figure 1' sheets, one dataframe per sheet.
df_nhs1 = pd.read_excel(xlsx, sheet_name='Figure 1 DN Control')
df_nhs2 = pd.read_excel(xlsx, sheet_name='Figure 1 Medicated Control')

['Figure 1 DN Control', 'Figure 1 Medicated Control', 'Figure 2 & SI Figure S2', 'Figure 3', 'Figure 4', 'SI Figure S1', 'SI Figure S3']


In [48]:
# Use the 'repidx' column as the row index of both NHS tables.
df_nhs1 = df_nhs1.set_index('repidx')
df_nhs2 = df_nhs2.set_index('repidx')

In [49]:
# Summarise and export both NHS tables. Each entry is (banner, frame, output path).
# NOTE(review): df_nhs1 is loaded from the 'Figure 1 DN Control' sheet and df_nhs2
# from 'Figure 1 Medicated Control'; confirm that the banners and file names
# below describe those sheets correctly.
nhs_reports = [
    # LC-MS (+) - Healthy control
    ('********* Healthy control *********', df_nhs1, '../data/nhs_healthy.csv'),
    # LC-MS (+) - Drug naïve
    ('********* Drug naïve *********', df_nhs2, '../data/nhs_drug_naive.csv'),
]
for banner, nhs_df, out_path in nhs_reports:
    print(banner)
    print(f'Dataset shape: {nhs_df.shape}')
    # Column count minus one — presumably to exclude the 'label' column.
    print(f'Number of features: {nhs_df.shape[1]-1}')
    print(f'Number of missing values: {nhs_df.isnull().sum().sum()}')
    print(nhs_df['label'].value_counts())
    # The index is written too (no index=False here).
    nhs_df.to_csv(out_path)
    print('*****************************')
    print()

********* Healthy control *********
Dataset shape: (136, 6503)
Number of features: 6502
Number of missing values: 0
2    80
1    56
Name: label, dtype: int64
*****************************

********* Drug naïve *********
Dataset shape: (194, 6503)
Number of features: 6502
Number of missing values: 0
3    138
1     56
Name: label, dtype: int64
*****************************

