# OBTAIN SAREK INPUT FILE FOR LUNG SAMPLE 

CREATE A CSV INPUT FILE FOR SAREK PIPELINE so it contains these structure:

patient,sex,status,sample,lane,fastq_1,fastq_2
patient1,XX,0,normal_sample,lane_1,test_L001_1.fastq.gz,test_L001_2.fastq.gz
patient1,XX,0,normal_sample,lane_2,test_L002_1.fastq.gz,test_L002_2.fastq.gz
patient1,XX,0,normal_sample,lane_3,test_L003_1.fastq.gz,test_L003_2.fastq.gz
patient1,XX,1,tumor_sample,lane_1,test2_L001_1.fastq.gz,test2_L001_2.fastq.gz
patient1,XX,1,tumor_sample,lane_2,test2_L002_1.fastq.gz,test2_L002_2.fastq.gz
patient1,XX,1,relapse_sample,lane_1,test3_L001_1.fastq.gz,test3_L001_2.fastq.gz

In [1]:
# Import pandas and read excel file with sequencing information
import pandas as pd
ruta = '/workspace/projects/sjd_melos/sequencing/LOPEBIG_57.xls'
metadata_df = pd.read_excel(ruta,skiprows= 2)
metadata_df

Unnamed: 0,FLOWCELL,LANE,MULTIPLEX INDEX,APPLICATION,LIBRARY BARCODE (internal id),SAMPLE BARCODE,SAMPLE NAME,SPECIES,MACHINE,FLI,LABCENTER
0,HVFHVDSX7,1,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_1_67UDI-idt-UMI,
1,HVFHVDSX7,2,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_2_67UDI-idt-UMI,
2,HVFHVDSX7,3,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_3_67UDI-idt-UMI,
3,HVFHVDSX7,4,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_4_67UDI-idt-UMI,
4,HVFKKDSX7,1,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_1_67UDI-idt-UMI,
5,HVFKKDSX7,2,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_2_67UDI-idt-UMI,
6,HVFKKDSX7,3,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_3_67UDI-idt-UMI,
7,HVFKKDSX7,4,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_4_67UDI-idt-UMI,


In [2]:
# Rename columns with no ID or confusing names
metadata_df = metadata_df.rename(columns={'FLI':'FASTQ_ID','SAMPLE BARCODE':'SAMPLE_BARCODE'})
metadata_df

Unnamed: 0,FLOWCELL,LANE,MULTIPLEX INDEX,APPLICATION,LIBRARY BARCODE (internal id),SAMPLE_BARCODE,SAMPLE NAME,SPECIES,MACHINE,FASTQ_ID,LABCENTER
0,HVFHVDSX7,1,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_1_67UDI-idt-UMI,
1,HVFHVDSX7,2,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_2_67UDI-idt-UMI,
2,HVFHVDSX7,3,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_3_67UDI-idt-UMI,
3,HVFHVDSX7,4,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,01NA,HVFHVDSX7_4_67UDI-idt-UMI,
4,HVFKKDSX7,1,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_1_67UDI-idt-UMI,
5,HVFKKDSX7,2,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_2_67UDI-idt-UMI,
6,HVFKKDSX7,3,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_3_67UDI-idt-UMI,
7,HVFKKDSX7,4,67UDI-idt-UMI,WG-Seq,4384AI,AX4941,B20-586,Homo sapiens,02NB,HVFKKDSX7_4_67UDI-idt-UMI,


In [3]:
# Remove unnecessary data of metadata tab for the sarek input file
sarek_df = metadata_df.drop(['MULTIPLEX INDEX','LIBRARY BARCODE (internal id)','SAMPLE NAME','SPECIES','MACHINE','LABCENTER'], axis=1)

# Modify LANE so it contains the number of the flowcell 

sarek_df['LANE']='_' + sarek_df['LANE'].astype(str)
sarek_df['lane'] = sarek_df['FLOWCELL'] + sarek_df['LANE']

# Insert the columns related to information of the patient
patient = 'patient1'
sarek_df['PATIENT'] = patient
sarek_df.head()

Unnamed: 0,FLOWCELL,LANE,APPLICATION,SAMPLE_BARCODE,FASTQ_ID,lane,PATIENT
0,HVFHVDSX7,_1,WG-Seq,AX4941,HVFHVDSX7_1_67UDI-idt-UMI,HVFHVDSX7_1,patient1
1,HVFHVDSX7,_2,WG-Seq,AX4941,HVFHVDSX7_2_67UDI-idt-UMI,HVFHVDSX7_2,patient1
2,HVFHVDSX7,_3,WG-Seq,AX4941,HVFHVDSX7_3_67UDI-idt-UMI,HVFHVDSX7_3,patient1
3,HVFHVDSX7,_4,WG-Seq,AX4941,HVFHVDSX7_4_67UDI-idt-UMI,HVFHVDSX7_4,patient1
4,HVFKKDSX7,_1,WG-Seq,AX4941,HVFKKDSX7_1_67UDI-idt-UMI,HVFKKDSX7_1,patient1


In [4]:
# Insert the columns related to information of the patient
patient = 'patient1'
sarek_df['PATIENT'] = patient
sarek_df

Unnamed: 0,FLOWCELL,LANE,APPLICATION,SAMPLE_BARCODE,FASTQ_ID,lane,PATIENT
0,HVFHVDSX7,_1,WG-Seq,AX4941,HVFHVDSX7_1_67UDI-idt-UMI,HVFHVDSX7_1,patient1
1,HVFHVDSX7,_2,WG-Seq,AX4941,HVFHVDSX7_2_67UDI-idt-UMI,HVFHVDSX7_2,patient1
2,HVFHVDSX7,_3,WG-Seq,AX4941,HVFHVDSX7_3_67UDI-idt-UMI,HVFHVDSX7_3,patient1
3,HVFHVDSX7,_4,WG-Seq,AX4941,HVFHVDSX7_4_67UDI-idt-UMI,HVFHVDSX7_4,patient1
4,HVFKKDSX7,_1,WG-Seq,AX4941,HVFKKDSX7_1_67UDI-idt-UMI,HVFKKDSX7_1,patient1
5,HVFKKDSX7,_2,WG-Seq,AX4941,HVFKKDSX7_2_67UDI-idt-UMI,HVFKKDSX7_2,patient1
6,HVFKKDSX7,_3,WG-Seq,AX4941,HVFKKDSX7_3_67UDI-idt-UMI,HVFKKDSX7_3,patient1
7,HVFKKDSX7,_4,WG-Seq,AX4941,HVFKKDSX7_4_67UDI-idt-UMI,HVFKKDSX7_4,patient1


In [5]:
# Reorder the columns
sarek_df = sarek_df[['PATIENT','SAMPLE_BARCODE', 'lane', 'FASTQ_ID']]

# Add column status: 0 is the reference sample, 1 for tumor samples
status ={'AX4941':'1'}
sarek_df['status']=sarek_df['SAMPLE_BARCODE'].map(status)
sarek_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sarek_df['status']=sarek_df['SAMPLE_BARCODE'].map(status)


Unnamed: 0,PATIENT,SAMPLE_BARCODE,lane,FASTQ_ID,status
0,patient1,AX4941,HVFHVDSX7_1,HVFHVDSX7_1_67UDI-idt-UMI,1
1,patient1,AX4941,HVFHVDSX7_2,HVFHVDSX7_2_67UDI-idt-UMI,1
2,patient1,AX4941,HVFHVDSX7_3,HVFHVDSX7_3_67UDI-idt-UMI,1
3,patient1,AX4941,HVFHVDSX7_4,HVFHVDSX7_4_67UDI-idt-UMI,1
4,patient1,AX4941,HVFKKDSX7_1,HVFKKDSX7_1_67UDI-idt-UMI,1
5,patient1,AX4941,HVFKKDSX7_2,HVFKKDSX7_2_67UDI-idt-UMI,1
6,patient1,AX4941,HVFKKDSX7_3,HVFKKDSX7_3_67UDI-idt-UMI,1
7,patient1,AX4941,HVFKKDSX7_4,HVFKKDSX7_4_67UDI-idt-UMI,1


In [6]:
# Add information about the path with new columns
ruta2 = '/workspace/datasafe/sjd_seq/20240119/FASTQ/'
sarek_df['PATH'] = ruta2

# Then reorder
sarek_df = sarek_df[['PATIENT','SAMPLE_BARCODE', 'lane','PATH','FASTQ_ID','status']]

# Add a new column with the file information related to the Fw/Rv file name
F_values = ['_1.fastq.gz'] * len(sarek_df) # we need that this value is applied in all the rows until the end
R_values = ['_2.fastq.gz'] * len(sarek_df)

sarek_df['F_values'] = F_values # add it into a new column
sarek_df['R_values'] = R_values

# Join the two columns of the dataframe corresponding to path and F/R route.
sarek_df['FASTQ_ID_F'] = sarek_df['FASTQ_ID'].astype(str) + sarek_df['F_values']
sarek_df['FASTQ_ID_R'] = sarek_df['FASTQ_ID'].astype(str) + sarek_df['R_values']

# Join the file names
sarek_df['PATH_FASTQ_F'] = sarek_df['PATH'] + sarek_df['FASTQ_ID_F']
sarek_df['PATH_FASTQ_R'] = sarek_df['PATH'] + sarek_df['FASTQ_ID_R']
sarek_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sarek_df['PATH'] = ruta2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sarek_df['F_values'] = F_values # add it into a new column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sarek_df['R_values'] = R_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

Unnamed: 0,PATIENT,SAMPLE_BARCODE,lane,PATH,FASTQ_ID,status,F_values,R_values,FASTQ_ID_F,FASTQ_ID_R,PATH_FASTQ_F,PATH_FASTQ_R
0,patient1,AX4941,HVFHVDSX7_1,/workspace/datasafe/sjd_seq/20240119/FASTQ/,HVFHVDSX7_1_67UDI-idt-UMI,1,_1.fastq.gz,_2.fastq.gz,HVFHVDSX7_1_67UDI-idt-UMI_1.fastq.gz,HVFHVDSX7_1_67UDI-idt-UMI_2.fastq.gz,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
1,patient1,AX4941,HVFHVDSX7_2,/workspace/datasafe/sjd_seq/20240119/FASTQ/,HVFHVDSX7_2_67UDI-idt-UMI,1,_1.fastq.gz,_2.fastq.gz,HVFHVDSX7_2_67UDI-idt-UMI_1.fastq.gz,HVFHVDSX7_2_67UDI-idt-UMI_2.fastq.gz,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
2,patient1,AX4941,HVFHVDSX7_3,/workspace/datasafe/sjd_seq/20240119/FASTQ/,HVFHVDSX7_3_67UDI-idt-UMI,1,_1.fastq.gz,_2.fastq.gz,HVFHVDSX7_3_67UDI-idt-UMI_1.fastq.gz,HVFHVDSX7_3_67UDI-idt-UMI_2.fastq.gz,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
3,patient1,AX4941,HVFHVDSX7_4,/workspace/datasafe/sjd_seq/20240119/FASTQ/,HVFHVDSX7_4_67UDI-idt-UMI,1,_1.fastq.gz,_2.fastq.gz,HVFHVDSX7_4_67UDI-idt-UMI_1.fastq.gz,HVFHVDSX7_4_67UDI-idt-UMI_2.fastq.gz,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
4,patient1,AX4941,HVFKKDSX7_1,/workspace/datasafe/sjd_seq/20240119/FASTQ/,HVFKKDSX7_1_67UDI-idt-UMI,1,_1.fastq.gz,_2.fastq.gz,HVFKKDSX7_1_67UDI-idt-UMI_1.fastq.gz,HVFKKDSX7_1_67UDI-idt-UMI_2.fastq.gz,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...


In [7]:
# Now you can delete columns used to create column PATH_FASTQ_F and PATH_FASTQ_Rv
sarek_df=sarek_df.drop(['PATH','FASTQ_ID_R','F_values','R_values','FASTQ_ID','FASTQ_ID_F','FASTQ_ID_R'],axis=1)

# Rename columns according to SAREK parameters: "patient,sample,lane,fastq_1,fastq_2"
sarek_df=sarek_df.rename(columns={'PATIENT': 'patient', 'SAMPLE_BARCODE':'sample', 'PATH_FASTQ_F':'fastq_1','PATH_FASTQ_R':'fastq_2'})
sarek_df

Unnamed: 0,patient,sample,lane,status,fastq_1,fastq_2
0,patient1,AX4941,HVFHVDSX7_1,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
1,patient1,AX4941,HVFHVDSX7_2,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
2,patient1,AX4941,HVFHVDSX7_3,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
3,patient1,AX4941,HVFHVDSX7_4,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
4,patient1,AX4941,HVFKKDSX7_1,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
5,patient1,AX4941,HVFKKDSX7_2,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
6,patient1,AX4941,HVFKKDSX7_3,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
7,patient1,AX4941,HVFKKDSX7_4,1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...


In [8]:
# Check if there are errors in the path
print('Path:', sarek_df.iloc[0,4])

# Check if there are errors in the status/sample
print(sarek_df.iloc[4])
print('Table dimensions:',sarek_df.shape)

Path: /workspace/datasafe/sjd_seq/20240119/FASTQ/HVFHVDSX7_1_67UDI-idt-UMI_1.fastq.gz
patient                                             patient1
sample                                                AX4941
lane                                             HVFKKDSX7_1
status                                                     1
fastq_1    /workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
fastq_2    /workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
Name: 4, dtype: object
Table dimensions: (8, 6)


In [9]:
#Add column Sex:
Sex = ['XY'] * len(sarek_df)
sarek_df['sex'] = Sex # add it into a new column

# Reorder column and add a variable status:
sarek_df = sarek_df[['patient','sex','status','sample','lane','fastq_1','fastq_2']]

#Add column status
sarek_df['sample']
sarek_df

Unnamed: 0,patient,sex,status,sample,lane,fastq_1,fastq_2
0,patient1,XY,1,AX4941,HVFHVDSX7_1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
1,patient1,XY,1,AX4941,HVFHVDSX7_2,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
2,patient1,XY,1,AX4941,HVFHVDSX7_3,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
3,patient1,XY,1,AX4941,HVFHVDSX7_4,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
4,patient1,XY,1,AX4941,HVFKKDSX7_1,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
5,patient1,XY,1,AX4941,HVFKKDSX7_2,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
6,patient1,XY,1,AX4941,HVFKKDSX7_3,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...
7,patient1,XY,1,AX4941,HVFKKDSX7_4,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...,/workspace/datasafe/sjd_seq/20240119/FASTQ/HVF...


In [10]:
# Check final tab is correct according to input structure
sarek_df.groupby(['patient','sample','lane']).count()
sarek_df.shape
sarek_df[(sarek_df['patient']=='patient1')&(sarek_df['sample']=='AX4941')].count()

patient    8
sex        8
status     8
sample     8
lane       8
fastq_1    8
fastq_2    8
dtype: int64

In [11]:
# Export to CSV
# sarek_df.to_csv('/workspace/nobackup2/scratch/melos/input_lung.csv',index=False)