# MHC-I Binding Dataset Preparation

In [77]:
import pandas as pd
import dask.dataframe as dd 

## Read a single `csv` file from a single folder.

In [67]:
def readData(data, output="dataset.csv",  accession_code=None,protein_length=None, antigen_presenting_protein=None): 
    """
    Convert one tab-separated, header-less prediction file into a trimmed CSV.

    The input is parsed with ten fixed column names; only `start`, `end`,
    `length`, `peptide` and `score` are kept, and three constant metadata
    columns are appended before the result is written to `output`.

    Params:
        data: Path (or file-like object) of the tab-separated input file.
        output: Destination path of the resulting .csv file.
        accession_code: Value stored in every row of the `accession_code` column.
        protein_length: Value stored in every row of the `protein_length` column.
        antigen_presenting_protein: Value stored in every row of the
            `antigen_presenting_protein` column (e.g. 'MHC-I' or 'MHC-II').

    Returns:
        None. `DataFrame.to_csv` returns None when it is given a path.
    """
    raw_columns = ['allele', 'seq_num', 'start', 'end', 'length', 'peptide', 'core','icore', 'score', 'rank']
    kept_columns = ['start', 'end', 'length', 'peptide', 'score']

    predictions = pd.read_csv(data, sep='\t', names=raw_columns)
    n_rows = len(predictions)

    # Keep the wanted columns first, then append the per-file metadata so the
    # final column order is: kept columns followed by the three metadata ones.
    cleaned = predictions[kept_columns].assign(
        accession_code=[accession_code] * n_rows,
        protein_length=[protein_length] * n_rows,
        antigen_presenting_protein=[antigen_presenting_protein] * n_rows,
    )
    return cleaned.to_csv(output, index=False)

In [68]:
# Convert every raw MHC-I prediction file into a cleaned CSV.
# All nine sequences share the same protein length (495) and presenting
# protein ('MHC-I'); only the accession code differs, so it drives both the
# input and the output file name.
MHC_I_ACCESSION_CODES = [
    'MN959467',  # 1
    'MN959468',  # 2
    'MN959469',  # 3
    'MT862865',  # 4
    'MT862866',  # 5
    'MT862867',  # 6
    'MT862868',  # 7
    'MT862869',  # 8
    'MT862870',  # 9
]

for code in MHC_I_ACCESSION_CODES:
    readData(f'../mhc-i/{code}.txt', accession_code=code, protein_length=495,
             antigen_presenting_protein='MHC-I',
             output=f'../cleaned_data/mhc-i/{code}.csv')

## Read multiple `csv` files from multiple folders.

In [85]:
def readData(data, output="dataset.csv",  accession_code=None,protein_length=None, antigen_presenting_protein=None): 
    """
    Load CSV data with dask, tag it with metadata and save it as one CSV.

    `data` is handed directly to `dask.dataframe.read_csv` with default
    settings, so the input is expected to be comma-separated with a header
    row. The header must contain the columns `allele`, `seq_num`, `core`,
    `icore` and `rank`, because they are dropped here and a missing one
    raises `KeyError`.

    Params:
        data: Path or glob pattern (whatever `dd.read_csv` accepts) of the
            input file(s).
        output: Destination path of the resulting .csv file.
        accession_code: Value stored in every row of the `accession_code` column.
        protein_length: Value stored in every row of the `protein_length` column.
        antigen_presenting_protein: Value stored in every row of the
            `antigen_presenting_protein` column (e.g. 'MHC-I' or 'MHC-II').

    Returns:
        None. `DataFrame.to_csv` returns None when it is given a path.
    """
    # dask reads lazily; compute() materializes everything as a pandas DataFrame.
    df = dd.read_csv(data).compute()

    # A scalar assignment is broadcast to every row.
    df['accession_code'] = accession_code
    df['protein_length'] = protein_length
    df['antigen_presenting_protein'] = antigen_presenting_protein

    df = df.drop(columns=['allele', 'seq_num', 'core', 'icore', 'rank'])
    return df.to_csv(output, index=False)