# CHAMP Mutation list

This notebook reads the CHAMP Mutation List Excel file from the CDC and cleans it so that it is easier to manipulate and contains fewer errors.

In [1527]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

In [1528]:
# Load the worksheet holding the list of hemophilia A mutations from the workbook
file = "champ-mutation-list-q4.xlsx"
sheet = "CHAMP Mutation List"
df = pd.read_excel(file, sheet_name=sheet)

In [1529]:
# Clean the column headers to increase readability.
# str.split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and drops leading/trailing whitespace, so joining the pieces with a
# single space fully normalizes each header. Chained .replace() calls only make
# one pass and would leave a double space behind wherever three or more
# whitespace characters occurred in a row.
# str(c) keeps the cell from failing if a header is not a string.
df.columns = [" ".join(str(c).split()) for c in df.columns]

In [1530]:
# Convert every non-missing value to a stripped string to facilitate manipulation.
# Missing values are detected with pd.isna() rather than `x is not np.nan`:
# the identity test only recognizes the np.nan singleton, so NaN values coming
# from float columns (which are fresh float objects), None and NaT would be
# turned into the strings 'nan' / 'None' / 'NaT' and no longer be seen by dropna().
df = df.applymap(lambda x: x if pd.isna(x) else str(x).strip())

In [1531]:
# function to apply a regex search and return NaN if nothing is found
def re_search(regex, row):
    """Search ``row`` with ``regex`` and return the match minus its first two characters.

    Parameters
    ----------
    regex : str or compiled pattern
        Pattern handed to ``re.search``.
    row : object
        Cell value to search. Anything that is not a ``str`` (for example the
        NaN of an empty cell) is treated as "no match" instead of raising
        ``TypeError`` inside ``re.search``.

    Returns
    -------
    str or float
        The matched text with its first two characters removed (the caller in
        this notebook uses that to drop the leading ``p.`` of a protein
        change), or ``np.nan`` when ``row`` is not a string or the pattern
        does not match.
    """
    if not isinstance(row, str):
        return np.nan
    found = re.search(regex, row)
    return np.nan if found is None else found.group()[2:]

In [1532]:
# Mark invalid values of the HGVS Protein column as NaN.
# The pattern is a raw string so that \d and \* reach the regex engine without
# Python's invalid-escape warning, and the dot of the "p." prefix is escaped so
# that it only matches a literal dot instead of any character.
# Accepted shape: "p." + three-letter residue + optional digits + ("*" or a
# three-letter residue); re_search() then strips the leading "p.".
regex = r"^p\.[A-Z][a-z]{2}\d*(\*|[A-Z][a-z]{2})$"
df["HGVS Protein"] = df["HGVS Protein"].apply(lambda s: re_search(regex, str(s)))
# Mark values of the Reported Severity column that contain '/' as NaN.
# str(s) mirrors the other two columns so a missing (NaN) cell does not raise
# TypeError on the `in` test; the original value is kept when no '/' is present.
df["Reported Severity"] = df["Reported Severity"].apply(lambda s: np.nan if '/' in str(s) else s)
# Mark values of the Codon column that contain '-' as NaN
df["Codon"] = df["Codon"].apply(lambda s: np.nan if '-' in str(s) else s)

In [1533]:
# Columns that are allowed to stay empty; they are left out of the NaN check
cols_remove = ['Severe (<1 U/dL)', 'Moderate (1-5 U/dL)', 'Mild (>5 U/dL)', 'Unclassified (no FVIII level)', 'Comments']
# Start from every column and take the optional ones out, leaving the required set
cols = df.columns.tolist()
for optional_col in cols_remove:
    cols.remove(optional_col)
# Discard every row that has a NaN in any of the required columns
df.dropna(subset=cols, inplace=True)

In [1535]:
# Columns whose letter case is kept as-is; every other column is title-cased
cols_remove = ['HGVS Protein', 'Mature Protein', 'Domain']
cols = df.columns.tolist()
for keep_case_col in cols_remove:
    cols.remove(keep_case_col)
# Apply str.title() to the string values of the remaining columns; non-string
# values (e.g. NaN) pass through unchanged.
# NOTE(review): 'HGVS cDNA' and 'hg19 Coordinates' are not in the exclusion
# list, so their letter case is rewritten as well — confirm this is intended.
df[cols] = df[cols].applymap(lambda x: x if not isinstance(x, str) else x.title())

In [1536]:
# Codon values are strings after the trimming step, so cast them to integers
# first; the rows are then ordered numerically by Codon
codon_numbers = df['Codon'].astype('int')
df['Codon'] = codon_numbers
df.sort_values(by='Codon', inplace=True)

In [1537]:
# Renumber the rows from 0 now that rows were dropped and reordered;
# drop=True discards the old index instead of keeping it as a column
df.reset_index(drop=True, inplace=True)

In [1538]:
# Summarize the cleaned dataframe: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2063 entries, 0 to 2062
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   HGVS cDNA                      2063 non-null   object
 1   hg19 Coordinates               2063 non-null   object
 2   HGVS Protein                   2063 non-null   object
 3   Mature Protein                 2063 non-null   object
 4   Mutation Type                  2063 non-null   object
 5   Mechanism                      2063 non-null   object
 6   Exon                           2063 non-null   object
 7   Codon                          2063 non-null   int64 
 8   Domain                         2063 non-null   object
 9   Subtype                        2063 non-null   object
 10  In Poly A                      2063 non-null   object
 11  Severe (<1 U/dL)               803 non-null    object
 12  Moderate (1-5 U/dL)            329 non-null    object
 13  Mil

In [1539]:
# Write the cleaned dataframe to an Excel file named "<input stem>-clean.xlsx",
# placed next to the input workbook.
# The stem comes from pathlib instead of slicing off a fixed five characters,
# so the output name stays correct if the input extension is not ".xlsx"
# (for example ".xls").
source_path = Path(file)
df.to_excel(source_path.with_name(f"{source_path.stem}-clean.xlsx"))