# MarkerDB

The original data was downloaded from the [MarkerDB downloads page](https://markerdb.ca/downloads) (genetics) on 2/14/2024.

In [13]:
import pandas as pd
import table_cleaning_functions as tcf 

# Load the raw MarkerDB export. The file is tab-separated and is read with no
# header row, so pandas labels the columns with integers.
markerdb_raw = pd.read_csv(
    '../home/data/raw_data/GlyGen/markerdb_raw.tsv',
    sep='\t',
    header=None,
)

In [14]:
# Sanity check on the freshly loaded table: (rows, columns), then the first five rows.
print(markerdb_raw.shape)
markerdb_raw.head(5)

(11622, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,14030,LRP1,rs1385526,Familial Abdominal Aortic Aneurysm,Pathogenic,,,,
1,14031,ERG,rs2836411,Familial Abdominal Aortic Aneurysm,Pathogenic,,,,
2,14032,ERG,rs602633,Familial Abdominal Aortic Aneurysm,Pathogenic,,,,
3,14035,TBXAS1,rs17837497,Acute Lymphoblastic Leukemia,Pathogenic,,,,
4,14036,TBXAS1,rs17079534,Acute Lymphoblastic Leukemia,Pathogenic,,,,


Add column headers; the raw file was read with `header = None`, so its columns are only numbered.

In [15]:
# Names for the nine positional columns, in file order.
cols = [
    'markerdb_id',
    'gene',
    'rs_id',
    'disease',
    'other',
    'misc',
    'misc1',
    'misc2',
    'misc3',
]
markerdb_raw.columns = cols

# Confirm the labels landed on the expected columns.
markerdb_raw.head(5)

Unnamed: 0,markerdb_id,gene,rs_id,disease,other,misc,misc1,misc2,misc3
0,14030,LRP1,rs1385526,Familial Abdominal Aortic Aneurysm,Pathogenic,,,,
1,14031,ERG,rs2836411,Familial Abdominal Aortic Aneurysm,Pathogenic,,,,
2,14032,ERG,rs602633,Familial Abdominal Aortic Aneurysm,Pathogenic,,,,
3,14035,TBXAS1,rs17837497,Acute Lymphoblastic Leukemia,Pathogenic,,,,
4,14036,TBXAS1,rs17079534,Acute Lymphoblastic Leukemia,Pathogenic,,,,


Check for null values.

In [17]:
# Total row count for reference, followed by the number of missing values per column.
print(markerdb_raw.shape)
markerdb_raw.isna().sum()

(11622, 9)


markerdb_id        0
gene               0
rs_id              0
disease            0
other              2
misc           11622
misc1          11622
misc2          11622
misc3          11622
dtype: int64

Drop unnecessary columns: `misc` through `misc3` are entirely null (11,622 of 11,622 rows), and `other` is not needed here.

In [18]:
# Keep only the columns used from here on. Selecting the columns to keep
# (instead of dropping the others) makes this cell safe to re-run: because the
# result is bound back to `markerdb_raw`, a second `drop(columns=[...])` would
# raise KeyError once 'other' and the misc* columns are already gone.
markerdb_raw = markerdb_raw[['markerdb_id', 'gene', 'rs_id', 'disease']]
markerdb_raw.head()

Unnamed: 0,markerdb_id,gene,rs_id,disease
0,14030,LRP1,rs1385526,Familial Abdominal Aortic Aneurysm
1,14031,ERG,rs2836411,Familial Abdominal Aortic Aneurysm
2,14032,ERG,rs602633,Familial Abdominal Aortic Aneurysm
3,14035,TBXAS1,rs17837497,Acute Lymphoblastic Leukemia
4,14036,TBXAS1,rs17079534,Acute Lymphoblastic Leukemia


We are only interested in cancer-related rows for now, so filter on cancer-related conditions.

In [19]:
# Keywords that mark a disease name as cancer related.
cancer_related = [
    'cancer', 'carcinoma', 'leukemia', 'tumor', 'malignancy',
    'glioblastoma', 'melanoma', 'lymphoma', 'sarcoma',
]

# Build one alternation pattern from the keywords and flag every row whose
# disease name contains any of them. Matching is case-insensitive, and a
# missing disease value counts as "no match".
cancer_pattern = '|'.join(cancer_related)
is_cancer_row = markerdb_raw['disease'].str.contains(cancer_pattern, case=False, na=False)

markerdb_raw = markerdb_raw[is_cancer_row]
print(markerdb_raw.shape)
markerdb_raw.head(5)

(9270, 4)


Unnamed: 0,markerdb_id,gene,rs_id,disease
3,14035,TBXAS1,rs17837497,Acute Lymphoblastic Leukemia
4,14036,TBXAS1,rs17079534,Acute Lymphoblastic Leukemia
5,14037,MAGI2,rs1496766,Acute Lymphoblastic Leukemia
6,14038,PDE4B,rs6683977,Acute Lymphoblastic Leukemia
7,14039,PYGL,rs7142143,Acute Lymphoblastic Leukemia


Recheck for null values.

In [20]:
# Missing values per column after the column drop and cancer filter.
markerdb_raw.isna().sum()

markerdb_id    0
gene           0
rs_id          0
disease        0
dtype: int64

Save the processed data, then isolate the unique conditions so they can be cleaned manually.

In [25]:
# Write the filtered table out as a tab-separated file, without the row index.
markerdb_raw.to_csv(
    '../home/data/processed_data/GlyGen/markerdb/markerdb_processed.tsv',
    sep='\t',
    index=False,
)

In [27]:
# One row per distinct disease name (first occurrence kept), saved as its own
# tab-separated file without the row index.
conditions = markerdb_raw['disease'].drop_duplicates()
conditions.to_csv(
    '../home/data/processed_data/GlyGen/markerdb/markerdb_conditions.tsv',
    sep='\t',
    index=False,
)

In [28]:
# Number of rows per disease, most frequent first.
markerdb_raw['disease'].value_counts(ascending=False)

disease
Familial Breast Ovarian Cancer                                   5827
Breast Cancer                                                    1467
Breast Ovarian Cancer                                            1415
Colon Cancer                                                      111
Hereditary Diffuse Gastric Cancer                                  89
Ovarian Cancer                                                     47
PTEN Hamartoma Tumor Syndrome                                      42
Pancreatic Cancer                                                  40
Familial Platelet Disorder With Associated Myeloid Malignancy      38
Prostate Cancer                                                    31
Colorectal Cancer                                                  24
Wilms Tumor 1                                                      20
Somatic Adenocarcinoma of the Lung                                 17
Melanoma                                                           16
Small Cell C