# Web scraping Wikipedia

Obtain the data from the "Full genetic disorders list" table on Wikipedia and store it in a dataframe

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import lxml
import re

obtain and parse HTML file

In [2]:
# Download the Wikipedia article and parse it into a BeautifulSoup tree.
url = "https://en.wikipedia.org/wiki/List_of_genetic_disorders"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page and
# breaking later when the expected table is not found.
r.raise_for_status()
html = r.content
soup = BeautifulSoup(html, 'lxml')

find and extract "Full genetic disorders list" table

In [3]:
# Collect every <table> on the page and keep the one at index 2 (the third),
# which this notebook treats as the "Full genetic disorders list" table.
page_tables = soup.find_all('table')
table = page_tables[2]

check the header of the table

In [4]:
# Display the child nodes of the table's first row to inspect the column headers.
header_row = table.find_all('tr')[0]
header_row.contents

['\n',
 <th>Disorder
 </th>,
 '\n',
 <th>Chromosome or gene
 </th>,
 '\n',
 <th>Type
 </th>,
 '\n',
 <th>Reference
 </th>,
 '\n',
 <th>Prevalence
 </th>]

prepare lists containing data for disorder name, chromosome or gene location, mutation type, and prevalence from the table

In [5]:
# Disorder name: text of child node 1 of each row, with surrounding newlines
# removed. The trailing [1:] drops the entry that came from the header row.
disorders = [
    row.contents[1].text.strip('\n')
    for row in table.find_all('tr')
][1:]

In [6]:
# Chromosome or gene: child node 3 of each row.
table_rows = table.find_all('tr')
location_cells = [row.contents[3] for row in table_rows]
# Trim newlines from each cell's text, then drop the header row's entry.
locations = [cell.text.strip('\n') for cell in location_cells][1:]

In [7]:
# Mutation type: text of child node 5 of each row.
type_texts = [row.contents[5].text for row in table.find_all('tr')]
# The loop variable is `cell_text` rather than `type`, so the comprehension
# does not shadow the built-in `type`. The trailing [1:] drops the header row's entry.
types = [cell_text.strip('\n') for cell_text in type_texts][1:]

In [8]:
# Prevalence: text of child node 9 of each row (child node 7, the Reference
# column, is not collected).
prevalences = []
for row in table.find_all('tr'):
    prevalences.append(row.contents[9].text.strip('\n'))
# Drop the entry that came from the header row.
prevalences = prevalences[1:]

Because the prevalence data is given as ratios and includes ranges, it can be simplified by parsing each string to obtain the minimum possible prevalence as a float.

In [9]:
# Convert each prevalence string into a float ratio (1 / N).
#
# The string is split on ':', '-', whitespace, '*' and '[', and the SECOND
# token is taken as the denominator N (thousands separators removed).
# Example: '1:50,000-100,000' -> tokens ['1', '50,000', '100,000'] -> 1 / 50000.
#
# NOTE(review): for a range such as the example above, the second token is the
# smaller denominator, which yields the LARGER of the two ratios. Confirm that
# this is what "minimum prevalence" is meant to be.
#
# The pattern is a raw string: the same regex written as a normal string
# literal contains invalid escape sequences (\s, \*, \[) and triggers a
# DeprecationWarning / SyntaxWarning on current Python versions.
separator_pattern = re.compile(r':|-|\s|\*|\n|\[')

min_prevalences = []
for prevalence in prevalences:
    if not prevalence:
        # No prevalence reported: keep an empty-string placeholder so this list
        # stays aligned, element for element, with `prevalences`.
        min_prevalences.append('')
        continue
    tokens = separator_pattern.split(prevalence)
    denominator = float(tokens[1].replace(',', ''))
    min_prevalences.append(1 / denominator)

Merge the lists together in a dataframe

In [10]:
# Assemble one record (tuple) per table row from the parallel lists, then
# build the dataframe with a named column for each position in the record.
column_names = [
    'Disorder',
    "Chromosome or Gene",
    "Mutation Type",
    "Prevalence",
    "Minimum Prevalence",
]
records = list(zip(disorders, locations, types, prevalences, min_prevalences))
df = pd.DataFrame(records, columns=column_names)

Convert empty strings into NaN

In [11]:
# Replace every cell that is an empty or whitespace-only string with NaN;
# the result is a new dataframe and `df` is left unchanged.
blank_cell_pattern = r'^\s*$'
df2 = df.replace(to_replace=blank_cell_pattern, value=np.nan, regex=True)

Remove rows with missing data in any column (`dropna()` without a `subset` checks every column, not only mutation type and prevalence)

In [12]:
# Keep only complete rows: a row is dropped if any of its columns is NaN.
df3 = df2.dropna(axis=0, how='any')

Save the data to csv and Excel files

In [13]:
# Write the cleaned table (`df3`: blank cells converted to NaN and incomplete
# rows removed) rather than the raw `df`, so that the cleaning steps actually
# reach the output files. Row index is omitted; column names are written.
df3.to_csv("genetic_disorders.csv", index=False, header=True)
df3.to_excel("genetic_disorders.xlsx", index=False, header=True)