In [3]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

# Need to inspect source of webpage to find the thing we want

In [5]:
wiki_url = "https://en.wikipedia.org/wiki/Genome"
wiki_data = urlopen(wiki_url)
wiki_html = wiki_data.read()
wiki_data.close()

page_soup = soup(wiki_html, "html.parser")

In [6]:
print(page_soup)

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-night-mode-clientpref-0 vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Genome - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-featu

In [7]:
page_soup.h1

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Genome</span></h1>

In [9]:
genome_table = page_soup.findAll('table', {'class':'wikitable sortable'}) # we get the class name from the page inspection
# Results outputted in an array

In [10]:
len(genome_table)       # shows there's only 1 table present in the array

1

In [11]:
genome_table = genome_table[0]
headers = genome_table.findAll('th', {})
headers

[<th>Organism type
 </th>,
 <th>Organism
 </th>,
 <th colspan="2">Genome size <br/>(<a href="/wiki/Base_pair" title="Base pair">base pairs</a>)
 </th>,
 <th>Approx. no. of genes
 </th>,
 <th class="unsortable">Note
 </th>]

In [12]:
# Want to create an list of header titles to begin tidying up
header_titles = []
for header in headers:
    header_titles.append(header.text)

header_titles

['Organism type\n',
 'Organism\n',
 'Genome size (base pairs)\n',
 'Approx. no. of genes\n',
 'Note\n']

In [13]:
# Want to cut off the last two characters of each line to remove the \n

header_titles = []
for header in headers:
    header_titles.append(header.text[:-1])

header_titles

['Organism type',
 'Organism',
 'Genome size (base pairs)',
 'Approx. no. of genes',
 'Note']

In [14]:
# Want to get the table rows
# Need to omit first row as it will contain the headers which we already have

all_rows = genome_table.findAll('tr', {})
all_rows

[<tr>
 <th>Organism type
 </th>
 <th>Organism
 </th>
 <th colspan="2">Genome size <br/>(<a href="/wiki/Base_pair" title="Base pair">base pairs</a>)
 </th>
 <th>Approx. no. of genes
 </th>
 <th class="unsortable">Note
 </th></tr>,
 <tr>
 <td><a href="/wiki/Virus" title="Virus">Virus</a>
 </td>
 <td><a href="/wiki/Porcine_circovirus" title="Porcine circovirus">Porcine circovirus</a> type 1
 </td>
 <td align="right">1,759
 </td>
 <td>1.8 kB
 </td>
 <td>
 </td>
 <td>Smallest viruses replicating autonomously in <a class="mw-redirect" href="/wiki/Eukaryotic" title="Eukaryotic">eukaryotic</a> cells<sup class="reference" id="cite_ref-Equinexus_57-0"><a href="#cite_note-Equinexus-57">[57]</a></sup>
 </td></tr>,
 <tr>
 <td><a href="/wiki/Virus" title="Virus">Virus</a>
 </td>
 <td><a href="/wiki/Bacteriophage_MS2" title="Bacteriophage MS2">Bacteriophage MS2</a>
 </td>
 <td align="right">3,569
 </td>
 <td>3.6 kB
 </td>
 <td>
 </td>
 <td>First sequenced RNA-genome<sup class="reference" id="cite_ref

In [23]:
data = all_rows[1:]
data

[<tr>
 <td><a href="/wiki/Virus" title="Virus">Virus</a>
 </td>
 <td><a href="/wiki/Porcine_circovirus" title="Porcine circovirus">Porcine circovirus</a> type 1
 </td>
 <td align="right">1,759
 </td>
 <td>1.8 kB
 </td>
 <td>
 </td>
 <td>Smallest viruses replicating autonomously in <a class="mw-redirect" href="/wiki/Eukaryotic" title="Eukaryotic">eukaryotic</a> cells<sup class="reference" id="cite_ref-Equinexus_57-0"><a href="#cite_note-Equinexus-57">[57]</a></sup>
 </td></tr>,
 <tr>
 <td><a href="/wiki/Virus" title="Virus">Virus</a>
 </td>
 <td><a href="/wiki/Bacteriophage_MS2" title="Bacteriophage MS2">Bacteriophage MS2</a>
 </td>
 <td align="right">3,569
 </td>
 <td>3.6 kB
 </td>
 <td>
 </td>
 <td>First sequenced RNA-genome<sup class="reference" id="cite_ref-Fiers1976_58-0"><a href="#cite_note-Fiers1976-58">[58]</a></sup>
 </td></tr>,
 <tr>
 <td><a href="/wiki/Virus" title="Virus">Virus</a>
 </td>
 <td><a href="/wiki/SV40" title="SV40">SV40</a>
 </td>
 <td align="right">5,224
 </td>


In [17]:
# Want to identify first row first
# Can then apply a for loop to repeat for the rest of the rows

first_row = data[0]
first_row_data = first_row.findAll('td', {})
first_row_data

[<td><a href="/wiki/Virus" title="Virus">Virus</a>
 </td>,
 <td><a href="/wiki/Porcine_circovirus" title="Porcine circovirus">Porcine circovirus</a> type 1
 </td>,
 <td align="right">1,759
 </td>,
 <td>1.8 kB
 </td>,
 <td>
 </td>,
 <td>Smallest viruses replicating autonomously in <a class="mw-redirect" href="/wiki/Eukaryotic" title="Eukaryotic">eukaryotic</a> cells<sup class="reference" id="cite_ref-Equinexus_57-0"><a href="#cite_note-Equinexus-57">[57]</a></sup>
 </td>]

In [18]:
data_text = []

for data in first_row_data:
    data_text.append(data.text[:-1])        # Keep the index to remove new line syntax

data_text

['Virus',
 'Porcine circovirus type 1',
 '1,759',
 '1.8\xa0kB',
 '',
 'Smallest viruses replicating autonomously in eukaryotic cells[57]']

In [26]:
table_rows = []

for row in data:
    table_row = []
    row_data = row.findAll('td', {})
    for i in row_data:
        table_row.append(i.text[:-1])
    table_rows.append(table_row)

print(len(table_rows))
table_rows

44


[['Virus',
  'Porcine circovirus type 1',
  '1,759',
  '1.8\xa0kB',
  '',
  'Smallest viruses replicating autonomously in eukaryotic cells[57]'],
 ['Virus',
  'Bacteriophage MS2',
  '3,569',
  '3.6\xa0kB',
  '',
  'First sequenced RNA-genome[58]'],
 ['Virus', 'SV40', '5,224', '5.2\xa0kB', '', '[59]'],
 ['Virus',
  'Phage Φ-X174',
  '5,386',
  '5.4\xa0kB',
  '',
  'First sequenced DNA-genome[60]'],
 ['Virus', 'HIV', '9,749', '9.7\xa0kB', '', '[61]'],
 ['Virus',
  'Phage λ',
  '48,502',
  '48.5\xa0kB',
  '',
  'Often used as a vector for the cloning of recombinant DNA\n[62][63][64]\n'],
 ['Virus',
  'Megavirus',
  '1,259,197',
  '1.3\xa0MB',
  '',
  'Until 2013 the largest known viral genome[65]'],
 ['Virus',
  'Pandoravirus salinus',
  '2,470,000',
  '2.47\xa0MB',
  '',
  'Largest known viral genome.[66]'],
 ['Eukaryotic organelle',
  'Human mitochondrion',
  '16,569',
  '16.6\xa0kB',
  '',
  '[67]'],
 ['Bacterium',
  'Nasuia deltocephalinicola (strain NAS-ALF)',
  '112,091',
  '112\xa0

In [38]:
# Now going to focus on writing to a csv

filename = 'genome_table.csv'
f = open(filename, 'w', encoding='utf-8')

# Need to convert each array into one long string, separated by commas

header_string = ''
for title in header_titles:
    header_string += title + ','
header_string = header_string[:-1]
header_string += '\n'

f.write(header_string)

for row in table_rows:
    row_string = ''
    for column in row:
        column_string = column.replace(',', '') # Need to clean up the data - e.g. some rows contain commas
        row_string += column_string + ','
    row_string = row_string[:-1]
    row_string += '\n'
    f.write(row_string)

f.close()

In [41]:
# Now reading in from a local file

filename = 'Genome - Wikipedia.html'
f = open(filename, encoding="utf8")

# not using f.read, as this would return a string

new_soup = soup(f, 'html.parser')
print(new_soup.h1)

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Genome</span></h1>


In [42]:
print(new_soup.findAll('table', {'class':'wikitable sortable'}))

[<table class="wikitable sortable">
<tbody><tr>
<th>Organism type
</th>
<th>Organism
</th>
<th colspan="2">Genome size <br/>(<a href="/wiki/Base_pair" title="Base pair">base pairs</a>)
</th>
<th>Approx. no. of genes
</th>
<th class="unsortable">Note
</th></tr>
<tr>
<td><a href="/wiki/Virus" title="Virus">Virus</a>
</td>
<td><a href="/wiki/Porcine_circovirus" title="Porcine circovirus">Porcine circovirus</a> type 1
</td>
<td align="right">1,759
</td>
<td>1.8 kB
</td>
<td>
</td>
<td>Smallest viruses replicating autonomously in <a class="mw-redirect" href="/wiki/Eukaryotic" title="Eukaryotic">eukaryotic</a> cells<sup class="reference" id="cite_ref-Equinexus_57-0"><a href="#cite_note-Equinexus-57">[57]</a></sup>
</td></tr>
<tr>
<td><a href="/wiki/Virus" title="Virus">Virus</a>
</td>
<td><a href="/wiki/Bacteriophage_MS2" title="Bacteriophage MS2">Bacteriophage MS2</a>
</td>
<td align="right">3,569
</td>
<td>3.6 kB
</td>
<td>
</td>
<td>First sequenced RNA-genome<sup class="reference" id="cit

In [None]:
# Can then just repeat the previous steps to parse and handle the data

In [45]:
# Now want to look at reading from non-table data
# Going to pull the references from the Genome Wikipedia page

# print(page_soup) # still working and contains data from the whole page

references_list_raw = page_soup.findAll('ol', {'class':'references'})
references_list_raw

[<ol class="references">
 <li id="cite_note-Roth_p.-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-Roth_p._1-0">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1133582631">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free a,.mw-parser-output .citation .cs1-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited a,.mw-parser-output .id-lock-registration a,.mw-parser-output .citation .cs1-lock-limited a,.mw-parser-output .citation .cs1-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription a,.mw-parser-output .citation .cs1-lock-subscription 

In [48]:
references_list = references_list_raw[0].findAll('li', {})        # li means list elements

all_references = []

for item in references_list:
    references = []
    for reference in item.findAll('a', {}):
        references.append(reference['href'])            
    all_references.append(references)

all_references

[['#cite_ref-Roth_p._1-0',
  'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6579593',
  '/wiki/Doi_(identifier)',
  'https://doi.org/10.5195%2Fjmla.2019.604',
  '/wiki/ISSN_(identifier)',
  'https://www.worldcat.org/issn/1558-9439',
  '/wiki/PMC_(identifier)',
  'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6579593',
  '/wiki/PMID_(identifier)',
  'https://pubmed.ncbi.nlm.nih.gov/31258451'],
 ['#cite_ref-Graur_2-0',
  '#cite_ref-Graur_2-1',
  '#cite_ref-Graur_2-2',
  'https://books.google.com/books?id=blOZjgEACAAJ',
  '/wiki/ISBN_(identifier)',
  '/wiki/Special:BookSources/9781605354699',
  '/wiki/OCLC_(identifier)',
  'https://www.worldcat.org/oclc/951474209'],
 ['#cite_ref-3',
  'https://nyaspubs.onlinelibrary.wiley.com/doi/10.1111/j.1749-6632.2009.05004.x',
  '/wiki/Bibcode_(identifier)',
  'https://ui.adsabs.harvard.edu/abs/2009NYASA1178..186B',
  '/wiki/Doi_(identifier)',
  'https://doi.org/10.1111%2Fj.1749-6632.2009.05004.x',
  '/wiki/PMID_(identifier)',
  'https://pubmed.ncbi.nlm