In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests

In [2]:
# Prepare the scraping session: one shared HTTP session whose adapter
# retries failed connection attempts (up to 3, with a 0.5 backoff factor).
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# The same retrying adapter serves both URL schemes.
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [13]:
# Main page to scrape: Wikipedia's list of Nobel laureates in Literature
wiki_link = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature'

In [14]:
# Download the list page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops early on an HTTP error
# instead of letting an error page be parsed further down.
page_content = session.get(wiki_link, timeout=30)
page_content.raise_for_status()

In [15]:
# Show the response object to check the HTTP status of the download
page_content

<Response [200]>

In [16]:
# Parse the downloaded HTML text with the 'html.parser' backend
page_content_parser = BeautifulSoup(page_content.text, 'html.parser')


In [17]:

def find_laureates_tag(tag):
    """Tell whether ``tag`` carries the attribute ``id="Laureates"``.

    Intended as a filter function for a BeautifulSoup ``find`` call.
    Returns ``False`` for tags without an ``id`` attribute.
    """
    if not tag.has_attr('id'):
        return False
    return tag['id'] == 'Laureates'

In [18]:
# Get the tag that holds the title of the "Laureates" section
# (the first tag accepted by find_laureates_tag, i.e. with id="Laureates")
laureates_tag = page_content_parser.find(find_laureates_tag)

In [19]:
# Show the tag that was found for the section title
laureates_tag


<span class="mw-headline" id="Laureates">Laureates</span>

In [20]:
# The section title tag sits inside the header element; step up to it so
# that the following cells can walk the header's siblings.
tag = laureates_tag.parent #move to header

In [21]:
# Walk forward through the header's siblings until the first <table>.
# The `is not None` guard stops at the end of the sibling chain, so a page
# without a table after the header fails with a clear message instead of an
# AttributeError raised on None.
while tag is not None and tag.name != 'table':
    tag = tag.next_sibling
if tag is None:
    raise ValueError("No <table> found after the 'Laureates' section header")

In [22]:
# Confirm that the sibling walk stopped on a <table> element
tag.name

'table'

In [23]:
#Scrape the table
def parse_the_row(cells, writer_info, same_year, mem_year):
    """Append one table row to ``writer_info`` and track year rowspans.

    Parameters
    ----------
    cells : sequence of tag-like objects
        The ``<td>`` cells of one row. Each cell is read through
        ``has_attr``, attribute item access, ``.text`` and ``.a``.
    writer_info : dict of lists
        Column store with the keys ``year``, ``name``, ``link``, ``country``,
        ``language``, ``citation`` and ``genres``. Exactly one value is
        appended to every list; nothing is appended if the row cannot be read.
    same_year : int
        Number of upcoming rows still covered by an earlier year cell's
        ``rowspan``.
    mem_year : str
        Year text remembered from that ``rowspan`` cell.

    Returns
    -------
    tuple
        The updated ``(same_year, mem_year)`` to pass in for the next row.
    """
    year_is_inherited = same_year > 0
    # A row that inherits its year has no year cell of its own, so every
    # other column sits one position further left.
    start_index = 1 if year_is_inherited else 0

    if year_is_inherited:
        year = mem_year
        same_year -= 1
    else:
        year = cells[0].text
        if cells[0].has_attr('rowspan'):
            # This year cell also covers the next (rowspan - 1) rows.
            same_year = int(cells[0]['rowspan']) - 1
            mem_year = year

    if len(cells) == 2:
        # Two-cell row: only a year and one text cell, no laureate details.
        record = {
            'name': cells[1].text,
            'link': np.nan,
            'country': np.nan,
            'language': np.nan,
            'citation': np.nan,
            'genres': np.nan,
        }
    else:
        # Index 1 of a full row is skipped (presumably the portrait column).
        name_cell = cells[2 - start_index]
        anchor = name_cell.a
        # A name without a hyperlink yields a missing link instead of an error.
        has_link = anchor is not None and anchor.has_attr('href')
        record = {
            'name': name_cell.text,
            'link': anchor['href'] if has_link else np.nan,
            'country': cells[3 - start_index].text,
            'language': cells[4 - start_index].text,
            'citation': cells[5 - start_index].text,
            'genres': cells[6 - start_index].text,
        }
    record['year'] = year

    # Append only after every value was read successfully, so an error above
    # can never leave the column lists with different lengths.
    for column, value in record.items():
        writer_info[column].append(value)
    return same_year, mem_year

In [24]:

# Rowspan state passed to parse_the_row: no year is pending before the first row.
same_year, mem_year = 0, ""

In [25]:
# Column store filled row by row while parsing the table: one empty list per
# output column. The key order becomes the column order of the DataFrame.
writer_info = {
    column: []
    for column in ('year', 'name', 'country', 'language', 'citation', 'genres', 'link')
}

In [26]:
# Direct children of the table body (the rows, plus any text nodes between them).
# NOTE(review): `.children` is presumably a one-shot iterator, so this cell
# would have to be re-run before iterating over the rows again — confirm.
nobel_table = tag.tbody.children

In [27]:
# Feed every data row of the table to parse_the_row, carrying the year
# rowspan state from one row to the next.
for row in nobel_table:
    # Children without a tag name are skipped.
    if not row.name:
        continue

    cells = row.find_all('td')
    # Rows without any <td> cell contribute nothing.
    if cells:
        same_year, mem_year = parse_the_row(cells, writer_info, same_year, mem_year)

In [28]:
# Build the DataFrame from the collected columns, keeping the dict's key order.
writers_table = pd.DataFrame(writer_info, columns=list(writer_info))

In [29]:
# Display the scraped table
writers_table

Unnamed: 0,year,name,country,language,citation,genres,link
0,1901\n,Sully Prudhomme\n,France\n,French\n,"""in special recognition of his poetic composit...","poetry, essay\n",/wiki/Sully_Prudhomme
1,1902\n,Theodor Mommsen\n,Germany\n,German\n,"""the greatest living master of the art of hist...","history, law\n",/wiki/Theodor_Mommsen
2,1903\n,Bjørnstjerne Bjørnson\n,Norway\n,Norwegian\n,"""as a tribute to his noble, magnificent and ve...","poetry, novel, drama\n",/wiki/Bj%C3%B8rnstjerne_Bj%C3%B8rnson
3,1904\n,Frédéric Mistral\n,France\n,Provençal\n,"""in recognition of the fresh originality and t...","poetry, philology\n",/wiki/Fr%C3%A9d%C3%A9ric_Mistral
4,1904\n,José Echegaray\n,Spain\n,Spanish\n,"""in recognition of the numerous and brilliant ...",drama\n,/wiki/Jos%C3%A9_Echegaray
...,...,...,...,...,...,...,...
118,2015\n,Svetlana Alexievich\n,Belarus (Born in the Soviet Union)\n,Russian\n,"""for her polyphonic writings, a monument to su...","history, essay\n",/wiki/Svetlana_Alexievich
119,2016\n,Bob Dylan\n,United States\n,English\n,"""for having created new poetic expressions wit...","poetry, songwriting\n",/wiki/Bob_Dylan
120,2017\n,Kazuo Ishiguro\n,United Kingdom (born in Japan)\n,English\n,"""who, in novels of great emotional force, has ...",novel\n,/wiki/Kazuo_Ishiguro
121,2018 (awarded 2019)\n,Olga Tokarczuk\n,Poland\n,Polish\n,“for a narrative imagination that with encyclo...,"novel, short story, poetry, essay\n",/wiki/Olga_Tokarczuk


In [30]:
# Placeholder columns for the details scraped from each laureate's own page.
# They are created with object dtype because they will hold text: writing
# strings into float64 NaN columns is deprecated in recent pandas versions.
for detail_column in ('born', 'died', 'occupation'):
    writers_table[detail_column] = pd.Series(np.nan, index=writers_table.index, dtype=object)

In [31]:
def parse_additional_page(link, year):
    """Fetch a laureate's page and read Born/Died/Occupation from its infobox.

    Parameters
    ----------
    link : str
        URL of the page; it is downloaded with the module-level ``session``.
    year : any
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(born, died, occupation)``. A field that was found is the text
        fragments of its value cell joined with ``';'``. A field that was
        not found is ``np.nan``, and all three are ``np.nan`` when the page
        has no matching infobox table.
    """
    add_content = session.get(link)
    add_content_parser = BeautifulSoup(add_content.text, 'html.parser')

    # Two class strings are tried for the infobox table, in this order.
    add_table = add_content_parser.find('table', attrs={'class' : 'infobox vcard'})
    if add_table is None:
        add_table = add_content_parser.find('table', attrs={'class' : 'infobox biography vcard'})
    if add_table is None:
        return np.nan, np.nan, np.nan

    fields = {'Born': np.nan, 'Died': np.nan, 'Occupation': np.nan}
    # html.parser does not add a <tbody> that is missing from the markup,
    # so fall back to the table's own children in that case.
    rows_parent = add_table.tbody if add_table.tbody is not None else add_table
    for row in rows_parent.children:
        # Text nodes between rows have neither .th nor .td; a header row
        # without a value cell has nothing to read. Both are skipped.
        header = getattr(row, 'th', None)
        value = getattr(row, 'td', None)
        if header is None or value is None:
            continue
        # Surrounding whitespace in the header must not hide a field.
        label = header.text.strip()
        if label in fields:
            fields[label] = ';'.join(value.strings)
    return fields['Born'], fields['Died'], fields['Occupation']

In [32]:
# Visit every laureate that has a page link and store the scraped details in
# the last three columns of that laureate's row.
for i, sublink in enumerate(writers_table['link']):
    # Rows without a link have no page to visit.
    if pd.isna(sublink):
        continue
    link = f'https://en.wikipedia.org{sublink}'
    writers_table.iloc[i, [-3, -2, -1]] = parse_additional_page(link, writers_table['year'].iloc[i])

In [33]:
# Preview the first rows with the newly filled detail columns
writers_table.head()

Unnamed: 0,year,name,country,language,citation,genres,link,born,died,occupation
0,1901\n,Sully Prudhomme\n,France\n,French\n,"""in special recognition of his poetic composit...","poetry, essay\n",/wiki/Sully_Prudhomme,René François Armand Prudhomme;(;1839-03-16;);...,6 September 1907;(1907-09-06); (aged 68);Châte...,Poet and essayist
1,1902\n,Theodor Mommsen\n,Germany\n,German\n,"""the greatest living master of the art of hist...","history, law\n",/wiki/Theodor_Mommsen,Christian Matthias Theodor Mommsen;(;1817-11-3...,1 November 1903;(1903-11-01); (aged 85);Charlo...,
2,1903\n,Bjørnstjerne Bjørnson\n,Norway\n,Norwegian\n,"""as a tribute to his noble, magnificent and ve...","poetry, novel, drama\n",/wiki/Bj%C3%B8rnstjerne_Bj%C3%B8rnson,Bjørnstjerne Martinius Bjørnson;(;1832-12-08;)...,"26 April 1910;(1910-04-26); (aged 77);Paris;, ...","Poet, novelist, playwright, lyricist"
3,1904\n,Frédéric Mistral\n,France\n,Provençal\n,"""in recognition of the fresh originality and t...","poetry, philology\n",/wiki/Fr%C3%A9d%C3%A9ric_Mistral,"(;1830-09-08;);8 September 1830;Maillane;, ;Fr...",25 March 1914;(1914-03-25); (aged 83);Maillane...,Poet
4,1904\n,José Echegaray\n,Spain\n,Spanish\n,"""in recognition of the numerous and brilliant ...",drama\n,/wiki/Jos%C3%A9_Echegaray,José Echegaray y Eizaguirre;(;1832-04-19;);19 ...,14 September 1916;(1916-09-14); (aged 84);Madr...,"Dramatist, civil engineer and mathematician"


In [36]:
# Remove the newline characters left in the scraped text. With regex=True
# the pattern is matched inside the cell values rather than against whole values.
writers_table = writers_table.replace('\n','', regex=True)

In [37]:
# Preview the first rows after the newline cleanup
writers_table.head()

Unnamed: 0,year,name,country,language,citation,genres,link,born,died,occupation
0,1901,Sully Prudhomme,France,French,"""in special recognition of his poetic composit...","poetry, essay",/wiki/Sully_Prudhomme,René François Armand Prudhomme;(;1839-03-16;);...,6 September 1907;(1907-09-06); (aged 68);Châte...,Poet and essayist
1,1902,Theodor Mommsen,Germany,German,"""the greatest living master of the art of hist...","history, law",/wiki/Theodor_Mommsen,Christian Matthias Theodor Mommsen;(;1817-11-3...,1 November 1903;(1903-11-01); (aged 85);Charlo...,
2,1903,Bjørnstjerne Bjørnson,Norway,Norwegian,"""as a tribute to his noble, magnificent and ve...","poetry, novel, drama",/wiki/Bj%C3%B8rnstjerne_Bj%C3%B8rnson,Bjørnstjerne Martinius Bjørnson;(;1832-12-08;)...,"26 April 1910;(1910-04-26); (aged 77);Paris;, ...","Poet, novelist, playwright, lyricist"
3,1904,Frédéric Mistral,France,Provençal,"""in recognition of the fresh originality and t...","poetry, philology",/wiki/Fr%C3%A9d%C3%A9ric_Mistral,"(;1830-09-08;);8 September 1830;Maillane;, ;Fr...",25 March 1914;(1914-03-25); (aged 83);Maillane...,Poet
4,1904,José Echegaray,Spain,Spanish,"""in recognition of the numerous and brilliant ...",drama,/wiki/Jos%C3%A9_Echegaray,José Echegaray y Eizaguirre;(;1832-04-19;);19 ...,14 September 1916;(1916-09-14); (aged 84);Madr...,"Dramatist, civil engineer and mathematician"
