In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:

wiki_base_url = 'https://en.wikipedia.org'
wiki_page_url = '/wiki/List_of_Nobel_laureates_in_Literature'

# accessing the website
page = requests.get(f'{wiki_base_url}{wiki_page_url}')
soup = BeautifulSoup(page.content, 'html.parser')

# getting all tables from the website 
tables = soup.find_all("table")
print(f'Numebr of tables on the page:{len(tables)}')


Numebr of tables on the page:10


In [3]:
# We want to scrap the info from the first table
# we seclect it
table = tables[0]

In [4]:
# we get a list of the columns 
columns_names = [x.text for x in table.find_all("th")]
print(columns_names)

['Year\n', 'Picture\n', 'Laureate\n', 'Country\n', 'Language(s)\n', 'Citation\n', 'Genre(s)\n']


In [5]:
# removing new lines from the string using .rstrip()  - method returns a right trim version of the string
columns_names = [x.rstrip() for x in columns_names]
print(columns_names)

['Year', 'Picture', 'Laureate', 'Country', 'Language(s)', 'Citation', 'Genre(s)']


**Getting rows of the data**

In [6]:
raw_rows = table.find("tbody").find_all("tr")
print(' We remove the first item from the list containing the column names that we already have',raw_rows[0].find_all("th"))

 We remove the first item from the list containing the column names that we already have [<th>Year
</th>, <th>Picture
</th>, <th>Laureate
</th>, <th>Country
</th>, <th>Language(s)
</th>, <th>Citation
</th>, <th>Genre(s)
</th>]


In [7]:
# removing the first item from the list
raw_rows = raw_rows[1:]

In [8]:
# in some years we do not have any laureates; we need to identify those rows and ignore them
# or in some years we have 2, not one laureate

data = []
for index_row, item in enumerate(raw_rows):
    row=[]
    line = item.find_all('td')
    if len(line) == 2: continue # when year without laureates do not add a line to data
    for index, td in enumerate(line):
        row.append((td.text).rstrip())
        if index == 2:
            link = td.a['href']   # getting a link to the wiki page from the 'Laureate' column
    if len(line) == 6:   # handling year information for years with 2 laureates
        row.insert(0, year)
        
    row.append(link) # adding link into a row
    
    year = row[0] # saving info about a year; this is used when we have years with 2 laureates
    data.append(row)

In [9]:
# prepering  datastructure for Pandas Dataframe
columns_names.append('Wiki_LINK')
data_for_df = {i:[] for i in columns_names}
print(data_for_df)

{'Year': [], 'Picture': [], 'Laureate': [], 'Country': [], 'Language(s)': [], 'Citation': [], 'Genre(s)': [], 'Wiki_LINK': []}


In [10]:
# moving data from the 2D list to a python dictionary

for row in data:
    for index in range(len(row)):
        data_for_df[columns_names[index]].append(row[index])

In [11]:
# creating dataframe from the dictionary
df = pd.DataFrame(data_for_df)
df.head(2)

Unnamed: 0,Year,Picture,Laureate,Country,Language(s),Citation,Genre(s),Wiki_LINK
0,1901,,Sully Prudhomme (1839–1907),France,French,"""in special recognition of his poetic composit...","poetry, essay",/wiki/Sully_Prudhomme
1,1902,,Theodor Mommsen (1817–1903),Germany,German,"""the greatest living master of the art of hist...","history, law",/wiki/Theodor_Mommsen


In [12]:
# dumping the 'Picture' field
df = df.drop('Picture',axis=1)

In [14]:
df.head(30)

Unnamed: 0,Year,Laureate,Country,Language(s),Citation,Genre(s),Wiki_LINK
0,1901,Sully Prudhomme (1839–1907),France,French,"""in special recognition of his poetic composit...","poetry, essay",/wiki/Sully_Prudhomme
1,1902,Theodor Mommsen (1817–1903),Germany,German,"""the greatest living master of the art of hist...","history, law",/wiki/Theodor_Mommsen
2,1903,Bjørnstjerne Bjørnson (1832–1910),Norway,Norwegian,"""as a tribute to his noble, magnificent and ve...","poetry, novel, drama",/wiki/Bj%C3%B8rnstjerne_Bj%C3%B8rnson
3,1904,Frédéric Mistral (1830–1914),France,Provençal,"""in recognition of the fresh originality and t...","poetry, philology",/wiki/Fr%C3%A9d%C3%A9ric_Mistral
4,1904,José Echegaray (1832–1916),Spain,Spanish,"""in recognition of the numerous and brilliant ...",drama,/wiki/Restoration_(Spain)
5,1905,Henryk Sienkiewicz (1846–1916),Poland( Russian Empire),Polish,"""because of his outstanding merits as an epic ...",novel,/wiki/Henryk_Sienkiewicz
6,1906,Giosuè Carducci (1835–1907),Italy,Italian,"""not only in consideration of his deep learnin...",poetry,/wiki/Giosu%C3%A8_Carducci
7,1907,Rudyard Kipling (1865–1936),United Kingdom,English,"""in consideration of the power of observation,...","novel, short story, poetry",/wiki/Rudyard_Kipling
8,1908,Rudolf Christoph Eucken (1846–1926),Germany,German,"""in recognition of his earnest search for trut...",philosophy,/wiki/Rudolf_Christoph_Eucken
9,1909,Selma Lagerlöf (1858–1940),Sweden,Swedish,"""in appreciation of the lofty idealism, vivid ...","novel, short story",/wiki/Selma_Lagerl%C3%B6f


In [16]:
print(df.loc[40]['Citation'])

"for his inspired writings, which while growing in boldness and penetration, exemplify the classical humanitarian ideals and high qualities of style"[55]
