# Web Scraping of the World's Oldest People

In [255]:
from io import StringIO

import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [256]:
# Download the Wikipedia page that lists the verified oldest people.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
wiki_url = requests.get(
    'https://en.wikipedia.org/wiki/List_of_the_verified_oldest_people',
    timeout=30,
)

# Fail right here on an HTTP error (4xx/5xx) rather than parsing an error page later.
# Note: a 200 status only means the download succeeded; whether scraping is
# permitted is decided by the site's terms of use and robots.txt, not by the status code.
wiki_url.raise_for_status()
print(wiki_url)

<Response [200]>


In [257]:
# Parse the downloaded page text into a BeautifulSoup tree using Python's
# built-in "html.parser", then echo the tree as the cell output.
soup = BeautifulSoup(markup=wiki_url.text, features="html.parser")
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of the verified oldest people - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YETnuCV98M-wPE2eucFj3wAAAU0","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_the_verified_oldest_people","wgTitle":"List of the verified oldest people","wgCurRevisionId":1010728595,"wgRevisionId":1010728595,"wgArticleId":9945566,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 French-language sources (fr)","CS1 German-language sources (de)","CS1 Japanese-language sources (ja)","CS1 u

In [366]:
# Collect every <table class="wikitable"> element from the parsed page
table = soup.find_all('table', attrs={"class": "wikitable"})

# Parse the table markup into a list of DataFrames (one per table).
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))

# The code below relies on two tables being present; fail with a clear
# message instead of an IndexError if the page layout has changed.
if len(df) < 2:
    raise ValueError(
        f"Expected at least 2 wikitable tables on the page, found {len(df)}"
    )

# read_html already returns DataFrames; copy them so that adding columns
# below does not modify the frames stored in the `df` list.
df1 = df[0].copy()
df2 = df[1].copy()

# Before merging the dataframes, tag each table with a Gender column
# (first table -> 'F', second table -> 'M')
df1['Gender'] = 'F'
df2['Gender'] = 'M'

In [367]:
# Some headers in df2 carry a footnote marker (e.g. "Name[a]").
# Drop everything from the first '[' in each header so the name no longer
# depends on which footnote letter the page currently uses; headers without
# a marker (and any non-string headers) are left unchanged.
df2 = df2.rename(
    columns=lambda col: col.split('[', 1)[0] if isinstance(col, str) else col
)
df2.head()

Unnamed: 0,Rank,Name,Birth date,Death date,Age,Place of death or residence,Gender
0,1,Jiroemon Kimura,19 April 1897,12 June 2013,"116 years, 54 days",Japan,M
1,2,Christian Mortensen,16 August 1882,25 April 1998,"115 years, 252 days",United States[b],M
2,3,Emiliano Mercado del Toro,21 August 1891,24 January 2007,"115 years, 156 days",Puerto Rico,M
3,4,Mathew Beard,9 July 1870,16 February 1985,"114 years, 222 days",United States,M
4,5,Walter Breuning,21 September 1896,14 April 2011,"114 years, 205 days",United States,M


In [368]:
# Stack df1 on top of df2 into a single dataframe (df3); ignore_index=True
# discards the original row labels and renumbers the rows from 0.
df3 = pd.concat(objs=(df1, df2), axis=0, ignore_index=True)
print(df3.shape)

(201, 7)


In [369]:
def _strip_footnote(value):
    """Return ``value`` with everything from the first '[' onwards removed.

    Only strings are modified. Any other value (numbers, NaN, ...) is returned
    unchanged, so missing values stay missing instead of becoming the text
    "nan" and numeric columns keep their dtype.
    """
    if isinstance(value, str):
        return value.split('[', 1)[0]
    return value


# Remove footnote markers such as "[b]" from every cell of the dataframe.
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
df3 = df3.apply(lambda column: column.map(_strip_footnote))
df3.tail()

Unnamed: 0,Rank,Name,Birth date,Death date,Age,Place of death or residence,Gender
196,96,Daniel Guzmán-García,6 February 1897,21 May 2008,"111 years, 105 days",Colombia,M
197,97,Sadayoshi Tanabe,20 October 1888,18 January 2000,"111 years, 90 days",Japan,M
198,98,Albano Andrade,14 December 1909,Living,"111 years, 83 days",Portugal,M
199,99,Clarence Matthews,1 May 1906,22 July 2017,"111 years, 82 days",United States,M
200,100,Makaru Nakanishi,15 December 1901,28 February 2013,"111 years, 75 days",Japan,M


In [370]:
# NOTE: pop() removes the source columns from df3, so this cell can only be
# run once on a freshly built df3 (a second run raises KeyError).

# Add a 'Living' column ("Yes" when 'Death date' holds the text "Living",
# otherwise "No") and drop the 'Death date' column.
still_alive = df3.pop('Death date').eq('Living')
df3['Living'] = still_alive.map({True: 'Yes', False: 'No'})

# Split 'Age' (e.g. "116 years, 54 days") at the first comma into a years
# part and a days part, and drop the original column. Each part is stripped
# so 'Age_days' does not keep the space that follows the comma.
age_parts = df3.pop('Age').str.split(',', n=1, expand=True)
df3['Age_year'] = age_parts[0].str.strip()
df3['Age_days'] = age_parts[1].str.strip()

# Rename the remaining multi-word columns to snake_case style names
df3 = df3.rename(columns={'Birth date': 'Birth_date',
                          'Place of death or residence': 'Place_of_death_or_residence'})

# Convert 'Birth_date' to datetime; errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
df3['Birth_date'] = pd.to_datetime(df3['Birth_date'], errors='coerce')

print(df3.shape)
df3.tail(110)



(201, 8)


Unnamed: 0,Rank,Name,Birth_date,Place_of_death_or_residence,Gender,Living,Age_year,Age_days
91,92,Anne Primout,1890-10-05,France,F,No,114 years,172 days
92,93,Ettie Mae Greene,1877-09-08,United States,F,No,114 years,171 days
93,94,Geertje Kuijntjes,1905-07-19,Netherlands,F,No,114 years,158 days
94,95,Thelma Sutcliffe,1906-10-01,United States,F,Yes,114 years,157 days
95,96,Dominga Velasco,1901-05-12,United States,F,No,114 years,152 days
...,...,...,...,...,...,...,...,...
196,96,Daniel Guzmán-García,1897-02-06,Colombia,M,No,111 years,105 days
197,97,Sadayoshi Tanabe,1888-10-20,Japan,M,No,111 years,90 days
198,98,Albano Andrade,1909-12-14,Portugal,M,Yes,111 years,83 days
199,99,Clarence Matthews,1906-05-01,United States,M,No,111 years,82 days


In [371]:
# Write the dataframe to a CSV file. index=False leaves out the row index,
# which would otherwise be written as an extra, unnamed first column.
df3.to_csv('Oldest_People.csv', index=False)