In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Getting FIFA Countries Codes, wiki scrapper

In [3]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_FIFA_country_codes"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)
print(response.status_code)


200


In [4]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
# fifa_codes=soup.find('table',{'class':"wikitable"})
df1 = soup.find_all('table')[0]
df2 = soup.find_all('table')[1]
df3 = soup.find_all('table')[2]
df4 = soup.find_all('table')[3]

In [5]:
codes1=pd.read_html(str(df1))
# convert list to dataframe
codes1=pd.DataFrame(codes1[0])

codes2=pd.read_html(str(df2))
# convert list to dataframe
codes2=pd.DataFrame(codes2[0])

codes3=pd.read_html(str(df3))
# convert list to dataframe
codes3=pd.DataFrame(codes3[0])

codes4=pd.read_html(str(df4))
# convert list to dataframe
codes4=pd.DataFrame(codes4[0])
# print(codes1.head())

In [6]:
frames = [codes1, codes2, codes3, codes4]
country_codes = pd.concat(frames)
country_codes.rename(columns={'Country': 'team', 'Code': 'code'}, inplace=True)
country_codes['country'] = country_codes.loc[:, 'team']
country_codes

Unnamed: 0,team,code,country
0,Afghanistan,AFG,Afghanistan
1,Albania,ALB,Albania
2,Algeria,ALG,Algeria
3,American Samoa,ASA,American Samoa
4,Andorra,AND,Andorra
...,...,...,...
47,Vietnam,VIE,Vietnam
48,Wales,WAL,Wales
49,Yemen,YEM,Yemen
50,Zambia,ZAM,Zambia


## Getting Sovereign States

In [7]:
# get the response in the form of html
wikiurl2="https://en.wikipedia.org/wiki/List_of_sovereign_states_by_date_of_formation"
table_class2="wikitable sortable tpl-blanktable jquery-tablesorter"
response2=requests.get(wikiurl2)
print(response2.status_code)

200


In [8]:
# parse data from the html into a beautifulsoup object
soup_states = BeautifulSoup(response2.text, 'html.parser')

In [9]:

len(soup_states.find_all('table'))

9

In [10]:
# It is the last table on page
states = soup_states.find_all('table')[7]

In [11]:
sovereign_states=pd.read_html(str(states))
# convert list to dataframe
sovereign_states=pd.DataFrame(sovereign_states[0])

In [12]:
sovereign_states.head()

Unnamed: 0,Country,Continent,First acquisition of sovereignty,Date of last subordination,Previous governing power,Historical Notes
0,Afghanistan,Asia,1709,1796,Afsharid Empire,2021–present: Islamic Emirate of Afghanistan 2...
1,Albania,Europe,28 Nov 1912,Nov 1944[44],Germany,1992–present: Republic of Albania 1946–92: Peo...
2,Algeria,Africa,202 BC,3 July 1962[46],France,1962–present: People's Democratic Republic of ...
3,Andorra,Europe,7 Sep 1278[48][49],Nov 1944[50],France,1278–present: Principality of Andorra (via Par...
4,Angola,Africa,11 Nov 1975[51],11 Nov 1975,Portugal,1992–present: Republic of Angola 1975–1992: Pe...


In [13]:
sovereign_states = sovereign_states[['Country', 'Continent']]
sovereign_states.columns = sovereign_states.columns.str.lower()
sovereign_states

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa
...,...,...
190,Venezuela,The Americas
191,Vietnam,Asia
192,Yemen,Asia
193,Zambia,Africa


## Connecting Sovereing States to FIFA teams and codes

In [14]:
# Teams in country_codes (fifa teams), but not in sovereign states countries (because they are not really countries)
print(list(set(country_codes['country'].unique()) - set(sovereign_states['country'].unique())))

['American Samoa', 'Hong Kong', 'Faroe Islands', 'Bermuda', 'Turks and Caicos Islands', 'Curaçao', 'Cook Islands', 'Montserrat', 'DR Congo', 'Aruba', 'Republic of Ireland', 'Anguilla', 'Tahiti', 'Congo', 'British Virgin Islands', 'Cayman Islands', 'Northern Ireland', 'Czech Republic', 'Guam', 'Wales', 'Macau', 'Scotland', 'Puerto Rico', 'Gibraltar', 'England', 'New Caledonia', 'Kosovo', 'Chinese Taipei', 'U.S. Virgin Islands']


In [15]:
# Countries that are in sovereign states, but not in fifa teams 
print(list(set(sovereign_states['country'].unique()) - set(country_codes['country'].unique())))

['Tuvalu', 'Ireland', 'Micronesia, Federated States of', 'Monaco', 'Congo, Democratic Republic of the', 'Czechia', 'Marshall Islands', 'United Kingdom', 'Nauru', 'Vatican City', 'Congo, Republic of the', 'Palau', 'Kiribati']


In [16]:
# Change a few names in country_codes in order to match teams and their country
team_to_change = ['Chinese Taipei', 'Congo', 'England', 'Wales', 'Northern Ireland', 'Scotland' ]
changes = ['Taiwan', 'Republic of the Congo', 'United Kingdom', 'United Kingdom', 'United Kingdom', 'United Kingdom']


In [17]:
country_codes['country'] = country_codes['country'].replace(team_to_change,changes)

In [18]:
country_codes

Unnamed: 0,team,code,country
0,Afghanistan,AFG,Afghanistan
1,Albania,ALB,Albania
2,Algeria,ALG,Algeria
3,American Samoa,ASA,American Samoa
4,Andorra,AND,Andorra
...,...,...,...
47,Vietnam,VIE,Vietnam
48,Wales,WAL,United Kingdom
49,Yemen,YEM,Yemen
50,Zambia,ZAM,Zambia


In [19]:
country_codes.to_csv('./data/country_codes.csv', index=False)