# Exploratory Analysis

## This script contains the following points:

### 01. Importing Libraries and Loading Files
### 02. df_results Exploratory Analysis
### 03. df_goalscorers Exploratory Analysis
### 04. df_shootouts Exploratory Analysis

In [8]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# Root folder of the project
path = r'C:\Users\widne\Documents\CareerFoundry Exercises\Data_Immersion\Achievement 6\06-2024 International Football Matches'

# Folder that holds the cleaned csv files (joined once and reused for every file)
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')

# Reading each cleaned csv file into its own dataframe
df_results = pd.read_csv(
    os.path.join(prepared_dir, 'results_cleaned.csv'),
    index_col=False,
)
df_goalscorers = pd.read_csv(
    os.path.join(prepared_dir, 'goalscorers_cleaned.csv'),
    index_col=False,
)
df_shootouts = pd.read_csv(
    os.path.join(prepared_dir, 'shootouts_cleaned.csv'),
    index_col=False,
)

In [12]:
# Dropping the 'Unnamed: 0' column from each dataframe.
# errors='ignore' keeps this cell safe to re-run: when the column has already
# been removed, drop() returns the dataframe unchanged instead of raising KeyError.
df_results = df_results.drop(columns=['Unnamed: 0'], errors='ignore')
df_goalscorers = df_goalscorers.drop(columns=['Unnamed: 0'], errors='ignore')
df_shootouts = df_shootouts.drop(columns=['Unnamed: 0'], errors='ignore')

In [13]:
# Checking the first rows of df_results
df_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [14]:
# Checking the first rows of df_goalscorers
df_goalscorers.head()

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False


In [15]:
# Checking the first rows of df_shootouts
df_shootouts.head()

Unnamed: 0,date,home_team,away_team,winner,first_shooter
0,1967-08-22,India,Taiwan,Taiwan,Unknown
1,1971-11-14,South Korea,Vietnam Republic,South Korea,Unknown
2,1972-05-07,South Korea,Iraq,Iraq,Unknown
3,1972-05-17,Thailand,South Korea,South Korea,Unknown
4,1972-05-19,Thailand,Cambodia,Thailand,Unknown


### 02. df_results Exploratory Analysis

In [25]:
# Getting shape (rows, columns) of df_results dataset
df_results.shape

(47075, 9)

In [18]:
# Checking the data type of each df_results column
df_results.dtypes

date           object
home_team      object
away_team      object
home_score    float64
away_score    float64
tournament     object
city           object
country        object
neutral          bool
dtype: object

In [17]:
# Numerical analysis of quantitative variables in df_results
# (count, mean, std, min, quartiles and max of each numeric column)
df_results.describe()

Unnamed: 0,home_score,away_score
count,47075.0,47075.0
mean,1.76119,1.183452
std,1.776751,1.402597
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,31.0,21.0


In [19]:
# Value counts for home_team column in df_results
df_results['home_team'].value_counts()

home_team
Brazil               600
Argentina            585
Mexico               576
South Korea          542
Germany              535
                    ... 
Western Australia      1
Sark                   1
Chechnya               1
Malaya                 1
Hmong                  1
Name: count, Length: 328, dtype: int64

In [20]:
# Value counts for away_team column in df_results
df_results['away_team'].value_counts()

away_team
Uruguay      567
Sweden       553
England      529
Hungary      496
Paraguay     478
            ... 
Cilento        1
Malaya         1
Surrey         1
Andalusia      1
Ticino         1
Name: count, Length: 323, dtype: int64

In [21]:
# Value counts for tournament column in df_results
df_results['tournament'].value_counts()

tournament
Friendly                                17902
FIFA World Cup qualification             8052
UEFA Euro qualification                  2824
African Cup of Nations qualification     2116
FIFA World Cup                            964
                                        ...  
Copa Confraternidad                         1
Real Madrid 75th Anniversary Cup            1
The Other Final                             1
TIFOCO Tournament                           1
ConIFA Challenger Cup                       1
Name: count, Length: 174, dtype: int64

In [22]:
# Value counts for city column in df_results
df_results['city'].value_counts()

city
Kuala Lumpur    724
Bangkok         570
Doha            545
London          426
Budapest        422
               ... 
Gossau            1
Ulm               1
Livorno           1
Tours             1
Chambly           1
Name: count, Length: 2058, dtype: int64

In [23]:
# Value counts for country column in df_results
df_results['country'].value_counts()

country
United States          1353
France                  887
Malaysia                812
England                 743
Thailand                699
                       ... 
Belgian Congo             1
Bohemia and Moravia       1
Mali Federation           1
Portuguese Guinea         1
Micronesia                1
Name: count, Length: 271, dtype: int64

In [24]:
# Value counts for neutral column in df_results (boolean column: True / False)
df_results['neutral'].value_counts()

neutral
False    34716
True     12359
Name: count, dtype: int64

### 03. df_goalscorers Exploratory Analysis

In [26]:
# Getting shape (rows, columns) of df_goalscorers dataset
df_goalscorers.shape

(43989, 8)

In [27]:
# Checking the data type of each df_goalscorers column
df_goalscorers.dtypes

date          object
home_team     object
away_team     object
team          object
scorer        object
minute       float64
own_goal        bool
penalty         bool
dtype: object

In [28]:
# Numerical analysis of quantitative variables in df_goalscorers
df_goalscorers.describe()

Unnamed: 0,minute
count,43989.0
mean,49.999523
std,26.291436
min,1.0
25%,28.0
50%,51.0
75%,73.0
max,122.0


In [29]:
# Value counts for home_team column in df_goalscorers
df_goalscorers['home_team'].value_counts()

home_team
Brazil              1021
Argentina            983
Germany              783
Mexico               701
France               664
                    ... 
Somalia                5
Yemen DPR              5
Vietnam Republic       4
South Sudan            4
Saarland               4
Name: count, Length: 220, dtype: int64

In [30]:
# Value counts for away_team column in df_goalscorers
df_goalscorers['away_team'].value_counts()

away_team
Uruguay          782
Paraguay         656
Spain            565
Germany          562
Peru             561
                ... 
Eritrea           17
Yemen DPR         15
Saarland           8
South Sudan        5
French Guiana      3
Name: count, Length: 220, dtype: int64

In [31]:
# Value counts for team column in df_goalscorers
df_goalscorers['team'].value_counts()

team
Brazil           1046
Germany           963
Argentina         939
Spain             878
Mexico            843
                 ... 
Eritrea             4
Somalia             3
Anguilla            2
South Sudan         2
French Guiana       2
Name: count, Length: 220, dtype: int64

In [32]:
# Value counts for scorer column in df_goalscorers
df_goalscorers['scorer'].value_counts()

scorer
Cristiano Ronaldo     108
Robert Lewandowski     61
Romelu Lukaku          60
Harry Kane             55
Lionel Messi           54
                     ... 
Emin Imamaliev          1
David Nugent            1
Craig Beattie           1
Marcel Bossi            1
Jassem Gaber            1
Name: count, Length: 14250, dtype: int64

In [33]:
# Value counts for own_goal column in df_goalscorers (boolean column: True / False)
df_goalscorers['own_goal'].value_counts()

own_goal
False    43184
True       805
Name: count, dtype: int64

In [34]:
# Value counts for penalty column in df_goalscorers (boolean column: True / False)
df_goalscorers['penalty'].value_counts()

penalty
False    41048
True      2941
Name: count, dtype: int64

### 04. df_shootouts Exploratory Analysis

In [35]:
# Getting shape (rows, columns) of df_shootouts dataset
df_shootouts.shape

(636, 5)

In [36]:
# Checking data types for df_shootouts columns
df_shootouts.dtypes

date             object
home_team        object
away_team        object
winner           object
first_shooter    object
dtype: object

In [37]:
# Value counts for home_team column in df_shootouts
df_shootouts['home_team'].value_counts()

home_team
South Africa                18
Thailand                    15
Zambia                      15
Iran                        13
Senegal                     13
                            ..
Isle of Man                  1
Saint Lucia                  1
Mayotte                      1
Alderney                     1
Turks and Caicos Islands     1
Name: count, Length: 180, dtype: int64

In [38]:
# Value counts for away_team column in df_shootouts
df_shootouts['away_team'].value_counts()

away_team
Egypt                           15
Uganda                          15
South Korea                     13
Argentina                       11
Cameroon                        11
                                ..
Maldives                         1
Slovenia                         1
Haiti                            1
Trinidad and Tobago              1
United States Virgin Islands     1
Name: count, Length: 189, dtype: int64

In [39]:
# Value counts for winner column in df_shootouts
df_shootouts['winner'].value_counts()

winner
South Korea    15
Egypt          14
Zambia         14
Argentina      14
Thailand       13
               ..
Sápmi           1
Corsica         1
Menorca         1
Saint Lucia     1
Anguilla        1
Name: count, Length: 174, dtype: int64

In [40]:
# Value counts for first_shooter column in df_shootouts
# (the 'Unknown' placeholder is counted as its own category)
df_shootouts['first_shooter'].value_counts()

first_shooter
Unknown                   414
Colombia                   11
Italy                      10
Brazil                      9
Argentina                   7
                         ... 
Vietnam                     1
Benin                       1
Madagascar                  1
Tanzania                    1
British Virgin Islands      1
Name: count, Length: 86, dtype: int64