# 1. Extract-Transform-Load (ETL)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Load Files Into a DataFrame

In [2]:
# Load the Euro 2012 team statistics from a CSV path relative to the notebook's working directory.
df = pd.read_csv("./data/euro2012_statistic.csv")
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,13,81.3%,41,62,2,9,0,9,9,16
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,10,66.7%,25,38,8,4,0,7,7,15
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,22,88.1%,43,45,6,5,0,11,11,16
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,6,54.6%,36,51,5,6,0,11,11,19


## Shape

In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(16, 35)

## Info About the Data

In [4]:
# Per-column non-null counts and dtypes; percentage-style columns show up as object (string) dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Team                        16 non-null     object 
 1   Goals                       16 non-null     int64  
 2   Shots on target             16 non-null     int64  
 3   Shots off target            16 non-null     int64  
 4   Shooting Accuracy           16 non-null     object 
 5   % Goals-to-shots            16 non-null     object 
 6   Total shots (inc. Blocked)  16 non-null     int64  
 7   Hit Woodwork                16 non-null     int64  
 8   Penalty goals               16 non-null     int64  
 9   Penalties not scored        16 non-null     int64  
 10  Headed goals                16 non-null     int64  
 11  Passes                      16 non-null     int64  
 12  Passes completed            16 non-null     int64  
 13  Passing Accuracy            16 non-nu

## Checking Columns

In [5]:
# List every column label available in the frame.
df.columns

Index(['Team', 'Goals', 'Shots on target', 'Shots off target',
       'Shooting Accuracy', '% Goals-to-shots', 'Total shots (inc. Blocked)',
       'Hit Woodwork', 'Penalty goals', 'Penalties not scored', 'Headed goals',
       'Passes', 'Passes completed', 'Passing Accuracy', 'Touches', 'Crosses',
       'Dribbles', 'Corners Taken', 'Tackles', 'Clearances', 'Interceptions',
       'Clearances off line', 'Clean Sheets', 'Blocks', 'Goals conceded',
       'Saves made', 'Saves-to-shots ratio', 'Fouls Won', 'Fouls Conceded',
       'Offsides', 'Yellow Cards', 'Red Cards', 'Subs on', 'Subs off',
       'Players Used'],
      dtype='object')

## Checking Null Values

In [6]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Team                          0
Goals                         0
Shots on target               0
Shots off target              0
Shooting Accuracy             0
% Goals-to-shots              0
Total shots (inc. Blocked)    0
Hit Woodwork                  0
Penalty goals                 0
Penalties not scored          0
Headed goals                  0
Passes                        0
Passes completed              0
Passing Accuracy              0
Touches                       0
Crosses                       0
Dribbles                      0
Corners Taken                 0
Tackles                       0
Clearances                    0
Interceptions                 0
Clearances off line           1
Clean Sheets                  0
Blocks                        0
Goals conceded                0
Saves made                    0
Saves-to-shots ratio          0
Fouls Won                     0
Fouls Conceded                0
Offsides                      0
Yellow Cards                  0
Red Card

# 2. Exploratory Data Analysis (EDA)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Goals,Shots on target,Shots off target,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,Headed goals,Passes,Passes completed,...,Goals conceded,Saves made,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,...,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,4.75,17.125,24.9375,54.9375,1.25,0.1875,0.0625,1.375,1763.375,1467.375,...,4.75,12.125,52.9375,55.625,8.125,7.4375,0.1875,10.875,10.875,17.25
std,2.886751,10.582218,10.680005,26.065223,1.612452,0.403113,0.25,1.024695,906.177898,827.580721,...,1.983263,4.573474,22.915697,18.973227,4.910872,3.265348,0.403113,3.53789,3.53789,1.527525
min,1.0,7.0,10.0,27.0,0.0,0.0,0.0,0.0,851.0,606.0,...,1.0,6.0,25.0,30.0,2.0,4.0,0.0,7.0,7.0,15.0
25%,2.75,9.75,18.0,36.5,0.0,0.0,0.0,0.75,1190.75,951.5,...,3.0,9.75,35.75,44.5,4.0,5.0,0.0,8.5,8.5,16.0
50%,4.5,13.0,23.5,44.0,1.0,0.0,0.0,1.5,1522.0,1211.5,...,5.0,11.0,45.5,51.0,7.5,6.5,0.0,10.5,10.5,17.0
75%,5.25,22.0,32.25,68.75,2.0,0.0,0.0,2.0,1934.75,1546.5,...,6.0,13.5,64.0,64.75,11.25,9.0,0.0,12.5,12.5,18.25
max,12.0,42.0,45.0,110.0,6.0,1.0,1.0,3.0,4317.0,3820.0,...,9.0,22.0,102.0,90.0,19.0,16.0,1.0,18.0,18.0,20.0


In [8]:
# Show the column labels again as a reference for the selections that follow.
df.columns

Index(['Team', 'Goals', 'Shots on target', 'Shots off target',
       'Shooting Accuracy', '% Goals-to-shots', 'Total shots (inc. Blocked)',
       'Hit Woodwork', 'Penalty goals', 'Penalties not scored', 'Headed goals',
       'Passes', 'Passes completed', 'Passing Accuracy', 'Touches', 'Crosses',
       'Dribbles', 'Corners Taken', 'Tackles', 'Clearances', 'Interceptions',
       'Clearances off line', 'Clean Sheets', 'Blocks', 'Goals conceded',
       'Saves made', 'Saves-to-shots ratio', 'Fouls Won', 'Fouls Conceded',
       'Offsides', 'Yellow Cards', 'Red Cards', 'Subs on', 'Subs off',
       'Players Used'],
      dtype='object')

In [9]:
# Which teams were booked the most? Keep only the card columns, then rank by
# red cards first and break ties with yellow cards (both descending).
discipline = df.loc[:, ['Team', 'Yellow Cards', 'Red Cards']]
discipline.sort_values(by=['Red Cards', 'Yellow Cards'], ascending=False)

Unnamed: 0,Team,Yellow Cards,Red Cards
6,Greece,9,1
9,Poland,7,1
11,Republic of Ireland,6,1
7,Italy,16,0
10,Portugal,12,0
13,Spain,11,0
0,Croatia,9,0
1,Czech Republic,7,0
14,Sweden,7,0
4,France,6,0


In [10]:
# Teams that scored more than 6 goals (boolean-mask row selection).
df.loc[df['Goals'] > 6]

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
5,Germany,10,32,32,47.8%,15.6%,80,2,1,0,...,10,62.6%,63,49,12,4,0,15,15,17
13,Spain,12,42,33,55.9%,16.0%,100,0,1,0,...,15,93.8%,102,83,19,11,0,17,17,18


In [11]:
# Positional selection: keep every row, and every column except the last three.
df.iloc[:, :-3]

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Clean Sheets,Blocks,Goals conceded,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,0,10,3,13,81.3%,41,62,2,9,0
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,1,10,6,9,60.1%,53,73,8,7,0
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,1,10,5,10,66.7%,25,38,8,4,0
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,2,29,3,22,88.1%,43,45,6,5,0
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,1,7,5,6,54.6%,36,51,5,6,0
5,Germany,10,32,32,47.8%,15.6%,80,2,1,0,...,1,11,6,10,62.6%,63,49,12,4,0
6,Greece,5,8,18,30.7%,19.2%,32,1,1,1,...,1,23,7,13,65.1%,67,48,12,9,1
7,Italy,6,34,45,43.0%,7.5%,110,2,0,0,...,2,18,7,20,74.1%,101,89,16,16,0
8,Netherlands,2,12,36,25.0%,4.1%,60,2,0,0,...,0,9,5,12,70.6%,35,30,3,5,0
9,Poland,2,15,23,39.4%,5.2%,48,0,0,0,...,0,8,3,6,66.7%,48,56,3,7,1


In [14]:
# Inspect the raw woodwork-hit counts before binning them.
df['Hit Woodwork']

0     0
1     0
2     1
3     0
4     1
5     2
6     1
7     2
8     2
9     0
10    6
11    0
12    2
13    0
14    3
15    0
Name: Hit Woodwork, dtype: int64

In [16]:
# Derive a categorical "luck_index" column from the 'Hit Woodwork' counts.
# pd.cut with bins=3 splits the observed range into three equal-width intervals;
# labels are assigned from the lowest interval upward, so the fewest woodwork
# hits map to "High" luck and the most map to "Low".
df['luck_index'] = pd.cut(
    df['Hit Woodwork'],
    bins=3,
    labels=["High", "Medium", "Low"],
)
df

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used,luck_index
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,81.3%,41,62,2,9,0,9,9,16,High
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,60.1%,53,73,8,7,0,11,11,19,High
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,66.7%,25,38,8,4,0,7,7,15,High
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,88.1%,43,45,6,5,0,11,11,16,High
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,54.6%,36,51,5,6,0,11,11,19,High
5,Germany,10,32,32,47.8%,15.6%,80,2,1,0,...,62.6%,63,49,12,4,0,15,15,17,High
6,Greece,5,8,18,30.7%,19.2%,32,1,1,1,...,65.1%,67,48,12,9,1,12,12,20,High
7,Italy,6,34,45,43.0%,7.5%,110,2,0,0,...,74.1%,101,89,16,16,0,18,18,19,High
8,Netherlands,2,12,36,25.0%,4.1%,60,2,0,0,...,70.6%,35,30,3,5,0,7,7,15,High
9,Poland,2,15,23,39.4%,5.2%,48,0,0,0,...,66.7%,48,56,3,7,1,7,7,17,High


In [25]:
# Helper for the clean-sheet flag column: a team with zero clean sheets
# conceded in every match, so it is flagged True; any other count is False.
def no_clean_sheets(x):
    """Return True when the clean-sheet count ``x`` equals 0, otherwise False."""
    return bool(x == 0)

# Flag each team by applying no_clean_sheets to its 'Clean Sheets' count.
# NOTE(review): the column is stored as 'alway_lose_goals' (no "s" after "alway"),
# which differs from the "always_lose_goals" spelling in the cell comment — confirm
# which name later cells expect before renaming.
df['alway_lose_goals'] = df['Clean Sheets'].apply(no_clean_sheets)
df

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used,luck_index,alway_lose_goals
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,41,62,2,9,0,9,9,16,High,True
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,53,73,8,7,0,11,11,19,High,False
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,25,38,8,4,0,7,7,15,High,False
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,43,45,6,5,0,11,11,16,High,False
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,36,51,5,6,0,11,11,19,High,False
5,Germany,10,32,32,47.8%,15.6%,80,2,1,0,...,63,49,12,4,0,15,15,17,High,False
6,Greece,5,8,18,30.7%,19.2%,32,1,1,1,...,67,48,12,9,1,12,12,20,High,False
7,Italy,6,34,45,43.0%,7.5%,110,2,0,0,...,101,89,16,16,0,18,18,19,High,False
8,Netherlands,2,12,36,25.0%,4.1%,60,2,0,0,...,35,30,3,5,0,7,7,15,High,True
9,Poland,2,15,23,39.4%,5.2%,48,0,0,0,...,48,56,3,7,1,7,7,17,High,True


In [26]:
# Goal of this cell: test whether luck_index is informative by grouping on it
# and averaging 'Goals' and '% Goals-to-shots'.
# First check the dtype of the percentage column.
print("Type: ", df['% Goals-to-shots'].dtype)
# On a freshly loaded frame this prints object, i.e. the percentages are strings.

# Convert percentage strings such as "16.0%" into fractional floats (0.16).
def converter(x):
    """Convert a percentage string to a fraction.

    Parameters
    ----------
    x : str or number
        A string like ``"16.0%"``, or a value that is already numeric.

    Returns
    -------
    float
        ``float(x.strip('%')) / 100`` for strings. Non-string input is assumed
        to be an already-converted fraction and is returned as ``float(x)``
        unchanged, so re-running the conversion on the same column is a no-op
        instead of raising ``AttributeError``.

    Raises
    ------
    ValueError
        If a string does not parse as a number once '%' is stripped.
    """
    if not isinstance(x, str):
        # Already numeric (e.g. the cell was executed a second time): do not rescale.
        return float(x)
    return float(x.strip('%')) / 100

# Convert the percentage strings only while the column is still non-numeric,
# so that re-running this cell does not try to strip '%' from floats.
if not pd.api.types.is_numeric_dtype(df['% Goals-to-shots']):
    df['% Goals-to-shots'] = df['% Goals-to-shots'].apply(converter)

print("Should be float now: ", df['% Goals-to-shots'].dtypes)

# Mean goals and mean conversion rate per luck bucket. luck_index is categorical
# (built with pd.cut), so observed=False is stated explicitly: every category is
# kept in the result regardless of the pandas version's default.
df.groupby(['luck_index'], observed=False).agg({'Goals':'mean', '% Goals-to-shots':'mean'})

Type:  object
Should be float now:  float64


Unnamed: 0_level_0,Goals,% Goals-to-shots
luck_index,Unnamed: 1_level_1,Unnamed: 2_level_1
High,4.642857,0.117071
Medium,5.0,0.138
Low,6.0,0.093


#### 2.1 Univariate Analysis
- look at one variable
- main tools - countplot and displot
- countplot for discrete features
- displot for continuous features

#### 2.2 Multivariate analysis
- look at two features/labels at the same time
- e.g., see how bathrooms affect Rent -> bivariate analysis
- e.g., see how size affects Rent
- usually, y is the label and x is any given feature
- Goal: find which factors/features are somehow related to my label, i.e., Rent
- boxplot : for discrete vs continuous features
- scatterplot : for continuous vs continuous features

# 3. Feature Extraction / Selection