In [1]:
#importing libraries:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Task 1: Data preparation

Load the dataset and perform necessary preprocessing steps: handling missing values, encoding categorical variables and spliting into features and Target.

In [2]:
data = pd.read_csv("../datasets/FPL_20_21.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 713 entries, 0 to 712
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   First_Name      713 non-null    object 
 1   Second_Name     713 non-null    object 
 2   Club            713 non-null    object 
 3   Goals_Scored    713 non-null    int64  
 4   Assists         713 non-null    int64  
 5   Total_Points    713 non-null    int64  
 6   Minutes         713 non-null    int64  
 7   Saves           713 non-null    int64  
 8   Goals_Conceded  713 non-null    int64  
 9   Creativity      713 non-null    float64
 10  Influence       713 non-null    float64
 11  Threat          713 non-null    int64  
 12  Bonus           713 non-null    int64  
 13  BPS             713 non-null    int64  
 14  ICT_Index       713 non-null    float64
 15  Clean_Sheets    713 non-null    int64  
 16  Red_Cards       713 non-null    int64  
 17  Yellow_Cards    713 non-null    int

In [3]:
#check for missing values:
data.isna().sum() #no missing values

First_Name        0
Second_Name       0
Club              0
Goals_Scored      0
Assists           0
Total_Points      0
Minutes           0
Saves             0
Goals_Conceded    0
Creativity        0
Influence         0
Threat            0
Bonus             0
BPS               0
ICT_Index         0
Clean_Sheets      0
Red_Cards         0
Yellow_Cards      0
Position          0
dtype: int64

In [4]:
#see first few rows:
data.head()

Unnamed: 0,First_Name,Second_Name,Club,Goals_Scored,Assists,Total_Points,Minutes,Saves,Goals_Conceded,Creativity,Influence,Threat,Bonus,BPS,ICT_Index,Clean_Sheets,Red_Cards,Yellow_Cards,Position
0,Bruno,Fernandes,MUN,18,14,244,3101,0,36,1414.9,1292.6,1253,36,870,396.2,13,0,6,MID
1,Harry,Kane,TOT,23,14,242,3083,0,39,659.1,1318.2,1585,40,880,355.9,12,0,1,FWD
2,Mohamed,Salah,LIV,22,6,231,3077,0,41,825.7,1056.0,1980,21,657,385.8,11,0,0,MID
3,Heung-Min,Son,TOT,17,11,228,3119,0,36,1049.9,1052.2,1046,26,777,315.2,13,0,0,MID
4,Patrick,Bamford,LEE,17,11,194,3052,0,50,371.0,867.2,1512,26,631,274.6,10,0,3,FWD


In [5]:
#handle Club and position as categorical variables:
#Lets see number of unique categories in the two columns:
clubs = data["Club"].value_counts().reset_index()
clubs #20 clubs

Unnamed: 0,Club,count
0,BHA,41
1,MUN,40
2,WOL,40
3,FUL,39
4,WBA,38
5,EVE,37
6,LEE,37
7,SOU,36
8,LIV,36
9,SHU,36


In [6]:
positions = data["Position"].value_counts().reset_index()
positions #4 positions

Unnamed: 0,Position,count
0,MID,295
1,DEF,246
2,FWD,92
3,GK,80


In [7]:
#encoding the club variable as a categorical feature, then the position variable will be our target variable:
data_2 = pd.get_dummies(data, columns=["Club"], dtype="int", drop_first=True)
data_2.head()

Unnamed: 0,First_Name,Second_Name,Goals_Scored,Assists,Total_Points,Minutes,Saves,Goals_Conceded,Creativity,Influence,...,Club_LIV,Club_MCI,Club_MUN,Club_NEW,Club_SHU,Club_SOU,Club_TOT,Club_WBA,Club_WHU,Club_WOL
0,Bruno,Fernandes,18,14,244,3101,0,36,1414.9,1292.6,...,0,0,1,0,0,0,0,0,0,0
1,Harry,Kane,23,14,242,3083,0,39,659.1,1318.2,...,0,0,0,0,0,0,1,0,0,0
2,Mohamed,Salah,22,6,231,3077,0,41,825.7,1056.0,...,1,0,0,0,0,0,0,0,0,0
3,Heung-Min,Son,17,11,228,3119,0,36,1049.9,1052.2,...,0,0,0,0,0,0,1,0,0,0
4,Patrick,Bamford,17,11,194,3052,0,50,371.0,867.2,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#splitting the data into features and target: remove the names columns since names are not variables in this dataset
X = data_2.drop(['First_Name','Second_Name','Position'], axis=1)

In [9]:
X.head()

Unnamed: 0,Goals_Scored,Assists,Total_Points,Minutes,Saves,Goals_Conceded,Creativity,Influence,Threat,Bonus,...,Club_LIV,Club_MCI,Club_MUN,Club_NEW,Club_SHU,Club_SOU,Club_TOT,Club_WBA,Club_WHU,Club_WOL
0,18,14,244,3101,0,36,1414.9,1292.6,1253,36,...,0,0,1,0,0,0,0,0,0,0
1,23,14,242,3083,0,39,659.1,1318.2,1585,40,...,0,0,0,0,0,0,1,0,0,0
2,22,6,231,3077,0,41,825.7,1056.0,1980,21,...,1,0,0,0,0,0,0,0,0,0
3,17,11,228,3119,0,36,1049.9,1052.2,1046,26,...,0,0,0,0,0,0,1,0,0,0
4,17,11,194,3052,0,50,371.0,867.2,1512,26,...,0,0,0,0,0,0,0,0,0,0


In [10]:
#change target to int type:
data_2['Position'] = np.where(data_2.Position == 'MID', 1, data_2.Position)
data_2['Position'] = np.where(data_2.Position == 'DEF', 2, data_2.Position)
data_2['Position'] = np.where(data_2.Position == 'FWD', 3, data_2.Position)
data_2['Position'] = np.where(data_2.Position == 'GK', 4, data_2.Position)

In [11]:
y = data_2['Position'].astype("int")
y

0      1
1      3
2      1
3      1
4      3
      ..
708    1
709    1
710    2
711    1
712    2
Name: Position, Length: 713, dtype: int32

### Task 2: Feature Selection

1) Perform feature selection using the chi-square test for independence. Select the top 3 most important features based on this test: In this dataset, we do not have enough categorical variables for the chi square test.
2) Perform feature selection using the ANOVA F test. Select the top 3 most important features based on this test:

Note:
- f_classif is most applicable where the input features are continuous and the outcome is categorical.
- f_regression is most applicable where the input features are continuous and the outcome is continuous.
- chi2 is best for when the both the input and outcome are categorical.

In [13]:
#for purposes of the model evaluation step, lets split into training and testing set:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [14]:
#lets explore numerical variables and categorical outcome, since we have chosen position as a categorical type target
X_train_numerical = X_train[['Goals_Scored','Assists','Total_Points','Minutes','Saves','Goals_Conceded','Creativity','Influence','Threat','Bonus',
                            'BPS','ICT_Index','Clean_Sheets','Red_Cards','Yellow_Cards']].copy()
X_train_numerical.head()

Unnamed: 0,Goals_Scored,Assists,Total_Points,Minutes,Saves,Goals_Conceded,Creativity,Influence,Threat,Bonus,BPS,ICT_Index,Clean_Sheets,Red_Cards,Yellow_Cards
487,0,0,2,36,0,0,4.4,5.4,0,0,6,0.9,0,0,1
101,9,5,106,1423,0,19,288.2,487.0,530,15,332,130.4,4,0,3
489,0,0,1,1,0,0,0.0,0.0,0,0,3,0.0,0,0,0
533,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0
271,1,1,48,1108,0,14,210.6,170.6,235,3,148,61.5,5,0,1


In [15]:
X_train_numerical.shape

(534, 15)

In [16]:
y_train.shape

(534,)

In [17]:
y_train

487    1
101    1
489    2
533    2
271    1
      ..
707    1
192    1
629    4
559    2
684    1
Name: Position, Length: 534, dtype: int32

In [18]:
#using the anova in f_classif to select 3 most features: numerical features and categorical target
select_test = SelectKBest(score_func=f_classif, k=3)
fit = select_test.fit(X_train_numerical, y_train)
scores = fit.scores_
features = fit.transform(X_train_numerical)
selected_indices = select_test.get_support(indices=True)

In [19]:
print('Feature Scores: ', scores)
print('Selected Features Indices: ', selected_indices)

Feature Scores:  [21.65060038 12.40847912  2.52298362  4.50276758 42.57643957  3.70769096
 16.50262798  2.17900745 26.47292016  3.78724762  4.94275023  9.19751203
  4.20643039  4.38875944 11.71180135]
Selected Features Indices:  [0 4 8]


In [20]:
#see the 3 selected features:
X_train_numerical.iloc[:, [0,4,8]]

Unnamed: 0,Goals_Scored,Saves,Threat
487,0,0,0
101,9,0,530
489,0,0,0
533,0,0,0
271,1,0,235
...,...,...,...
707,0,0,0
192,0,0,189
629,0,0,0
559,0,0,0


From the ANOVA F test above, the best 3 features are: Goals_Scored, Saves and Threat

3. Combine the features selected from both the chi-square test and ANOVA F test into a final set of features:
Since, only the ANOVA test was possible for our choice of categorical target, let's combine the 3 selected features with the Club Categorical independent variable

In [21]:
#drop all columns apart from best 3 and categorical columns:
new_X_train = X_train.drop(['Assists','Total_Points','Minutes','Goals_Conceded','Creativity','Influence','Bonus',
                            'BPS','ICT_Index','Clean_Sheets','Red_Cards','Yellow_Cards'], axis=1)

In [22]:
new_X_train.head()

Unnamed: 0,Goals_Scored,Saves,Threat,Club_AVL,Club_BHA,Club_BUR,Club_CHE,Club_CRY,Club_EVE,Club_FUL,...,Club_LIV,Club_MCI,Club_MUN,Club_NEW,Club_SHU,Club_SOU,Club_TOT,Club_WBA,Club_WHU,Club_WOL
487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
101,9,0,530,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
489,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
533,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
271,1,0,235,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Task 3: Model Evaluation

4) Train a machine learning model e.g logistic regression using the selected features: using decision tree since categorical target has four possible outcomes

In [23]:
#what are the dimensions of selected X_Train and y_train:
new_X_train.shape

(534, 22)

In [24]:
y_train.shape

(534,)

In [25]:
y_train

487    1
101    1
489    2
533    2
271    1
      ..
707    1
192    1
629    4
559    2
684    1
Name: Position, Length: 534, dtype: int32

In [26]:
y_test

338    1
142    1
242    1
235    1
468    2
      ..
399    2
77     3
215    2
492    2
109    2
Name: Position, Length: 179, dtype: int32

In [27]:
#select X_test columns corresponding to new train x:
new_X_test = X_test.drop(['Assists','Total_Points','Minutes','Goals_Conceded','Creativity','Influence','Bonus',
                            'BPS','ICT_Index','Clean_Sheets','Red_Cards','Yellow_Cards'], axis=1)
new_X_test.shape

(179, 22)

In [28]:
#the target variable has four possible outcomes: train a decision tree classifier
np.random.seed(0)
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(new_X_train, y_train)

In [31]:
dt_classifier.score(new_X_test, y_test)

0.4748603351955307

In [32]:
Y_predicted = dt_classifier.predict(X = new_X_test)
Y_predicted

array([1, 3, 1, 3, 1, 1, 1, 3, 1, 1, 2, 3, 4, 3, 1, 4, 1, 1, 2, 1, 3, 1,
       3, 1, 1, 1, 1, 3, 4, 1, 3, 2, 1, 1, 2, 1, 1, 2, 2, 1, 3, 3, 1, 4,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 4, 1, 2, 3, 1, 4, 1,
       1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 4, 1, 1,
       1, 2, 1, 2, 4, 1, 1, 3, 2, 2, 1, 2, 1, 2, 1, 1, 1, 4, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 4, 1, 2, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 2, 3, 1,
       2, 1, 2, 1, 1, 3, 2, 2, 1, 4, 2, 2, 1, 2, 4, 3, 2, 1, 1, 1, 1, 2,
       4, 4, 1, 1, 2, 2, 2, 4, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 2, 3, 2, 1,
       2, 1, 1])

In [33]:
np.array(y_test)

array([1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 4, 2, 1, 4, 2, 1, 2, 4, 3, 1,
       1, 1, 1, 1, 2, 3, 2, 1, 2, 2, 2, 2, 1, 1, 3, 3, 2, 4, 1, 2, 2, 4,
       4, 3, 2, 1, 3, 3, 2, 1, 1, 2, 4, 3, 1, 2, 2, 4, 2, 3, 3, 2, 2, 3,
       4, 2, 2, 3, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 3, 4, 1, 1,
       4, 1, 1, 1, 4, 1, 1, 3, 2, 2, 1, 1, 2, 2, 1, 1, 2, 4, 1, 1, 3, 2,
       2, 2, 1, 2, 1, 1, 4, 1, 1, 1, 2, 1, 4, 1, 2, 4, 3, 1, 2, 3, 1, 2,
       1, 4, 2, 1, 2, 1, 1, 2, 1, 4, 1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 1, 3,
       4, 4, 2, 1, 2, 4, 2, 4, 2, 2, 2, 3, 3, 2, 1, 3, 2, 2, 2, 1, 2, 3,
       2, 2, 2])

5. Evaluate the model's performance using appropriate evaluation metrics:

In [34]:
#classification accuracy = no. of correct predictions/total number of predictions:
def classificationAccuracyDt(X_test, Y_test):
    #get predictions:
    Y_predicted = dt_classifier.predict(X = X_test)

    #using true = 1 and false = 0 to count matches:
    no_correct = (Y_predicted != Y_test).sum()
    
    acc = no_correct/(Y_test.count())
    return acc

In [36]:
#calculating classification accuracy of the descision tree model on the test set with chosen features:
classificationAccuracyDt(X_test=new_X_test, Y_test=y_test)

0.5251396648044693