########## Step 1: One Hot encoding our categorical data ########## 
#Goal: pre-processing data for feature selection with RFE ( Recursive Feature Elimination)

In [2]:
import numpy as np
import pandas as pd

# import preprocessing from sklearn
from sklearn import preprocessing

In [3]:
# Load the match results from the Excel file (path is relative to the notebook).
pathDataset = "../Dataset/results.xlsx"
original_dataset = pd.read_excel(pathDataset)

# Make sure the date column is datetime64. A single conversion is enough:
# a second call with a hand-written format string adds nothing once the column
# is already datetime, and a malformed format such as '%Y%-m-%d' is only a trap.
original_dataset['date'] = pd.to_datetime(original_dataset['date'])

# Display the first 3 rows
original_dataset.head(3)


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland
1,1872-11-30,England,Scotland,4,2,Friendly,London,England
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland


In [4]:
# Keep only the object-dtype columns, i.e. the string-valued categorical
# features, and preview the first three rows.
columns_categorical_subset = original_dataset.select_dtypes(include=['object'])
columns_categorical_subset.head(3)

Unnamed: 0,home_team,away_team,tournament,city,country
0,Scotland,England,Friendly,Glasgow,Scotland
1,England,Scotland,Friendly,London,England
2,Scotland,England,Friendly,Glasgow,Scotland


In [5]:
# Shape of the categorical subset as (rows, columns), before any encoding
columns_categorical_subset.shape

(38685, 5)

In [6]:
# List the names of the categorical columns selected by select_dtypes
columns_categorical_subset.columns

Index(['home_team', 'away_team', 'tournament', 'city', 'country'], dtype='object')

In [7]:
# Label-encode every column of the original frame (dates and scores included)
# so that the whole dataset becomes integer codes between 0 and n_classes-1.
#
# One LabelEncoder is kept per column in `label_encoders`: a single shared
# encoder is overwritten on every fit, which would leave only the last
# column's mapping available and make the other codes impossible to invert.
label_encoders = {}
encoded_columns = {}
for column_name in original_dataset.columns:
    # 1. INSTANTIATE a fresh encoder for this column
    le = preprocessing.LabelEncoder()
    # 2/3. FIT AND TRANSFORM the column into integer codes
    encoded_columns[column_name] = le.fit_transform(original_dataset[column_name])
    label_encoders[column_name] = le

# `le` ends up bound to the encoder of the last column.
# Rebuild a frame with the same index and column order as the original.
final_dataset = pd.DataFrame(encoded_columns, index=original_dataset.index)
final_dataset.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country
0,0,184,64,0,0,47,559,201
1,0,65,180,4,2,47,890,68
2,1,184,64,2,1,47,559,201
3,2,65,180,2,2,47,890,68
4,3,184,64,3,0,47,559,201


In [8]:
# One-hot encode the label-encoded frame: every distinct integer code in every
# column becomes its own 0/1 indicator column.

# 1. INSTANTIATE
enc = preprocessing.OneHotEncoder()

# 2/3. FIT, then TRANSFORM the same frame and convert the result to a dense array
onehotlabels = enc.fit(final_dataset).transform(final_dataset).toarray()

# Explanation:
# the number of rows is unchanged (one per match), but there are far more
# columns because each category value now has an indicator column of its own.
# NOTE(review): .toarray() materialises the full dense matrix; at the shape
# reported below that is several GB if the dtype is float64 - confirm it fits in memory.
onehotlabels.shape

(38685, 17429)

In [9]:
# Show the (truncated) repr of the dense one-hot matrix
onehotlabels

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
# Check the container type produced by .toarray()
type(onehotlabels)

numpy.ndarray

# Step 2: Feature Selection with Different Algorithms

In [11]:
# Preview the label-encoded frame again before it is split into features and target
final_dataset.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country
0,0,184,64,0,0,47,559,201
1,0,65,180,4,2,47,890,68
2,1,184,64,2,1,47,559,201
3,2,65,180,2,2,47,890,68
4,3,184,64,3,0,47,559,201


In [12]:
# Work on the raw NumPy values of the encoded frame.
final_dataset_values = final_dataset.values

# Columns 0..6 are the input features; column 7 is the target.
X, Y = final_dataset_values[:, :7], final_dataset_values[:, 7]

In [15]:
# Target vector: column 7 of final_dataset_values
Y

array([201,  68, 201, ..., 236,  10, 184])

# Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

In [19]:
# Univariate feature selection: score each feature against the target with a
# chi-squared test and keep the four best-scoring features.
import pandas
import numpy
from sklearn.feature_selection import SelectKBest, chi2

# Build the selector and fit it on the feature matrix and target
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# Reduce X to the selected columns
features = fit.transform(X)

# Print arrays with three digits of precision
numpy.set_printoptions(precision=3)

# Per-feature chi-squared scores
print("#######################################")
print(fit.scores_)

# First five rows of the selected features
print("#######################################")
print(features[:5])

#######################################
[  1.698e+07   9.663e+05   8.575e+04   3.399e+03   3.931e+03   8.677e+04
   6.702e+06]
#######################################
[[  0 184  47 559]
 [  0  65  47 890]
 [  1 184  47 559]
 [  2  65  47 890]
 [  3 184  47 559]]


# RFE Algorithm
The Recursive Feature Elimination (RFE) method is a feature selection approach. It works by recursively removing attributes and building a model on those attributes that remain. At each step it uses the fitted model's coefficients (or feature importances) to rank the attributes and drops the weakest ones, which identifies the attributes (and combination of attributes) that contribute the most to predicting the target attribute.

In [14]:
# Import packages
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination around a logistic-regression estimator,
# keeping 4 features.
model = LogisticRegression()
# n_features_to_select is passed by keyword: recent scikit-learn releases
# accept only the estimator positionally and reject RFE(model, 4).
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X, Y)

# Report how many features were kept, the boolean mask of kept features, and
# the ranking of every feature (rank 1 = selected).
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 4
Selected Features: [False  True False  True  True  True False]
Feature Ranking: [4 1 3 1 1 1 2]


# Principal Component Analysis Algorithm

In [23]:
# Import package
from sklearn.decomposition import PCA

# Project the feature matrix onto its first four principal components
pca = PCA(n_components=4)
fit = pca.fit(X)

# Share of the total variance captured by each component, followed by the
# component vectors (one row per component, one entry per input feature)
print("Explained Variance:", fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [  9.841e-01   1.532e-02   2.808e-04   2.734e-04]
[[ -1.000e+00  -5.553e-04   6.992e-04   5.248e-05   4.311e-05   1.161e-04
   -2.722e-03]
 [ -2.720e-03  -5.993e-03  -3.172e-03   2.079e-05   1.177e-05   2.468e-03
    1.000e+00]
 [ -8.740e-04   4.530e-01  -8.915e-01   2.397e-04  -3.637e-04   4.176e-03
   -1.261e-04]
 [ -1.980e-04   8.915e-01   4.529e-01  -2.686e-05   1.303e-06  -9.024e-03
    6.801e-03]]


# Feature Importance Algorithm
You can see that we are given an importance score for each attribute, where the larger the score, the more important the attribute. In the output shown below, the scores suggest the importance of home_team and city (the second and the last feature of X).

In [22]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble and read one importance score per feature.
# The trees are built with randomness, so the seed is fixed to make the
# printed importances reproducible from run to run.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_)

[ 0.107  0.316  0.074  0.044  0.04   0.106  0.312]
