### Exercise 1: Use heart failure dataset to create a model and calculate its metrics

In [1]:
#import libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
#import dataset:
heart_df = pd.read_csv("./heart_failure_clinical_records_dataset.csv")
heart_df.shape

(299, 13)

In [3]:
#see top rows:
heart_df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
#see info
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


all variables are int or float type so okay to proceed without factorization

In [6]:
#check for missing values:
heart_df.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

no missing values, so okay to proceed.

In [7]:
#use a decision tree here: scaling not really required:
#choose X and Y:
X = heart_df.drop("DEATH_EVENT", axis=1)
y = heart_df['DEATH_EVENT']
X.shape

(299, 12)

In [8]:
#split into training and testing:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape)
print(y_test.shape)

(209, 12)
(90,)


In [9]:
#fitting decision tree model:
heart_tree = DecisionTreeClassifier()
heart_tree.fit(X_train, y_train)
#see score on test set:
heart_tree.score(X_test, y_test)*100

78.88888888888889

Metrics:

In [10]:
#view the confusion matrix for the decision tree model:
#first get the predicted y:
y_pred = heart_tree.predict(X_test)
y_pred


array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0], dtype=int64)

In [11]:
#confusion matrix:
heart_conf_mat = confusion_matrix(y_test, y_pred)
heart_conf_mat = pd.DataFrame(heart_conf_mat, index=[0,1], columns=[0,1])
heart_conf_mat

Unnamed: 0,0,1
0,53,9
1,10,18


In [12]:
#generate the classification report with all metrics:
heart_df_report = classification_report(y_test, y_pred, output_dict=True)
heart_df_report = pd.DataFrame(heart_df_report).transpose()
heart_df_report

Unnamed: 0,precision,recall,f1-score,support
0,0.84127,0.854839,0.848,62.0
1,0.666667,0.642857,0.654545,28.0
accuracy,0.788889,0.788889,0.788889,0.788889
macro avg,0.753968,0.748848,0.751273,90.0
weighted avg,0.786949,0.788889,0.787814,90.0


### Exercise 2: Use a pipeline to predict either genre or platform in the video games dataset

In [7]:
#import dataset:
from dataidea.datasets import loadDataset
vgsales = loadDataset("vgsales")
vgsales.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [8]:
#see no. of rows and columns:
vgsales.shape

(16598, 11)

In [9]:
#see columns info:
vgsales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [10]:
#how many unique names:
vgsales["Name"].value_counts().reset_index()

Unnamed: 0,Name,count
0,Need for Speed: Most Wanted,12
1,Ratatouille,9
2,FIFA 14,9
3,LEGO Marvel Super Heroes,9
4,Madden NFL 07,9
...,...,...
11488,Ar tonelico Qoga: Knell of Ar Ciel,1
11489,Galaga: Destination Earth,1
11490,Nintendo Presents: Crossword Collection,1
11491,TrackMania: Build to Race,1


There are 11493 names, so this column will be excluded from the features, together with Rank

In [11]:
#select features for further steps:
vgsales_df = vgsales.drop(["Rank","Name"], axis=1)
vgsales_df.shape

(16598, 9)

In [13]:
#choose the target column from either genre or platform:
vgsales_df["Genre"].value_counts().reset_index()
#12 unique values

Unnamed: 0,Genre,count
0,Action,3316
1,Sports,2346
2,Misc,1739
3,Role-Playing,1488
4,Shooter,1310
5,Adventure,1286
6,Racing,1249
7,Platform,886
8,Simulation,867
9,Fighting,848


In [14]:
#choose the target column from either genre or platform:
vgsales_df["Platform"].value_counts().reset_index()
#30 unique values

Unnamed: 0,Platform,count
0,DS,2163
1,PS2,2161
2,PS3,1329
3,Wii,1325
4,X360,1265
5,PSP,1213
6,PS,1196
7,PC,960
8,XB,824
9,GBA,822


Use genre as the target column. There are only 8 remaining columns, so decided not to do feature selection

In [15]:
#factorising the rest of the object columns: platform, publisher:
vgsales_df[["Platform","Publisher"]] = vgsales_df[["Platform","Publisher"]].apply(lambda x: pd.factorize(x)[0])
#see result:
vgsales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Platform      16598 non-null  int64  
 1   Year          16327 non-null  float64
 2   Genre         16598 non-null  object 
 3   Publisher     16598 non-null  int64  
 4   NA_Sales      16598 non-null  float64
 5   EU_Sales      16598 non-null  float64
 6   JP_Sales      16598 non-null  float64
 7   Other_Sales   16598 non-null  float64
 8   Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.1+ MB


In [16]:
vgsales_df.head()

Unnamed: 0,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,0,2006.0,Sports,0,41.49,29.02,3.77,8.46,82.74
1,1,1985.0,Platform,0,29.08,3.58,6.81,0.77,40.24
2,0,2008.0,Racing,0,15.85,12.88,3.79,3.31,35.82
3,0,2009.0,Sports,0,15.75,11.01,3.28,2.96,33.0
4,2,1996.0,Role-Playing,0,11.27,8.89,10.22,1.0,31.37


In [17]:
#standardise the columns that require standardising:
scaler = StandardScaler()
standardised_col = scaler.fit_transform(vgsales_df[["NA_Sales","EU_Sales","JP_Sales","Other_Sales","Global_Sales"]])
standardised_col = pd.DataFrame(standardised_col, columns=["NA_Sales","EU_Sales","JP_Sales","Other_Sales","Global_Sales"])
standardised_col.head()


Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,50.480508,57.13693,11.938058,44.606085,52.864025
1,35.284437,6.794188,21.767296,3.828224,25.532503
2,19.084273,25.197785,12.002724,17.297115,22.690025
3,18.961823,21.497277,10.35374,15.441165,20.876498
4,13.476053,17.302048,32.792857,5.047848,19.828254


In [18]:
#add standardised columns to the other columns:
vgsales_df_stand = pd.concat([standardised_col, vgsales_df[['Platform','Year','Genre','Publisher']]], axis=1)
vgsales_df_stand.head()

Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Platform,Year,Genre,Publisher
0,50.480508,57.13693,11.938058,44.606085,52.864025,0,2006.0,Sports,0
1,35.284437,6.794188,21.767296,3.828224,25.532503,1,1985.0,Platform,0
2,19.084273,25.197785,12.002724,17.297115,22.690025,0,2008.0,Racing,0
3,18.961823,21.497277,10.35374,15.441165,20.876498,0,2009.0,Sports,0
4,13.476053,17.302048,32.792857,5.047848,19.828254,2,1996.0,Role-Playing,0


In [19]:
#select X and y:
X = vgsales_df_stand.drop("Genre", axis=1)
y = vgsales_df_stand.Genre

In [20]:
#split into train and test: trying stratified splitting to see if there is improvement in accuracy from 28.7%
np.random.seed(0)
ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(ss.split(X,y))

In [21]:
train_index

array([ 4611,   457,  8276, ...,  6115, 11408, 12174])

In [23]:
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

In [24]:
X_test.shape

(3320, 8)

In [25]:
#since the target column is for classification, lets use a decision tree classifer:
#fitting decision tree model:
vgsales_tree = DecisionTreeClassifier()
vgsales_tree.fit(X_train, y_train)
#see score on test set:
vgsales_tree.score(X_test, y_test)*100

27.981927710843372

In [26]:
#see predictions:
y_pred = vgsales_tree.predict(X_test)
y_pred

array(['Action', 'Shooter', 'Misc', ..., 'Role-Playing', 'Platform',
       'Action'], dtype=object)

In [27]:
#unique values in y:
y_test.value_counts().reset_index

<bound method Series.reset_index of Genre
Action          663
Sports          469
Misc            348
Role-Playing    298
Shooter         262
Adventure       257
Racing          250
Platform        177
Simulation      174
Fighting        170
Strategy        136
Puzzle          116
Name: count, dtype: int64>

In [28]:
#see confusion matrix:
vgsales_conf_mat = confusion_matrix(y_test, y_pred)
vgsales_conf_mat = pd.DataFrame(vgsales_conf_mat, index=['Action','Sports','Misc','Role-Playing','Shooter','Adventure','Racing','Platform',
                                                         'Simulation','Fighting','Strategy','Puzzle'], columns=['Action','Sports','Misc',
                                                         'Role-Playing','Shooter','Adventure','Racing','Platform',
                                                         'Simulation','Fighting','Strategy','Puzzle'])
vgsales_conf_mat


Unnamed: 0,Action,Sports,Misc,Role-Playing,Shooter,Adventure,Racing,Platform,Simulation,Fighting,Strategy,Puzzle
Action,226,46,17,65,38,11,43,55,48,25,69,20
Sports,43,83,13,17,13,8,11,17,14,12,21,5
Misc,33,12,29,12,6,4,13,18,9,7,18,9
Role-Playing,47,20,17,105,20,15,22,15,18,13,43,13
Shooter,40,6,8,22,36,8,9,7,18,6,13,4
Adventure,11,9,4,22,6,12,12,11,8,7,8,6
Racing,41,10,15,19,11,8,52,8,22,23,36,5
Platform,41,17,18,24,16,8,16,93,18,11,17,19
Simulation,43,12,12,17,13,3,25,19,60,13,32,13
Fighting,31,16,6,24,6,6,8,15,13,31,10,8


In [29]:
#see metrics:
#generate the classification report with all metrics:
vgsales_report = classification_report(y_test, y_pred, output_dict=True)
vgsales_report = pd.DataFrame(vgsales_report).transpose()
vgsales_report

Unnamed: 0,precision,recall,f1-score,support
Action,0.359873,0.340875,0.350116,663.0
Adventure,0.314394,0.322957,0.318618,257.0
Fighting,0.188312,0.170588,0.179012,170.0
Misc,0.278515,0.301724,0.289655,348.0
Platform,0.180905,0.20339,0.191489,177.0
Puzzle,0.125,0.103448,0.113208,116.0
Racing,0.197719,0.208,0.202729,250.0
Role-Playing,0.322917,0.312081,0.317406,298.0
Shooter,0.218182,0.229008,0.223464,262.0
Simulation,0.166667,0.178161,0.172222,174.0


In [30]:
#pipeline for predictions:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('model', DecisionTreeClassifier())
])

In [31]:
pipe.fit(X_train, y_train)
pipe_train_score = pipe.score(X_train, y_train)
pipe_test_score = pipe.score(X_test, y_test)
pipe_pred_y = pipe.predict(X_test)

In [32]:
#see pipe results
print("Training accuracy: ", pipe_train_score)
print("Testing accuracy: ", pipe_test_score)
print(pipe_pred_y)

Training accuracy:  0.9748456092785058
Testing accuracy:  0.28343373493975904
['Racing' 'Misc' 'Action' ... 'Role-Playing' 'Platform' 'Action']
