## Data reading

In [56]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [57]:
df = pd.read_csv('../data/iris.csv',header=None)

In [58]:
df.head(10)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [59]:
df.columns = ['x1','x2','x3','x4','y']

In [60]:
X = df.drop('y',axis=1)
y = df['y']

In [61]:
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [62]:
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [63]:
z = le.inverse_transform([0,1,2])
z

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [64]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=1)

In [65]:
X_train.head()

Unnamed: 0,x1,x2,x3,x4
39,5.1,3.4,1.5,0.2
36,5.5,3.5,1.3,0.2
117,7.7,3.8,6.7,2.2
139,6.9,3.1,5.4,2.1
107,7.3,2.9,6.3,1.8


In [66]:
X_train.isna().sum()

x1    0
x2    0
x3    0
x4    0
dtype: int64

## Transformers

In [67]:
# Transformers

pca = PCA()
standard = StandardScaler()

numeric_transformer = Pipeline([
    ('pca',pca),
    ('standard_scailer',standard)
    ])

# output_transformer = Pipeline([
#     ('le',le)
# ])

In [68]:
# Column separations
numeric_cols = df.columns[:-1]
# output_cols = df.columns[-1]

In [69]:
# Column Transformer
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num',numeric_transformer,numeric_cols),
    # ('output',output_transformer,output_cols)
])

## Fitting models

In [70]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

rfc = Pipeline([
    ('preprocessor',preprocessor),
    ('clf',clf)
])

In [71]:
rfc.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('pca',
                                                                   PCA()),
                                                                  ('standard_scailer',
                                                                   StandardScaler())]),
                                                  Index(['x1', 'x2', 'x3', 'x4'], dtype='object'))])),
                ('clf', RandomForestClassifier())])

In [72]:
pred = rfc.predict(X_test)
pred
# le.inverse_transform(pred)

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       1, 2, 0, 0, 0, 1])

In [73]:
le.inverse_transform(pred)

array(['Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor'], dtype=object)

In [74]:
print(classification_report(y_test,pred))

precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.95      0.95      0.95        19
           2       0.93      0.93      0.93        14

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



In [75]:
accuracy_score(y_test,pred)

0.96

## Model Selection

In [76]:
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


In [77]:
classifiers = [
    LinearSVC(),
    NuSVC(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

In [78]:
for clf in classifiers:
    pipe = Pipeline([
    ('preprocessor',preprocessor),
    ('clf',clf)])
    pipe.fit(X_train,y_train)
    pred = pipe.predict(X_test)

    print(clf)
    print(f'accuracy score: {accuracy_score(y_test,pred): .4f}')
    print(classification_report(y_test,pred))
    print('\n')

LinearSVC()
accuracy score:  0.9000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.94      0.79      0.86        19
           2       0.76      0.93      0.84        14

    accuracy                           0.90        50
   macro avg       0.90      0.91      0.90        50
weighted avg       0.91      0.90      0.90        50



NuSVC()
accuracy score:  0.9200
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.89      0.89      0.89        19
           2       0.86      0.86      0.86        14

    accuracy                           0.92        50
   macro avg       0.92      0.92      0.92        50
weighted avg       0.92      0.92      0.92        50



SVC()
accuracy score:  0.9200
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.89    

So the best Linear Discrimination Analysis is looking like the best predicting model

## Grid search CV for best pipeline

In [117]:
from sklearn.model_selection import GridSearchCV


In [142]:
pipe = Pipeline([
    ('preprocessor',preprocessor),
    ('clf',LinearDiscriminantAnalysis())
])

In [143]:
param = {
    'preprocessor__num__pca__n_components':[1,2,3]
}

In [144]:
grid = GridSearchCV(pipe,param_grid=param,cv=5)

In [145]:
pipe.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('pca', PCA()),
                                                    ('standard_scailer',
                                                     StandardScaler())]),
                                    Index(['x1', 'x2', 'x3', 'x4'], dtype='object'))])),
  ('clf', LinearDiscriminantAnalysis())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('pca', PCA()),
                                                  ('standard_scailer',
                                                   StandardScaler())]),
                                  Index(['x1', 'x2', 'x3', 'x4'], dtype='object'))]),
 'clf': LinearDiscriminantAnalysis(),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'drop',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transformer_weights': None,
 'prep

In [146]:
grid.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('pca',
                                                                                          PCA()),
                                                                                         ('standard_scailer',
                                                                                          StandardScaler())]),
                                                                         Index(['x1', 'x2', 'x3', 'x4'], dtype='object'))])),
                                       ('clf', LinearDiscriminantAnalysis())]),
             param_grid={'preprocessor__num__pca__n_components': [1, 2, 3]})

In [147]:
grid.best_estimator_.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('pca', PCA(n_components=3)),
                                                    ('standard_scailer',
                                                     StandardScaler())]),
                                    Index(['x1', 'x2', 'x3', 'x4'], dtype='object'))])),
  ('clf', LinearDiscriminantAnalysis())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('pca', PCA(n_components=3)),
                                                  ('standard_scailer',
                                                   StandardScaler())]),
                                  Index(['x1', 'x2', 'x3', 'x4'], dtype='object'))]),
 'clf': LinearDiscriminantAnalysis(),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'drop',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transf

In [148]:
grid.best_params_

{'preprocessor__num__pca__n_components': 3}