# Task 1 - Dominik Wiśniewski

According to the task description, this is a classification problem with three different classes.

My plan for this task is as follows: I will first check the data for duplicates, outliers and missing values in the records. Then, without any further processing of the data, I will train an artificial neural network and evaluate its effectiveness. Next, I will use simpler models to speed up training; after assessing these models, I will process the data and try to improve all the metrics used in this task (precision, recall, and F1 score).

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# load file
# The CSV uses '|' as its field separator.
dataset = pd.read_csv('Graduate - IRISES dataset (2019-06).csv', sep="|")
# Print the loaded frame for a first visual check.
print(dataset)

In [None]:
# Summarise the columns: dtypes and non-null counts.
dataset.info()

In [None]:
# List the column labels of the loaded frame.
dataset.columns

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

In [None]:
# Discard every row that contains at least one missing value.
dataset = dataset.dropna()

In [None]:
# Remove exact duplicate rows and report the row count before and after.
# The default keep='first' retains one copy of each duplicated record;
# keep=False would also throw away the original row, losing valid data.
# Re-assigning (instead of inplace=True) avoids mutating the frame returned
# by the earlier dropna() call in place.
print(len(dataset))
dataset = dataset.drop_duplicates()
print(len(dataset))

In [None]:
# (rows, columns) of the cleaned frame.
dataset.shape

In [None]:
# Preview the first rows.
dataset.head()

In [None]:
# Preview the last rows.
dataset.tail()

In [None]:
# Descriptive statistics of the columns.
dataset.describe()

In [None]:
# Pairwise correlation of the numeric feature columns.
# The string 'Species' column is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently in corr() and raises instead.
dataset.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the feature correlation matrix.
f, ax = plt.subplots(figsize=(10, 10))
# Only numeric columns are correlated; the string 'Species' column would make
# corr() raise on pandas >= 2.0.
numeric_corr = dataset.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=.4, fmt='.1f', ax=ax)
plt.show()

In [None]:
# Bar chart of the number of records per species, used to check class balance.
# (Polish labels: title = "class balance", y label = "count".)
species_counts = dataset['Species'].value_counts()
species_counts.plot(kind='bar')
plt.title("Zrównoważenie klas")
plt.xlabel("Species")
plt.ylabel("Ilość")
plt.show()
plt.close()

In [None]:
# Grid of pairwise plots over the dataset columns.
sns.pairplot(dataset)

The dataset had one record with a missing value, which is insignificant against the background of 149 records, so instead of filling in the missing data with the median or mean of the remaining values, the record was simply deleted. There were also two duplicates, which were removed. One record used a comma instead of a dot as the decimal separator; since it was only a single record, I corrected it manually instead of using the replace() method. Some of the features are strongly correlated, which can slow down and hinder the learning of some models. The classes are balanced, so no downsampling or upsampling is needed.

The next step is training a machine learning model. The first model will be an artificial neural network, because of its high ability to generalize and to deal with correlated data. In most problems related to function approximation, one hidden layer is sufficient to approximate discrete labels (Basheer and Hajmeer, 2000).

In [None]:
# helping mapping function to make one hot encoding
def map_labels(labels: np.array) -> list:
    """
    One-hot encode iris species names.

    'setosa' becomes [1, 0, 0], 'versicolor' becomes [0, 1, 0] and every
    other value becomes [0, 0, 1].

    :param labels: iterable of species names
    :return: list with one 3-element numpy array per input label
    """
    encoded = []
    for species in labels:
        if species == 'setosa':
            one_hot = np.array([1, 0, 0])
        elif species == 'versicolor':
            one_hot = np.array([0, 1, 0])
        else:
            # Anything that is not one of the two names above falls here.
            one_hot = np.array([0, 0, 1])
        encoded.append(one_hot)
    return encoded

# Seed shared by the row shuffle and the train/test split, and the fraction
# of rows held out for testing.
seed = 1234
test_size = 0.5

# preprocessing data
train_array = dataset.values
# Shuffle the rows with a dedicated, seeded generator. An unseeded
# np.random.shuffle would yield a different row order on every run, so the
# fixed random_state passed to train_test_split below would no longer give a
# reproducible split (and a model loaded from disk could be scored on rows it
# was trained on in an earlier run).
np.random.RandomState(seed).shuffle(train_array)

# The first four columns are used as features, the fifth as the class label.
X = train_array[:, 0:4].astype(float)
Y = train_array[:, 4]

# Convert the string labels to float vectors via map_labels.
Y = np.array(map_labels(Y)).astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [None]:
# defining model
# Fully connected network: one hidden ReLU layer with 8 units followed by a
# 3-unit softmax output layer.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(8, activation=tf.nn.relu),
    tf.keras.layers.Dense(3, activation=tf.nn.softmax),
])
# Categorical cross-entropy expects the one-hot label vectors produced above.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Load a previously saved network if one exists at this path, otherwise train
# the freshly compiled model; then report loss and accuracy on the test split.
model_name = 'neural_network_model'
if os.path.exists(model_name):
    # NOTE(review): no save call for this model is visible in the notebook, so
    # this branch is only taken when the model was saved by other means.
    model = tf.keras.models.load_model(model_name)
else:
    # workers / use_multiprocessing are omitted: Keras documents them as used
    # for generator or Sequence inputs only (the inputs here are NumPy
    # arrays), and newer Keras releases no longer accept them in fit().
    model.fit(X_train, Y_train, batch_size=8, epochs=100, verbose=0)

val_loss, val_acc = model.evaluate(X_test, Y_test, verbose=0)

print(f"Accuracy: {val_acc}, loss: {val_loss}")

The neural network model achieved very high results when the set was split into training and test parts in a 50%-50% ratio, and it did not require further work on the data. However, neural network models sometimes consume many times more resources than simpler models, so it is worth checking how other models perform, at first with their default parameters.

In [None]:
def map_labels(labels):
    """
    Map iris species names to numeric class codes.

    'setosa' becomes 0.0, 'versicolor' becomes 1.0 and every other value
    becomes 2.0.

    :param labels: iterable of species names
    :return: list of float class codes, one per input label
    """
    codes = []
    for species in labels:
        if species == 'setosa':
            codes.append(0.0)
        elif species == 'versicolor':
            codes.append(1.0)
        else:
            # Anything that is not one of the two names above falls here.
            codes.append(2.0)
    return codes

# Seed shared by the row shuffle and the train/test split, and the fraction
# of rows held out for testing.
seed = 1234
test_size = 0.5

# preprocessing data
train_array = dataset.values
# Shuffle the rows with a dedicated, seeded generator. An unseeded
# np.random.shuffle would yield a different row order on every run, so the
# fixed random_state passed to train_test_split below would no longer give a
# reproducible split (and a model loaded from the on-disk cache could be
# scored on rows it was trained on in an earlier run).
np.random.RandomState(seed).shuffle(train_array)

# The first four columns are used as features, the fifth as the class label.
X = train_array[:, 0:4].astype(float)
Y = train_array[:, 4]

# Convert the string labels to a float array via map_labels.
Y = np.array(map_labels(Y)).astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [None]:
# One instance of each classical classifier under comparison, all built with
# their default constructor arguments.
models = [
    estimator_class()
    for estimator_class in (
        LogisticRegression,
        DecisionTreeClassifier,
        KNeighborsClassifier,
        GaussianNB,
        RandomForestClassifier,
        GradientBoostingClassifier,
        SVC,
    )
]

In [None]:
# Train (or load from cache) every default-parameter model on the full feature
# set, score it on the test split and plot one bar chart per metric.

# Directory holding the cached models. np.save does not create missing
# directories, so make sure it exists before the first save.
cache_dir = 'Default'
os.makedirs(cache_dir, exist_ok=True)

# One dict of scores per model; turned into a DataFrame after the loop
# (DataFrame.append was removed in pandas 2.0).
score_rows = []

for model in models:
    # The estimator's class name labels the result row and the cache file.
    model_name = type(model).__name__

    print(f"Calculating {model_name} model")

    print("Start training model...")
    cache_path = f"{cache_dir}/{model_name}_model.npy"
    cached_model = None
    if os.path.exists(cache_path):
        try:
            # NOTE(review): allow_pickle=True unpickles arbitrary objects;
            # only load cache files this notebook wrote itself.
            cached_model = np.load(cache_path, allow_pickle=True).item()
        except ValueError:
            # Unreadable cache file: fall through and retrain below.
            cached_model = None

    if cached_model is None:
        model.fit(X_train, Y_train)
        np.save(cache_path, model)
    else:
        model = cached_model

    print("Calculating scores....")
    # Predict once and reuse the result for all four metrics.
    predictions = model.predict(X_test)
    scores = {
        'Model': model_name,
        'Accuracy': accuracy_score(Y_test, predictions),
        'Precision': precision_score(Y_test, predictions, average='weighted'),
        'Recall': recall_score(Y_test, predictions, average='weighted'),
        'F1_Score': f1_score(Y_test, predictions, average='weighted'),
    }

    print(f"Accuracy: {scores['Accuracy']}\nPrecision: {scores['Precision']}\nRecall: {scores['Recall']}\nF1 Score: {scores['F1_Score']}")

    score_rows.append(scores)

TestModels = pd.DataFrame(score_rows).set_index('Model')

# One bar chart per metric, with one bar per model.
for metric in ('Accuracy', 'Precision', 'Recall', 'F1_Score'):
    fig, axes = plt.subplots(ncols=1, figsize=(10, 4))
    TestModels[metric].plot(ax=axes, kind='bar', title=metric)
    plt.show()

Simpler machine learning models have also done well according to each of the metrics used. However, their effectiveness can be further improved. I will start by reducing dimensions.

In [None]:
# Reduce the features to 2 principal components.
# Split first and fit PCA on the training rows only, so that no information
# from the test rows leaks into the projection. With the same X, test size
# and random_state as the earlier split, train_test_split selects the same
# rows again, so X_train/X_test/Y_train/Y_test keep their previous contents.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=seed)

pca_model = PCA(n_components=2)
pca_model.fit(X_train)
X2D_train = pca_model.transform(X_train)
X2D_test = pca_model.transform(X_test)

# Projection of the complete feature matrix, kept under its original name.
X_2D = pca_model.transform(X)

In [None]:
# Train (or load from cache) every default-parameter model on the
# 2-component PCA features, score it on the test split and plot one bar chart
# per metric.

# Directory holding the cached models. np.save does not create missing
# directories, so make sure it exists before the first save.
cache_dir = 'DefPCA'
os.makedirs(cache_dir, exist_ok=True)

# One dict of scores per model; turned into a DataFrame after the loop
# (DataFrame.append was removed in pandas 2.0).
score_rows = []

for model in models:
    # The estimator's class name labels the result row and the cache file.
    model_name = type(model).__name__

    print(f"Calculating {model_name} model")

    print("Start training model...")
    cache_path = f"{cache_dir}/{model_name}_model.npy"
    cached_model = None
    if os.path.exists(cache_path):
        try:
            # NOTE(review): allow_pickle=True unpickles arbitrary objects;
            # only load cache files this notebook wrote itself.
            cached_model = np.load(cache_path, allow_pickle=True).item()
        except ValueError:
            # Unreadable cache file: fall through and retrain below.
            cached_model = None

    if cached_model is None:
        model.fit(X2D_train, Y_train)
        np.save(cache_path, model)
    else:
        model = cached_model

    print("Calculating scores....")
    # Predict once and reuse the result for all four metrics.
    predictions = model.predict(X2D_test)
    scores = {
        'Model': model_name,
        'Accuracy': accuracy_score(Y_test, predictions),
        'Precision': precision_score(Y_test, predictions, average='weighted'),
        'Recall': recall_score(Y_test, predictions, average='weighted'),
        'F1_Score': f1_score(Y_test, predictions, average='weighted'),
    }

    print(f"Accuracy: {scores['Accuracy']}\nPrecision: {scores['Precision']}\nRecall: {scores['Recall']}\nF1 Score: {scores['F1_Score']}")

    score_rows.append(scores)

TestModels = pd.DataFrame(score_rows).set_index('Model')

# One bar chart per metric, with one bar per model.
for metric in ('Accuracy', 'Precision', 'Recall', 'F1_Score'):
    fig, axes = plt.subplots(ncols=1, figsize=(10, 4))
    TestModels[metric].plot(ax=axes, kind='bar', title=metric)
    plt.show()

Reducing the data to 2 dimensions accelerated training, but at the same time it reduced the effectiveness of some models. Their effectiveness can be increased further by optimizing the hyperparameters of the individual models or by scaling the data, for example to the range 0-1. I decided to limit myself to automating the process of optimizing hyperparameters.

In [None]:
# Search space for the hyperparameter optimisation. Each entry holds a display
# name ('model_name'), the estimator class ('model', not an instance) and the
# parameter grid ('params') to search for that estimator.
models_with_parameters = [
    dict(
        model_name='Logistic Regression',
        model=LogisticRegression,
        params=dict(
            C=[1, 3, 5, 7, 9],
            solver=('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
            n_jobs=[4],
        ),
    ),
    dict(
        model_name='Decision Tree',
        model=DecisionTreeClassifier,
        params=dict(
            # np.arange(2, 4) yields the depths 2 and 3.
            max_depth=np.arange(2, 4),
        ),
    ),
    dict(
        model_name='KNN',
        model=KNeighborsClassifier,
        params=dict(
            n_neighbors=[5, 10, 15, 20, 25, 30],
            weights=('uniform', 'distance'),
            algorithm=('ball_tree', 'kd_tree', 'brute'),
        ),
    ),
    dict(
        model_name='GBM',
        model=GradientBoostingClassifier,
        params=dict(
            n_estimators=[100],
        ),
    ),
    dict(
        model_name='Bayes',
        model=GaussianNB,
        params=dict(
            var_smoothing=[1e-7, 1e-8, 1e-9, 1e-10, 1e-11],
        ),
    ),
    dict(
        model_name='RandomForest',
        model=RandomForestClassifier,
        params=dict(
            n_estimators=[100],
        ),
    ),
    dict(
        model_name='SVM',
        model=SVC,
        params=dict(
            kernel=('linear', 'rbf', 'poly'),
            gamma=[0.0001, 0.001, 0.01],
            C=[1, 3, 5, 7, 9],
        ),
    ),
]

In [None]:
# Tune every model in models_with_parameters with a grid search on the
# 2-component PCA features (or load a cached tuned model), score it on the
# test split and plot one bar chart per metric.

# Directory holding the cached models. np.save does not create missing
# directories, so make sure it exists before the first save.
cache_dir = 'ParamModels'
os.makedirs(cache_dir, exist_ok=True)

# These two models are always re-tuned, even when a cache file exists.
always_retrained = ('GBM', 'RandomForest')

# One dict of scores per model; turned into a DataFrame after the loop
# (DataFrame.append was removed in pandas 2.0).
score_rows = []

for pmodel in models_with_parameters:
    model_name = pmodel['model_name']
    model = pmodel['model']()
    parameters = pmodel['params']

    print(f"Calculating {model_name} model")

    print("Start training model...")
    cache_path = f"{cache_dir}/{model_name}_model.npy"
    cached_model = None
    if os.path.exists(cache_path) and model_name not in always_retrained:
        try:
            # NOTE(review): allow_pickle=True unpickles arbitrary objects;
            # only load cache files this notebook wrote itself.
            cached_model = np.load(cache_path, allow_pickle=True).item()
        except ValueError:
            # Unreadable cache file: fall through to a fresh grid search, so
            # this entry is still a tuned model rather than an untuned one.
            cached_model = None

    if cached_model is None:
        # The `iid` argument is not passed: it was removed from GridSearchCV
        # in newer scikit-learn releases.
        classifier = GridSearchCV(model, parameters, cv=2, n_jobs=4)
        classifier.fit(X2D_train, Y_train)

        grided_params = classifier.best_params_

        # With the default refit=True, GridSearchCV has already refitted the
        # best configuration on the whole training split.
        model = classifier.best_estimator_
        np.save(cache_path, model)
    else:
        model = cached_model

    print("Calculating scores....")
    # Predict once and reuse the result for all four metrics.
    predictions = model.predict(X2D_test)
    scores = {
        'Model': model_name,
        'Accuracy': accuracy_score(Y_test, predictions),
        'Precision': precision_score(Y_test, predictions, average='weighted'),
        'Recall': recall_score(Y_test, predictions, average='weighted'),
        'F1_Score': f1_score(Y_test, predictions, average='weighted'),
    }

    print(f"Accuracy: {scores['Accuracy']}\nPrecision: {scores['Precision']}\nRecall: {scores['Recall']}\nF1 Score: {scores['F1_Score']}")

    score_rows.append(scores)

TestModels = pd.DataFrame(score_rows).set_index('Model')

# One bar chart per metric, with one bar per model.
for metric in ('Accuracy', 'Precision', 'Recall', 'F1_Score'):
    fig, axes = plt.subplots(ncols=1, figsize=(10, 4))
    TestModels[metric].plot(ax=axes, kind='bar', title=metric)
    plt.show()

Hyperparameter optimization brought little improvement; the effectiveness of most models stayed the same. None of the classical machine learning models matched the effectiveness of the simple neural network model, but they remained at a satisfactory level, with an F1 score of about 95%.