!pip install pandas, numpy, scikit-learn, plotly
1. Import all the necessary libraries
 

In [45]:

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import plotly.express as px
from pycaret.classification import *
from pprint import pprint


This function will clean the data, by that is ment that nan values will be removed and data will be split in X and y

In [46]:
# Cleaning Function
def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.core.series.Series]:
    """
    Clean the data by removing the missing values.
    Args:
        data (pd.DataFrame): dataframe with the data

    Returns:
        tuple[pd.DataFrame, pd.core.series.Series]: tuple with the cleaned data, first element is the dataframe with the variables and the second element is the series with the labels
    """
    data = df.copy()
    data = data.drop(
        13, axis=1
    )  # we are removing column 13 because it has 10k missing values
    data = data.dropna(axis=0)  # we are removing all rows with missing values
    X = data.iloc[:, 0:15]  # we are creating a dataframe with all the variables
    y = data.iloc[:, 15]  # we are creating a series with the labels
    return X, y  # return the cleaned data

2. Import the data

In [47]:
train = pd.read_csv(
    "https://onedrive.live.com/download?resid=4C66E14E953F6D39!9922&authkey=!AJL6R66cYHl1E0Q",
    header=None,
)
# Also I loaded the test data, but i will use it later for testing
test = pd.read_csv(
    "https://onedrive.live.com/download?resid=4C66E14E953F6D39!9921&authkey=!AP0enoDbayF1mbE",
    header=None,
)
X_train, y_train = clean_data(train)
X_test, y_test = clean_data(test)

3. kNN

3.1. Find the best number of neighbors

In [48]:
index = np.arange(1, 20, 2)
test_accuracy = np.array([])
train_accuracy = np.array([])
for i in np.arange(1, 20, 2):
    # 3. Create the model
    model = KNeighborsClassifier(
        n_neighbors=i,
    )

    # 4. Train the model
    model.fit(X_train, y_train)

    # 5. Test the model
    train_accuracy = np.append(train_accuracy, model.score(X_train, y_train))
    test_accuracy = np.append(test_accuracy, model.score(X_test, y_test))
df = pd.DataFrame(
    {
        "Number of neighbors": index,
        "Accuracy on train": train_accuracy,
        "Accuracy on test": test_accuracy,
    }
)
print(df)

   Number of neighbors  Accuracy on train  Accuracy on test
0                    1           1.000000          0.884735
1                    3           0.937410          0.906765
2                    5           0.918490          0.909657
3                    7           0.911323          0.911660
4                    9           0.906355          0.909657
5                   11           0.899570          0.908767
6                   13           0.895461          0.902759
7                   15           0.892403          0.901202
8                   17           0.888963          0.902092
9                   19           0.883421          0.897419


3.2. Plot the results to visualize the best number of neighbors

In [49]:
fig = px.line(
    df, x="Number of neighbors", y=["Accuracy on train", "Accuracy on test"],
    title="layout.hovermode='x'"
)  # create the plot
fig.update_traces(mode="markers+lines", hovertemplate=None)  # add the markers and the lines
fig.update_layout(
    title="Accuracy of the model",
    xaxis_title="Number of neighbors",
    yaxis_title="Accuracy",
    hovermode="x",
)  # add the title and the axis labels
# set the scale of the x axis
fig.show()

3.3 Find the best metric

In [50]:
# Considering that our data constains only numerical values, we can pick metrics that are suitable for numerical data
metrics = ["euclidean", "manhattan", "chebyshev", "minkowski"]
out = []
for metric in metrics:
    model = KNeighborsClassifier(n_neighbors=7, metric=metric, n_jobs=-1)
    model.fit(X_train, y_train)
    out.append((metric, model.score(X_test, y_test)))
# sort the list by the accuracy
out.sort(key=lambda x: x[1], reverse=True)
pprint(out)

[('manhattan', 0.9127725856697819),
 ('euclidean', 0.9116599910992434),
 ('minkowski', 0.9116599910992434),
 ('chebyshev', 0.8851802403204272)]


3.4 Create the best model

In [51]:
best_knn = KNeighborsClassifier(n_neighbors=7, metric="manhattan", n_jobs=-1)
best_knn.fit(X_train, y_train)
pred = best_knn.predict(X_test)

3.5 Print the results of the best model


In [52]:
print(
f"""
Accuracy: {accuracy_score(pred, y_test)}
Precision: {precision_score(pred, y_test)}
Recall: {recall_score(pred, y_test)}
F1: {f1_score(pred, y_test)}
"""
)


Accuracy: 0.9127725856697819
Precision: 0.9399193548387097
Recall: 0.9055944055944056
F1: 0.922437673130194



4. Caret

In [63]:
# prepare the train and test data for pycaret
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
# convvert the last column to categorical
train.iloc[:, -1] = train.iloc[:, -1].astype("category")
test.iloc[:, -1] = test.iloc[:, -1].astype("category")

0      float64
1      float64
2      float64
3      float64
4      float64
5      float64
6      float64
7      float64
8      float64
9      float64
10     float64
11     float64
12     float64
14     float64
15     float64
16    category
dtype: object


In pycaret everything works around the concept of experiment
so as first step we will create this experimetn
in experiment is mandatory to specify the data, target value and the categorical features
In order to specify the categorical features we have to create a list with column names that are categorical,
then in the setup to write the parameter categorical_features = list_of_categorical_features

In [68]:
experiment = setup(
    data=train,  # the dataframe with the data
    target=15  # the last column is the target
    # categorical_features= [col1, col3, col5]  # columns with categorical data
)  # setup the experiment

Unnamed: 0,Description,Value
0,Session id,1632
1,Target,16
2,Target type,Binary
3,Target mapping,"1: 0, 2: 1"
4,Original data shape,"(10465, 16)"
5,Transformed data shape,"(10465, 16)"
6,Transformed train set shape,"(7325, 16)"
7,Transformed test set shape,"(3140, 16)"
8,Numeric features,15
9,Preprocess,True


In [69]:
best_model = compare_models()  # in this step we are comparing all the models for classification and we are picking the best one
# we will get a small table with all the models used and the all the metrics for each model
# the output is really wierd because the RF, DT and ada have very shitty metrics, considerably lower then the output we had when did it alone


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.8457,0.9186,0.7973,0.8503,0.8228,0.6865,0.6877,0.17
et,Extra Trees Classifier,0.839,0.9206,0.748,0.8759,0.8067,0.6703,0.6766,0.552
xgboost,Extreme Gradient Boosting,0.8265,0.909,0.7818,0.8232,0.8018,0.6477,0.6486,0.748
lightgbm,Light Gradient Boosting Machine,0.8258,0.904,0.7684,0.8314,0.7984,0.6455,0.6473,0.227
rf,Random Forest Classifier,0.8242,0.9045,0.7416,0.8481,0.791,0.6405,0.6451,0.937
gbc,Gradient Boosting Classifier,0.7636,0.8424,0.6696,0.7738,0.7176,0.5161,0.5204,1.473
dt,Decision Tree Classifier,0.7311,0.7286,0.7049,0.6995,0.7019,0.4569,0.4573,0.155
ada,Ada Boost Classifier,0.7122,0.7797,0.6362,0.6973,0.6651,0.4137,0.4153,0.453
lr,Logistic Regression,0.6164,0.6507,0.4474,0.5974,0.5116,0.2063,0.2122,0.879
lda,Linear Discriminant Analysis,0.6146,0.652,0.4386,0.5966,0.5054,0.2016,0.2081,0.143


In [70]:
# in case we dont like the output, we can tune the model
best_model = tune_model(best_model, optimize="Accuracy")  # we are tuning the model to get the best accuracy

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8445,0.9362,0.7933,0.8502,0.8208,0.6837,0.685
1,0.8718,0.9447,0.7903,0.9123,0.8469,0.7375,0.7432
2,0.8363,0.9221,0.7812,0.8426,0.8107,0.6669,0.6683
3,0.8581,0.9339,0.7872,0.884,0.8328,0.7103,0.7139
4,0.8622,0.9388,0.8085,0.875,0.8404,0.7195,0.7213
5,0.8402,0.9211,0.7781,0.8533,0.814,0.6744,0.6766
6,0.8648,0.9352,0.8116,0.8783,0.8436,0.7248,0.7266
7,0.8415,0.9278,0.7629,0.8685,0.8123,0.6762,0.6804
8,0.862,0.9359,0.8116,0.8725,0.8409,0.7194,0.7209
9,0.8525,0.9348,0.7812,0.8771,0.8264,0.6988,0.7024


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8445,0.9362,0.7933,0.8502,0.8208,0.6837,0.685
1,0.8718,0.9447,0.7903,0.9123,0.8469,0.7375,0.7432
2,0.8363,0.9221,0.7812,0.8426,0.8107,0.6669,0.6683
3,0.8581,0.9339,0.7872,0.884,0.8328,0.7103,0.7139
4,0.8622,0.9388,0.8085,0.875,0.8404,0.7195,0.7213
5,0.8402,0.9211,0.7781,0.8533,0.814,0.6744,0.6766
6,0.8648,0.9352,0.8116,0.8783,0.8436,0.7248,0.7266
7,0.8415,0.9278,0.7629,0.8685,0.8123,0.6762,0.6804
8,0.862,0.9359,0.8116,0.8725,0.8409,0.7194,0.7209
9,0.8525,0.9348,0.7812,0.8771,0.8264,0.6988,0.7024


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [79]:
pred = predict_model(best_model, test)

In [88]:
# print the metricst
print(f"""
Accuracy: {accuracy_score(pred['16'], pred['prediction_label'])}
Recall: {recall_score(pred['16'], pred['prediction_label'])}
Precision: {precision_score(pred['16'], pred['prediction_label'])}
F1: {f1_score(pred['16'], pred['prediction_label'])}
""")


Accuracy: 0.8969737427681352
Recall: 0.9254032258064516
Precision: 0.8919549164399534
F1: 0.90837126459529



In [89]:
# also to save the model
save_model(model=best_model, model_name="best_model")
# and if we want to load it
model = load_model("best_model")
# and if we want to use it
model.predict(X_test)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


0       1
1       2
2       2
3       1
4       1
       ..
4489    2
4490    2
4491    2
4492    1
4493    1
Name: 16, Length: 4494, dtype: int64