# 1- Import required libraries

In [1]:
import sys
sys.path.append('../scripts/')
from data_loading_script import *
from model_training_script import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# 2- Read data

In [2]:
X, y, X_train, X_test, y_train, y_test = load_npy_data(path='../data/cleaned_data',files=['X','y','X_train', 'X_test','y_train', 'y_test'])

In [3]:
(X_train[:10], y_train[:10])

(array([[0.        , 0.09782522, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.01870017, 0.01072927, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.02631762, 0.        , ..., 0.01376593, 0.057903  ,
         0.04811406],
        [0.        , 0.        , 0.0269021 , ..., 0.02452566, 0.        ,
         0.05714733]]),
 array([ 2,  2,  1,  2, 15, 20, 20,  2,  2, 15]))

# 3-  Model Training

In [4]:
models={
    "Decision Tree":DecisionTreeClassifier(),
    "Naive Bayes":MultinomialNB()
}

In [5]:
fit_and_score(models, X_train, X_test, y_train, y_test)

{'Decision Tree': 0.6, 'Naive Bayes': 0.78}

# 4- Hyperparameter Tuning

In [6]:
tree_param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', 1,2 , 4, None],
}
best_tree_model = tune_hyperparameters(models['Decision Tree'], X,  y, tree_param_dist)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [7]:
naive_bayes_param_dist = {
    'alpha': [0.1, 0.5, 1.0],       
    'fit_prior': [True, False]
}
best_naive_bayes_model = tune_hyperparameters(models['Naive Bayes'], X,  y, naive_bayes_param_dist)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [8]:
models={
    "best_naive_bayes_model":best_naive_bayes_model,
    "best_tree_model":best_tree_model
}

In [9]:
fit_and_score(models, X_train, X_test, y_train, y_test)

{'best_naive_bayes_model': 0.8, 'best_tree_model': 0.66}

# 5- Save Models

In [12]:
save_model(best_tree_model, '../models/decision_tree.pkl')
save_model(best_naive_bayes_model, '../models/naive_bayes.pkl')