# Laboratorio 2 - Informe

### Grupo 4:
     - S. Calvo C.I 5.711.417-7     
     - X. Iribarnegaray C.I 5.253.705-9
     - J. Simonelli C.I 5.405.358-4

## 1. Objetivos
Los objetivos de este laboratorio son:
- Implementar el algoritmo Naive Bayes
- Aplicar herramientas de metodología
- Analizar los resultados obtenidos.

## 2. Diseño
### 2.1 Algoritmo

### Explicar Naive Bayes logarítmico y presentar el otro análogo ###

In [11]:
import numpy as np
from naive_bayes import init

dataset, features, continuous_features, target = init()

def naive_bayes(dataset, target, features, instance, m):
    """Classify ``instance`` as 0 or 1 with a log-space Naive Bayes m-estimate.

    The score of class ``c`` is::

        log(n_c / N) + sum over features f of log((n_fc + m / k_f) / (n_c + m))

    where ``N`` is the number of rows in ``dataset``, ``n_c`` the number of
    rows whose ``target`` equals ``c``, ``n_fc`` the number of those rows
    whose value for ``f`` equals ``instance[f]``, and ``k_f`` the number of
    distinct values of ``f`` in ``dataset`` (so ``m / k_f`` is the m-estimate
    with a uniform prior over the feature's values).

    Args:
        dataset: pandas DataFrame holding the feature columns and the
            ``target`` column, with classes encoded as 0 and 1.
        target: name of the class column in ``dataset``.
        features: iterable of column names used as features.
        instance: object indexable by feature name (``instance[feature]``),
            e.g. one row of a DataFrame.
        m: equivalent sample size of the m-estimate.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0. A class with no rows in ``dataset`` has prior 0 and is
        never predicted; if class 1 has no rows (including an empty
        ``dataset``) the result is 0.
    """
    # Class counts do not depend on the feature, so compute them once.
    class_counts = dataset[target].value_counts()
    total_1 = class_counts.get(1, 0)
    total_0 = class_counts.get(0, 0)

    # A class that never occurs has log-prior -inf: decide without log(0).
    if total_1 == 0:
        return 0
    if total_0 == 0:
        return 1

    dataset_size = dataset.shape[0]
    sum_1 = np.log(total_1 / dataset_size)
    sum_0 = np.log(total_0 / dataset_size)
    denominator_1 = total_1 + m
    denominator_0 = total_0 + m

    for feature in features:
        column = dataset[feature]
        examples = dataset.loc[column == instance[feature], target].value_counts()

        # .get returns 0 when no matching row has that target value
        count_1 = examples.get(1, 0)
        count_0 = examples.get(0, 0)

        # m * p with a uniform prior p = 1 / (number of distinct values)
        prior_mass = m / column.nunique()

        sum_1 += np.log((count_1 + prior_mass) / denominator_1)
        sum_0 += np.log((count_0 + prior_mass) / denominator_0)

    # argmax, ties go to class 0
    return 1 if sum_1 > sum_0 else 0


### 2.3 Evaluación

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def test_instances(X_train, y_train, X_test, y_test, features, m):
    """Predict every row of ``X_test`` with ``naive_bayes`` and score the result.

    The training frame is a copy of ``X_train`` with ``y_train`` stored under
    the module-level ``target`` column name.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each multiplied by 100.
        Precision, recall and F1 are computed with ``average=None`` and
        ``zero_division=1``.
    """
    training_frame = X_train.copy()
    training_frame[target] = y_train

    # One prediction per test row, in positional order
    predictions = [
        naive_bayes(training_frame, target, features, X_test.iloc[row], m)
        for row in range(0, X_test.shape[0])
    ]

    per_class = {'average': None, 'zero_division': 1}
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, **per_class)
    recall = recall_score(y_test, predictions, **per_class)
    f1 = f1_score(y_test, predictions, **per_class)

    return accuracy * 100, precision * 100, recall * 100, f1 * 100

## 3. Experimentación

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import classification_report, confusion_matrix

# Discretize the continuous features into 50 ordinal bins chosen by k-means.
# NOTE(review): the discretizer is fitted on the whole dataset, before the
# train/test split below, so the bin edges also depend on the test rows.
kbins = KBinsDiscretizer(n_bins=50, encode='ordinal', strategy='kmeans')
dataset[continuous_features] = kbins.fit_transform(dataset[continuous_features])

# Prepare X and y
X = dataset.drop([target, 'pidnum'], axis=1)
y = dataset[target]

# Split data into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# List of m values
m_values = [1, 10, 100, 1000]

# Training features plus the target column. This does not depend on m,
# so it is built once instead of once per iteration.
train_ds = X_train.copy()
train_ds[target] = y_train

# Evaluate the classifier on the test set for each m
for m in m_values:
    # One prediction per test row, in positional order
    y_pred = [
        naive_bayes(train_ds, target, features, X_test.iloc[j], m)
        for j in range(X_test.shape[0])
    ]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average=None)

    # One row per class (0 and 1)
    results = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])
    results.loc[0] = {'Precision': precision[0], 'Recall': recall[0], 'F1': f1[0]}
    results.loc[1] = {'Precision': precision[1], 'Recall': recall[1], 'F1': f1[1]}

    print(f"m = {m}, Accuracy = {accuracy}\n")
    print(f"{results}\n")
    # The recorded output of this cell includes the confusion matrix per m
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n\n")



m = 1, Accuracy = 0.8551401869158879

   Precision    Recall        F1
0   0.878338  0.933754  0.905199
1   0.769231  0.630631  0.693069

Confusion Matrix:
[[296  21]
 [ 41  70]]


m = 10, Accuracy = 0.8598130841121495

   Precision    Recall        F1
0   0.885886  0.930599  0.907692
1   0.768421  0.657658  0.708738

Confusion Matrix:
[[295  22]
 [ 38  73]]


m = 100, Accuracy = 0.8411214953271028

   Precision    Recall        F1
0   0.867257  0.927445  0.896341
1   0.741573  0.594595  0.660000

Confusion Matrix:
[[294  23]
 [ 45  66]]


m = 1000, Accuracy = 0.7453271028037384

   Precision    Recall        F1
0   0.745283  0.996845  0.852901
1   0.750000  0.027027  0.052174

Confusion Matrix:
[[316   1]
 [108   3]]




In [36]:

from matplotlib import pyplot as plt
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, cross_validate

class CustomNaiveBayes(BaseEstimator, ClassifierMixin):
    """Estimator wrapper that delegates each prediction to ``naive_bayes``.

    Parameters:
        features: column names passed to ``naive_bayes`` as the feature list.
        m: equivalent sample size passed to ``naive_bayes``.

    The class column name is read from the module-level ``target`` variable.
    """

    def __init__(self, features, m):
        self.m = m
        self.features = features

    def fit(self, X_train, y_train):
        """Store copies of the training data; no model is precomputed.

        ``self.X_train`` ends up holding the feature columns plus ``y_train``
        under the ``target`` column; ``self.classes_`` holds the unique labels.
        Returns ``self``.
        """
        self.classes_ = np.unique(y_train)

        training_frame = X_train.copy()
        training_frame[target] = y_train

        self.y_train = y_train.copy()
        self.X_train = training_frame
        return self

    def predict(self, X_test):
        """Return a NumPy array with one ``naive_bayes`` prediction per row."""
        predictions = [
            naive_bayes(self.X_train, target, self.features, X_test.iloc[row], self.m)
            for row in range(0, X_test.shape[0])
        ]
        return np.array(predictions)

    def __sklearn_clone__(self):
        # NOTE(review): returns this same instance rather than a fresh,
        # unfitted copy, so anything that clones the estimator shares (and
        # refits) this object. Confirm this is intended.
        return self

# One row per value of k: mean 5-fold cross-validated scores
results = pd.DataFrame(columns=['k', 'Accuracy', 'Precision', 'Recall', 'F1'])

k_range = [5, 10, 15, 20, 23]

# NOTE: this dict is not passed to cross_validate below, which uses the
# weighted-average scorer names instead.
scoring = {
    'accuracy': 'accuracy',                             # Accuracy score
    'precision_micro': make_scorer(precision_score, average='micro'),  # micro precision
    'recall_micro': make_scorer(recall_score, average='micro'),        # micro recall
    'f1_micro': make_scorer(f1_score, average='micro')                 # micro F1 score
}

for k in k_range:
    # Pick the k best features by chi-squared score, fitted on the training split
    selector = SelectKBest(chi2, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    selected_features = X_train.columns[selector.get_support()]
    model = CustomNaiveBayes(selected_features, m=10)
    X_selected = X[selected_features]

    # Cross-validation runs over the full X / y restricted to the selected columns
    scores = cross_validate(model, X_selected, y, cv=5, scoring=['accuracy', 'precision_weighted', 'recall_weighted','f1_weighted'] )

    results.loc[results.shape[0]] = [k, scores['test_accuracy'].mean(), scores['test_precision_weighted'].mean(), scores['test_recall_weighted'].mean(), scores['test_f1_weighted'].mean()]

    # Confusion matrix on the held-out split: fit on the training rows and
    # predict the test rows so the predictions line up with y_test.
    model.fit(X_train[selected_features], y_train)
    test_predictions = model.predict(X_test[selected_features])
    print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, test_predictions)}\n\n")
print(results)
    
    # # Generate a sequence of x values corresponding to the number of features selected
    # x_values = range(1, 24)  # From 1 to 23 features

    # # Plot the accuracy values
    # plt.plot(x_values, acc, marker='.', linestyle='-', color='b')

    # # Add labels and title
    # plt.xlabel('Number of Selected Features')
    # plt.ylabel('Cross-Validated F1 Score')
    # plt.title('F1 Score vs. Number of Selected Features')

    # # Display the plot
    # plt.show()

      k  Accuracy  Precision    Recall        F1
0   5.0  0.845243   0.841391  0.845243  0.842614
1  10.0  0.848975   0.845148  0.848975  0.846223
2  15.0  0.847574   0.845527  0.847574  0.846119
3  20.0  0.854585   0.851199  0.854585  0.852014
4  23.0  0.849910   0.846376  0.849910  0.847258
