<a href="https://colab.research.google.com/github/ehsan74814/Preprocessing_Data/blob/main/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive at /content/drive; the dataset path read below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# The UCI .data file has no header row, so header=None keeps the first record as
# data instead of letting pandas consume it as column labels. Columns get
# positional integer labels here and are named in a later cell.
df = pd.read_csv(
    "/content/drive/MyDrive/Training ML/naiveBayes/agaricus-lepiota.data",
    header=None,
)

In [None]:
# Show the column labels currently on the frame.
df.columns

In [6]:
# Column names for the mushroom dataset: the target ('class') followed by the
# 22 attributes, in file order. Names are hyphen-separated with no embedded
# spaces so they can be referenced reliably later on.
columns_names = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

In [7]:
# Replace the frame's column labels with the names listed in columns_names.
df.columns = columns_names

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

# preprocessing data

## A) Removing low-value columns

### 1) removing columns with zero variance

In [9]:
# Find the columns that hold at most one distinct value and drop them.
distinct_counts = df.nunique()
columns_to_drop = distinct_counts.index[distinct_counts <= 1]
df = df.drop(columns=columns_to_drop)

print(f"Removing columns : {columns_to_drop}")

Removing columns : Index(['veil-type'], dtype='object')


### 2) Removing columns with a large number of missing values

In [10]:
# Missing entries are marked with '?': compute, per column, the percentage of
# rows carrying that marker.
is_missing = df.isin(['?'])
missing_percentage = is_missing.sum() / len(df) * 100

# Columns where more than 30% of the rows are missing get dropped.
columns_to_drop = missing_percentage.index[missing_percentage > 30]
df = df.drop(columns=columns_to_drop)

print(f'Droped columns: {columns_to_drop}')

Droped columns: Index(['stalk-root'], dtype='object')


### 3) Remove columns with high correlation

In [None]:
# calculating the absolute correlation matrix
# DataFrame.corr() needs numeric input, and the columns are only label-encoded
# in a later cell. Each column is therefore mapped to its integer category
# codes for this computation; `df` itself is left unchanged.
# NOTE(review): this assumes every column is categorical at this point - confirm.
corr_matrix = (
    df.apply(lambda col: col.astype('category').cat.codes)
      .corr()
      .abs()
)

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every column: each column is fitted and transformed on its own,
# turning its values into integer codes.
encoder = LabelEncoder()
df_encoded = df.apply(encoder.fit_transform)
df = df_encoded

# Preview the first 5 rows of the encoded dataframe.
print(df.head())

In [None]:
# Pairwise correlation of the encoded columns, drawn as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    cbar=True,
    ax=ax,
)
ax.set_title('correlation heatmap for mushroom dataset')
plt.show()

## Checking for values that are not allowed

In [None]:
# Display the unique values in each column.
# `unique` must be called: without the parentheses the f-string prints the
# bound-method repr instead of the values.
for column in df.columns:
  print(f'Unique values in {column}: {df[column].unique()}')

In [None]:
# Pass the current frame back through the DataFrame constructor.
df = pd.DataFrame(data=df)

In [None]:
# Remove the 'veil-color' feature from the frame.
df = df.drop(columns='veil-color')

# train and test split

In [25]:
from sklearn.model_selection import train_test_split

# Target is the 'class' column; the features are every other column.
y = df['class']
X = df.drop(columns='class')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model


In [None]:
# Build one instance of each Naive Bayes variant, keyed by its class name.
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (GaussianNB, MultinomialNB, BernoulliNB)
}

# Fit each model on the training split and report its accuracy on the test split.
for model_name, model in models.items():
  y_pred = model.fit(X_train, y_train).predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)

  print(f'Accuracy of {model_name}: {accuracy * 100:.2f}%')