# Mushroom Classification Pre Processing

## Load the requirements

In [None]:
# run the requirements.txt file for install the necessary packages
pip install -r mushroom_requirements.txt

## Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import collections

from sklearn import decomposition, svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import f_classif, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from pingouin import multivariate_normality
from factor_analyzer import FactorAnalyzer

#other libraries
import math
from collections import Counter

## Palette Colour for Plot

In [None]:

# Colours handed to seaborn through the `palette` argument of the plots
palette_features = [
    '#E68753',
    '#409996',
]

## Load Data

In [None]:
# Load the provided dataset
data = # YOUR CODE HERE


In [None]:
# Print the dimensions of the data

# YOUR CODE HERE

In [None]:
# Print a summary of the data showing the column names and data types, as in the image below

# YOUR CODE HERE

![image.png](attachment:image.png)

In [None]:
# Check whether the data contains missing values
missing_value = ['#NUM!', np.nan, 'NA']

# Read the data again, this time passing na_values=missing_value to pd.read_csv
# YOUR CODE HERE

data.isnull().sum()

In [None]:
# Remove every row that contains a missing value.
# No `subset` is needed: a row with a missing value in any column is exactly
# a row with a missing value in one of the affected columns, so this works
# without hard-coding column names.
data.dropna(axis=0, how='any', inplace=True)
# Confirm that no missing values are left in any column
data.isnull().sum()

In [None]:
# Build the pairwise correlation matrix of the columns
corr = data.corr()

# Render the matrix as an annotated heatmap on a large canvas
plt.figure(figsize=(20, 12))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="BrBG",
)
plt.show()

In [None]:
# KMO Test
def kmo(dataset_corr):
    """Compute the Kaiser-Meyer-Olkin (KMO) measure from a correlation matrix.

    The measure compares the squared off-diagonal correlations with the
    squared off-diagonal partial correlations, the latter being derived from
    the inverse of the correlation matrix.

    Parameters
    ----------
    dataset_corr : array-like of shape (p, p)
        Correlation matrix of the variables (for example the result of
        ``DataFrame.corr()``, a NumPy array or a nested list). It must be
        invertible.

    Returns
    -------
    KMO_Test_Results
        Named tuple with two fields:

        * ``value`` - the overall KMO value.
        * ``per_variable`` - list with one KMO value per variable, in the
          column order of ``dataset_corr``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the matrix is not square or cannot be inverted.

    Notes
    -----
    When there are no off-diagonal correlations at all the ratio is 0/0 and
    the result is ``nan``.
    """
    # Work on a plain float array so DataFrames, arrays and lists all behave alike
    corr_matrix = np.asarray(dataset_corr, dtype=float)

    # Inverse of the correlation matrix
    corr_inv = np.linalg.inv(corr_matrix)

    # Partial correlation matrix: -inv[i, j] / sqrt(inv[i, i] * inv[j, j])
    inv_diag = np.diagonal(corr_inv)
    partial = -corr_inv / np.sqrt(np.outer(inv_diag, inv_diag))
    # Mirror the upper triangle onto the lower one so the matrix is exactly
    # symmetric even if the inverse carries rounding noise
    partial = np.triu(partial) + np.triu(partial, k=1).T

    # Only off-diagonal entries take part in the measure, so zero the
    # diagonals before summing
    corr_sq = np.square(corr_matrix)
    partial_sq = np.square(partial)
    np.fill_diagonal(corr_sq, 0.0)
    np.fill_diagonal(partial_sq, 0.0)

    # Column sums give the per-variable numerators and partial terms
    corr_by_var = corr_sq.sum(axis=0)
    partial_by_var = partial_sq.sum(axis=0)

    # Overall KMO value
    kmo_num = corr_by_var.sum()
    kmo_value = kmo_num / (kmo_num + partial_by_var.sum())

    # KMO per variable (diagonal of the spss anti-image matrix)
    kmo_j = list(corr_by_var / (corr_by_var + partial_by_var))

    Result = collections.namedtuple("KMO_Test_Results", ["value", "per_variable"])

    # Output of the results - named tuple
    return Result(value=kmo_value, per_variable=kmo_j)

In [None]:
kmo = kmo(corr)

In [None]:
kmo[1]
# berdasarkan hasil dibawah ini, kita akan drop kolom dengan nilai KMO dibawah 0.5
# nilai KMO dibawah urut dari kolom 1 sampai 9
# silahkan cari tau kolom mana yang harus di drop

In [None]:
# Make a copy of the data first

data = # YOUR CODE HERE

# Drop the columns whose KMO value is below 0.5

# YOUR CODE HERE

## Checking Balance of Y Variable

In [None]:
# Plot the distribution of the target variable
sns.set(rc={'axes.facecolor':'#ECECEC'})  # background color of plot
plt.figure(figsize=(12,6))
plt.title("Response Variable", fontsize=15, fontweight='bold', fontname='Arial', ha='center')
ax = sns.countplot(x='class', data=data, hue='class', palette=palette_features)

# Add a count label to each bar.
# The label is read from the bar itself (its height and horizontal centre),
# so every count is attached to the bar it belongs to regardless of the order
# in which the classes are drawn.
for bar in ax.patches:
    count = bar.get_height()
    # Skip patches without a visible bar (zero or NaN height)
    if count > 0:
        ax.text(bar.get_x() + bar.get_width() / 2, count + 0.1, str(int(count)),
                ha='center', va='bottom', fontsize=10)

# Show the plot
plt.show()

## Determine X and Y Variables

In [None]:
# Every column except the target 'class' is used as a feature
columns = [name for name in data.columns if name != 'class']
X = data[columns]
# The target is the 'class' column
y = data['class']

## Train Test Splitting

In [None]:
# Seed NumPy's global random generator
np.random.seed(13)
# Hold out 20 % of the rows as test data and keep 80 % for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
# Display the shape of each of the four parts
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Save each of X_train, X_test, y_train and y_test to its own CSV file

# YOUR CODE HERE