### Importamos as bibliotecas

In [None]:
                                                                                import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.naive_bayes import GaussianNB

### Carregamos o dataset
**Renomeamos as colunas**

In [None]:
# Names assigned to the columns of the data file, in file order; the last
# one ("class") holds the label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
# Passing `names` makes pandas treat every row as data (same as header=None).
df = pd.read_csv("magic04.data", header=None, names=cols)
df.head()

### Transformamos a variável categórica em numérica

In [None]:
# Binary-encode the label: rows whose class is "g" become 1, all others 0.
df["class"] = df["class"].eq("g").astype(int)

In [None]:
# Show the first five rows to check the encoded "class" column.
df.head(n=5)

### Visualizamos as distribuições 

**Criamos uma matriz de gráficos de dispersão:**

In [None]:
import seaborn as sns

# Work on a copy of every column except the trailing "class" label.
dados = df[df.columns[:-1]].copy()
# Scatter-plot matrix of all feature pairs.
sns.pairplot(dados)

**Calculamos a matriz de correlação:**

In [None]:
# Pairwise correlation between the feature columns (label excluded).
corr = df.drop(columns="class").corr()
# Annotated heatmap with both axes labelled by the feature names.
sns.heatmap(
    data=corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=0.5,
    annot=True,
)

### Separação do dataset em treinamento, validação e teste

In [None]:
import warnings
# Kept from the original cell: FutureWarnings stay silenced for the rest of
# the notebook, so later cells behave as before.
warnings.filterwarnings('ignore', category=FutureWarning)

# Shuffle all rows once, then cut the shuffled frame at 60% and 80% to get
# the train / validation / test partitions (60% / 20% / 20%).
# Positional slicing replaces np.split(DataFrame, ...), which goes through
# the deprecated DataFrame.swapaxes path in recent pandas releases.
_shuffled = df.sample(frac=1)
_train_end = int(0.6*len(df))
_valid_end = int(0.8*len(df))

train = _shuffled.iloc[:_train_end]
valid = _shuffled.iloc[_train_end:_valid_end]
test = _shuffled.iloc[_valid_end:]

### Conferimos o balanceamento dos dados

In [None]:
# Number of training rows labelled 1.
print(int((train["class"] == 1).sum()))

In [None]:
# Number of training rows labelled 0.
print(int((train["class"] == 0).sum()))

### Padronização e balanceamento dos dados

Além da padronização dos dados, utilizamos o método *RandomOverSampler* para reamostragem dos dados desbalanceados.

In [None]:
    def scale_dataset(dataframe, oversample=False):
      X = dataframe[dataframe.columns[:-1]].values
      y = dataframe[dataframe.columns[-1]].values
    
      scaler = StandardScaler()
      X = scaler.fit_transform(X)
    
      if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)
    
      data = np.hstack((X, np.reshape(y, (-1, 1))))
    
      return data, X, y

In [None]:
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# k-nearest-neighbours classifier using 5 neighbours, fitted on the training set.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the validation features with the fitted KNN model.
y_valid_pred = knn_model.predict(X=X_valid)

In [None]:
# Per-class precision / recall / F1 of the KNN predictions on validation data.
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes fitted on the training set (fit() returns the estimator).
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Validation predictions and classification report for Naive Bayes.
y_valid_pred = nb_model.predict(X=X_valid)
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

In [None]:
# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training set.
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Validation predictions and classification report for logistic regression.
y_valid_pred = lg_model.predict(X=X_valid)
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, fitted on the training set.
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Validation predictions and classification report for the SVM.
y_valid_pred = svm_model.predict(X=X_valid)
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Second logistic-regression model (default settings), fitted on the training set.
logreg_model = LogisticRegression()
logreg_model.fit(X=X_train, y=y_train)

In [None]:
# Validation predictions and classification report for logreg_model.
y_valid_pred = logreg_model.predict(X=X_valid)
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees, fitted on the training set.
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Validation predictions and classification report for the random forest.
y_valid_pred = rf_model.predict(X=X_valid)
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost ensemble of 50 estimators, fitted on the training set.
ada_model = AdaBoostClassifier(n_estimators=50)
ada_model.fit(X=X_train, y=y_train)

In [None]:
# Validation predictions and classification report for AdaBoost.
y_valid_pred = ada_model.predict(X=X_valid)
print(classification_report(y_true=y_valid, y_pred=y_valid_pred))

### Escolha do modelo para teste

In [None]:
# Final check: score the chosen model (random forest) on the held-out test set.
y_test_pred = rf_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))