# <center><img src="figs/LogoUFSCar.jpg" alt="Logo UFSCar" width="110" align="left"/><br/>Universidade Federal de São Carlos (UFSCar)<br/><font size="4">Departamento de Computação, campus Sorocaba</font></center>

<font size="4"><center><b>Disciplina: Aprendizado de Máquina</b></center></font>
  
<font size="3"><center>Prof. Dr. Tiago A. Almeida</center></font>

## <center>Projeto Final</center>

**Aluno**: Eduardo Garcia do Nascimento

**RA/CPF**: 22008732800


---
### Análise exploratória

Nesta seção, deve ser feita a leitura da base de dados e todas as análises necessárias para entendê-la melhor, tais como:
* Significado de cada atributo
* Medidas descritivas
* Gráficos

In [None]:
# -*- coding: utf-8 -*-

# Caminho dos arquivos
FILES_DIRECTORY = "data"

import numpy as np 
import pandas as pd 
import os
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from scripts import utils

if __name__ == '__main__':
    # Exploratory-analysis cell.
    #
    # Loads the three attribute tables, merges and cleans them, attaches the
    # training labels, keeps the 6 best-scoring attributes and plots the
    # result. Names consumed by later cells: input_dataset, backup_dataset,
    # mask (feature-selection mask plus the 'classe' column), test_dataset,
    # K (attribute matrix of the test ids), X and y (labelled samples).

    # Load each attribute table into a pandas DataFrame.
    set1_dataset = pd.read_csv(os.path.join(FILES_DIRECTORY, 'set1.csv'), sep=',', low_memory=False)
    set2_dataset = pd.read_csv(os.path.join(FILES_DIRECTORY, 'set2.csv'), sep=',', low_memory=False)
    set3_dataset = pd.read_csv(os.path.join(FILES_DIRECTORY, 'set3.csv'), sep=',', low_memory=False)

    # Prefix every column with its source table so the origin of each
    # attribute stays identifiable after the merge. add_prefix renames all
    # columns in one pass (the former per-column rename loop was quadratic and
    # could rename a column twice when names collided with the prefix).
    set1_dataset = set1_dataset.add_prefix('set1_')
    set2_dataset = set2_dataset.add_prefix('set2_')
    set3_dataset = set3_dataset.add_prefix('set3_')

    # Concatenate the three tables side by side into a single dataset.
    input_dataset = pd.concat([set1_dataset, set2_dataset, set3_dataset], axis=1)

    # Drop constant attributes: they carry no information for a classifier.
    print('Removendo atributos com baixa variância....................................')
    variance_mask = VarianceThreshold().fit(input_dataset).get_support()
    input_dataset = input_dataset.iloc[:, variance_mask]
    print('Atributos removidos por baixa variância: %d' % np.sum(~variance_mask))

    # Outlier and missing-value handling: values outside the 1.5*IQR fences
    # are blanked to NaN and then, together with pre-existing NaNs, replaced
    # by the attribute mean.
    print('Removendo outliers e substituindo por valores nulos.........................')
    q1 = input_dataset.quantile(0.25)
    q3 = input_dataset.quantile(0.75)
    iqr = q3 - q1
    outliers = (input_dataset < (q1 - 1.5 * iqr)) | (input_dataset > (q3 + 1.5 * iqr))
    without_outliers = input_dataset.mask(outliers)
    # NOTE(review): for an attribute whose IQR is 0, every value different
    # from the quartile value is flagged as an outlier — confirm this is
    # acceptable for sparse attributes.
    print('Preenchendo valores nulos com a média dos atributos..........................')
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    # Impute the frame that had its outliers blanked. Previously the
    # untouched input_dataset was imputed, so the outlier step had no effect.
    input_dataset.loc[:, :] = imp_mean.fit_transform(without_outliers)

    # Scale every attribute to the [0, 1] range.
    input_dataset.loc[:, :] = MinMaxScaler().fit_transform(input_dataset)

    # Attach the class labels; rows without a training label keep NaN.
    # The training 'Id' values are used as index labels of input_dataset.
    train_dataset = pd.read_csv(os.path.join(FILES_DIRECTORY, 'train.csv'), sep=',')
    input_dataset['classe'] = np.nan
    input_dataset.loc[train_dataset['Id'].values, 'classe'] = train_dataset['Class'].values

    # Snapshot with all attributes, reused later for the unlabelled samples.
    backup_dataset = input_dataset.copy()

    # Rows whose label is -1 or 1; computed once and reused below (the index
    # of input_dataset does not change for the rest of this cell).
    labeled = input_dataset['classe'].isin([-1, 1])
    display(input_dataset.loc[labeled].head(10))

    cols = [col for col in input_dataset.columns if col != 'classe']

    # Select the best attributes for training the classifiers.
    print('Selecionando melhores features....................................')
    selector = SelectKBest(f_classif, k=6).fit(
        input_dataset.loc[labeled, cols].values,
        input_dataset.loc[labeled, 'classe'].values)
    # Append True so that the trailing 'classe' column is kept as well.
    mask = np.append(selector.get_support(), True)
    input_dataset = input_dataset.iloc[:, mask]

    # Scatter-plot matrix of the selected attributes.
    sns.pairplot(input_dataset, hue='classe', height=3.5)
    plt.show()

    # Covariance and correlation matrices (the last column, 'classe', is excluded).
    df_covariance = input_dataset.iloc[:, :-1].cov()
    df_correlation = input_dataset.iloc[:, :-1].corr()

    # Heat map of the covariance values.
    sns.heatmap(df_covariance, annot=True, xticklabels=df_correlation.columns, yticklabels=df_correlation.columns)
    plt.title('Covariância')
    plt.show()

    # Heat map of the correlation values.
    sns.heatmap(df_correlation, annot=True, xticklabels=df_correlation.columns, yticklabels=df_correlation.columns)
    plt.title('Correlação')
    plt.show()

    # info() prints its report and returns None, so it is not passed to display().
    input_dataset.info()

    print('Dados concatenados produzindo um total de %d atributos' %
            input_dataset.shape[1])

    print('O número de amostras com classificação válida é: %d' %
            labeled.sum())

    print('Dados de treinamento carregados com sucesso!')

    # Attribute matrix for the ids listed in the first column of test.csv.
    test_dataset = pd.read_csv(os.path.join(FILES_DIRECTORY, 'test.csv'), sep=',')
    K = input_dataset.loc[test_dataset.iloc[:, 0].values]
    K = K.drop('classe', axis=1).values

    print('Análise e visualização dos dados:')
    class_column = input_dataset.classe
    # Pass the Series as x=: newer seaborn releases treat the first
    # positional argument as `data`.
    sns.countplot(x=class_column, label="Contagem")
    # NOTE(review): value_counts() is ordered by frequency, not by class
    # value, and needs exactly three distinct labels to unpack — confirm the
    # N/U/P mapping against the actual class distribution.
    N, U, P = class_column.value_counts()
    print('Número de posts comuns:', N)
    print('Número de posts não reconhecidos:', U)
    print('Número de posts phishing', P)
    plt.show()

    display(input_dataset.describe())

    # Long-format view of the labelled rows for per-attribute distribution plots.
    plt.figure(figsize=(20, 10))
    data = pd.melt(input_dataset.loc[labeled],
                   id_vars="classe", var_name="features", value_name='value')
    sns.boxplot(x='features', y='value', hue='classe', data=data)
    plt.show()

    plt.figure(figsize=(20, 10))
    sns.violinplot(x='features', y='value', hue='classe', data=data, split=True, inner="quartile")
    plt.show()

    # Labelled samples as plain arrays for the following cells.
    X = input_dataset.loc[labeled].drop('classe', axis=1).values
    y = input_dataset.loc[labeled, 'classe'].values

    utils.printPCA(X, y)

    utils.beep(1, 400)

---
### Pré-processamento

Nesta seção, as funções da etapa de pré-processamento dos dados devem ser implementadas e aplicadas (se necessário)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.ensemble import RandomForestClassifier
from scripts import utils

  
# Pre-processing cell: filters the labelled samples through
# utils.remove_outliers, splits them into train and test partitions and plots
# the outcome. Relies on X and y produced by the exploratory-analysis cell.
X, y = utils.remove_outliers(X, y)

print(X.shape, y.shape)
print('Separando a base em treino e teste')

# 60% of the samples go to the test partition; the split is stratified on y
# and reproducible through the fixed random_state.
split_options = dict(test_size=0.6, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# PCA plot of the training partition first, then of the whole labelled set.
for features, labels in ((X_train, y_train), (X, y)):
    utils.printPCA(features, labels)
utils.printBoxPlot(X)

utils.beep(2, 600)



---
### Experimento

Nesta seção, o experimento deve ser conduzido, utilizando os protocolos experimentais ensinados no curso e executando os métodos inteligentes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import  plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.semi_supervised import LabelSpreading
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import resample


# Experiment cell: pseudo-labels the unlabelled samples, evaluates several
# classifiers (hold-out via utils.evaluate_model plus stratified 4-fold
# cross-validation) and writes submission.csv.
# Relies on X, y, K, mask, backup_dataset and test_dataset from the
# exploratory-analysis cell and on X_train, X_test, y_train, y_test from the
# pre-processing cell.

scoring_list = ['roc_auc', 'f1', 'f1_micro', 'f1_macro', 'balanced_accuracy']
results = pd.DataFrame()


def _evaluate_and_record(clfname, model, X_fit, y_fit):
    """Run the hold-out and cross-validated evaluation of one classifier.

    Prints a banner with ``clfname``, hands ``model`` together with the
    training data (``X_fit``, ``y_fit``) and the notebook-level test
    partition (``X_test``, ``y_test``) to ``utils.evaluate_model``, then runs
    ``cross_val_score`` on the notebook-level ``X``/``y`` with the folds in
    ``cv`` once per metric in ``scoring_list``. The mean of each metric is
    printed (with its standard deviation) and stored in
    ``results.loc[clfname, metric]``.
    """
    print(clfname + '------------------------------------------------------------------------------------')
    utils.evaluate_model(model, X_fit, y_fit, X_test, y_test)
    for scoring in scoring_list:
        scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
        print('Cross ' + scoring + ': %.3f (%.3f)------------------' % (np.mean(scores), np.std(scores)))
        results.loc[clfname, scoring] = np.mean(scores)


def _make_svc(kernel):
    """Build the SVC configuration shared by every kernel in this experiment."""
    return svm.SVC(kernel=kernel, class_weight='balanced', decision_function_shape='ovr',
                   probability=True, random_state=1)


# Balance the classes beforehand using oversampling.
X_train, y_train = utils.balance_classes(X_train, y_train)
# NOTE(review): X_bal / y_bal are not used anywhere below; the call is kept
# because the side effects of utils.balance_classes are not visible here.
X_bal, y_bal = utils.balance_classes(X, y)

# Pseudo-label the samples that have no class (label not in {-1, 0, 1}) with
# the classifier considered best, keeping only the most confident predictions.
X_semi_df = backup_dataset[~backup_dataset['classe'].isin([-1, 0, 1])]
X_semi_df = X_semi_df.iloc[:, mask]
model = _make_svc('poly')
clf = model.fit(X_train, y_train)
X_unlabeled = X_semi_df.drop('classe', axis=1).values
y_semi = clf.predict(X_unlabeled)
y_probas = clf.predict_proba(X_unlabeled)
# Confident means: the probability of the first class in clf.classes_ is
# below 0.2 or above 0.8.
proba_mask = (y_probas[:, 0] < 0.2) | (y_probas[:, 0] > 0.8)
y_semi = y_semi[proba_mask]
X_semi_df = X_semi_df[proba_mask]
X_semi = X_semi_df.drop('classe', axis=1).values

print('Número de amostras acrescentadas ao dataset de treinamento:', X_semi.shape[0],
                                                                      np.sum(y_semi==-1), np.sum(y_semi==1))
# The pseudo-labelled samples extend both the training partition and the
# full set used for cross-validation.
X_train = np.concatenate([X_train, X_semi], axis=0)
y_train = np.concatenate([y_train, y_semi], axis=0)
X = np.concatenate([X, X_semi], axis=0)
y = np.concatenate([y, y_semi], axis=0)

# Stratified K folds, i.e. each fold keeps the same class proportions.
cv = StratifiedKFold(n_splits=4, random_state=1, shuffle=True)

for kernel in ('poly', 'linear', 'rbf', 'sigmoid'):
    clfname = 'SVM ' + kernel
    model = _make_svc(kernel)
    _evaluate_and_record(clfname, model, X_train, y_train)

# Write the submission with the model evaluated last above (SVM sigmoid),
# refitted on the full augmented set.
# NOTE(review): the pseudo-labelling step treats the poly kernel as the best
# classifier — confirm that the sigmoid model is the intended submission model.
print('Printing model to submission.csv ###################################################################')
clf = model.fit(X, y)
y_pred_submission = clf.predict_proba(K)[:, 1]
# Ids come from the first column of test.csv, in the same order as the rows of K.
resultdf = pd.DataFrame({
    'Id': test_dataset.iloc[:, 0].values.astype(int),
    'Predicted': np.round(y_pred_submission, 5),
})
resultdf.to_csv('submission.csv', index=False, float_format='%.5f')
print('####################################################################################################')

for clfname, model in [
    ('RandomForestClassifier',
     RandomForestClassifier(max_depth=2, class_weight='balanced', random_state=1)),
    ('LogisticRegression',
     LogisticRegression(random_state=1, class_weight='balanced', max_iter=15000)),
]:
    _evaluate_and_record(clfname, model, X_train, y_train)

# The remaining models take no class_weight argument here, so the training
# partition (now containing the pseudo-labelled samples) is re-balanced first.
X_train, y_train = utils.balance_classes(X_train, y_train)

for clfname, model in [
    ('MLPClassifier',
     MLPClassifier(alpha=1e-5, hidden_layer_sizes=(600,), random_state=1, max_iter=5000)),
    ('MultinomialNB', MultinomialNB()),
    ('KNeighborsClassifier', KNeighborsClassifier(weights='distance')),
]:
    _evaluate_and_record(clfname, model, X_train, y_train)

display(results)

utils.beep(3, 800)



---
### Análise dos Resultados

Nesta seção, os resultados devem ser exibidos e comparados, através de tabelas e gráficos

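*(Seção ainda não preenchida: inserir aqui as tabelas e os gráficos que comparam os resultados dos classificadores avaliados no experimento.)*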
