# Parte 2

In [1]:
%matplotlib inline

from sklearn.datasets import make_blobs, make_moons, make_regression, load_iris
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import copy
import csv

from scipy.stats import multivariate_normal
from sklearn.neighbors.kde import KernelDensity

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

## Base de dados

In [2]:
dados = pd.read_csv('data.csv')

# removed from shape view
dados = dados.drop(columns='REGION-PIXEL-COUNT')
dados = dados.drop(columns='SHORT-LINE-DENSITY-5')
dados = dados.drop(columns='SHORT-LINE-DENSITY-2')

# removed from RGB view
dados = dados.drop(columns='INTENSITY-MEAN')
dados = dados.drop(columns='RAWRED-MEAN')
dados = dados.drop(columns='RAWBLUE-MEAN')
dados = dados.drop(columns='RAWGREEN-MEAN')
dados = dados.drop(columns='EXRED-MEAN')
dados = dados.drop(columns='EXBLUE-MEAN')

dados = np.asarray(dados)
X = np.copy(dados)
y_names = X[:, 0]
X = np.array(X[:, 1:], dtype='f')
labels = set(y_names)
classes_dict = {}

i = 0

for label in labels:
    classes_dict[label] = i
    i += 1

y_numbers = [classes_dict[y_names[i]] for i in range(y_names.shape[0])]
y_numbers = np.array(y_numbers)

## Classificador bayesiano gaussiano

In [3]:
class BayesClassifier:
    mu = None
    cov = None
    n_classes = None
    
    def __init__(self):
        a = None
    
    def pred(self, x):
        prob_vect = np.zeros(self.n_classes)
        
        for i in range(self.n_classes):
            mnormal = multivariate_normal(mean=self.mu[i], cov=self.cov[i])
            
            prior = 1. / self.n_classes
            
            prob_vect[i] = prior * mnormal.pdf(x)
            summ = 0.
            
            for j in range(self.n_classes):
                mnormal = multivariate_normal(mean=self.mu[j], cov=self.cov[j])
                summ += prior * mnormal.pdf(x)
            
            prob_vect[i] = prob_vect[i] / summ
        
        return prob_vect
    
    def fit(self, X, y):
        self.mu = []
        self.cov = []
        self.n_classes = np.max(y) + 1
    
        for c in range(self.n_classes):
            Xc = X[y==c]

            mu_c = np.mean(Xc, axis=0)
            self.mu.append(mu_c)
            
            cov_c = np.zeros((X.shape[1], X.shape[1]))
            
            for i in range(Xc.shape[1]):
                for j in range(Xc.shape[1]):
                    if i == j:
                        cov_c[i,j] = np.inner(Xc[:,i] - mu_c[i], Xc[:,j] - mu_c[j])
            
            cov_c = cov_c / float(X.shape[0])
            self.cov.append(cov_c)
        
        self.mu = np.asarray(self.mu)
        self.cov = np.asarray(self.cov)

### Treinamento e teste

In [4]:
output_1 = []
output_2 = []
output_3 = []

for i in range(30):
    # outputs by iteration
    iter_output_1 = []
    iter_output_2 = []
    iter_output_3 = []

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
    
    for train_index, test_index in skf.split(X, y_numbers):
        view_1 = np.copy(X[:, 0:6])
        view_2 = np.copy(X[:, 6:])
        view_3 = np.copy(X)
        
        X_train, X_test = view_1[train_index], view_1[test_index]
        y_train, y_test = y_numbers[train_index], y_numbers[test_index]
        bc_1 = BayesClassifier()
        bc_1.fit(X_train, y_train)
        
        hit = 0.

        for i in range(X_test.shape[0]):
            ypred = bc_1.pred(X_test[i])

            if np.argmax(ypred) == y_test[i]:
                hit += 1
        
        iter_output_1.append(hit / X_test.shape[0])
        
        X_train, X_test = view_2[train_index], view_2[test_index]
        y_train, y_test = y_numbers[train_index], y_numbers[test_index]
        bc_2 = BayesClassifier()
        bc_2.fit(X_train, y_train)
        
        hit = 0.

        for i in range(X_test.shape[0]):
            ypred = bc_2.pred(X_test[i])

            if np.argmax(ypred) == y_test[i]:
                hit += 1
        
        iter_output_2.append(hit / X_test.shape[0])
        
        X_train, X_test = view_3[train_index], view_3[test_index]
        y_train, y_test = y_numbers[train_index], y_numbers[test_index]
        bc_3 = BayesClassifier()
        bc_3.fit(X_train, y_train)
        
        hit = 0.

        for i in range(X_test.shape[0]):
            ypred = bc_3.pred(X_test[i])

            if np.argmax(ypred) == y_test[i]:
                hit += 1
        
        iter_output_3.append(hit / X_test.shape[0])
    
    output_1.append(iter_output_1)
    output_2.append(iter_output_2)
    output_3.append(iter_output_3)

with open('output/bayesian_v1', 'w') as output_file_1:
    wr = csv.writer(output_file_1, quoting=csv.QUOTE_NONE)
    for row in output_1:
        wr.writerow(row)

with open('output/bayesian_v2', 'w') as output_file_2:
    wr = csv.writer(output_file_2, quoting=csv.QUOTE_NONE)
    for row in output_2:
        wr.writerow(row)

with open('output/bayesian_v3', 'w') as output_file_3:
    wr = csv.writer(output_file_3, quoting=csv.QUOTE_NONE)
    for row in output_3:
        wr.writerow(row)



KeyboardInterrupt: 

## Estimador de Parzen

### Classificador

In [6]:
def get_lines_by_class(X, y):
    unique_class = np.unique(y)
    index = {}
    
    for i in unique_class:
        index[i] = {'x' : X[np.where(y==i)[0]], 'y' : y[np.where(y==i)][0]}
    
    return index

In [7]:
def parzen_estimation(x_train, y_train, h):
    data_by_class = get_lines_by_class(x_train, y_train)
    density = {}
    
    for i in data_by_class:
        kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(data_by_class[i]['x'])
        density[i] = kde
    
    return density

In [8]:
def estimate_bandwidth(x_train, y_train):
    bandwidths = [0.1, 0.8, 1.5, 2.2, 2.9, 3.6, 4.3, 5.0, 5.7, 6.4]
    X_trn, X_validation, y_trn, y_validation = train_test_split(x_train, y_train, test_size=0.20, random_state=0)
    
    precision_of_h = {}
    
    for h in bandwidths:
        #tenho 7 kdes para h
        modelos_kde_hi = parzen_estimation(X_trn, y_trn, h)
        priori_probability = 1. / 7.
        predict_hi, _ = predict(modelos_kde_hi, X_validation, priori_probability)
        precision_of_h[h] = precision(y_validation, predict_hi)

    return max([a for a, b in precision_of_h.items() if b == max([v for idx, v in precision_of_h.items()])])

In [9]:
def precision(y_actual, prediction):
    j = 0
    positive = 0
    
    for i in prediction:
        if i == y_actual[j]:
            positive = positive + 1
        
        j = j + 1
    
    return positive / len(y_actual)

In [10]:
def choose_max_class(results):
    mx = np.max([results[r] for r in results])
    
    for i in results:
        if(len(np.where(results[i]==mx)[0]) > 0):
            return i
    
    return

In [11]:
def predict(models, x_validation, priori):
    posteriori = {}
    prediction_cls = []
    prediction = []
    
    for line in x_validation:
        sum_evidences = np.sum([np.exp(models[values].score_samples(line.reshape((1, line.shape[0]))) \
                                       + np.log(priori)) for values in models])
        
        for i in models:
            posteriori[i] = np.exp(models[i].score_samples(line.reshape((1, line.shape[0]))) \
                                   + np.log(priori)) / sum_evidences
        
        prediction_cls.append(copy.copy(posteriori))
        prediction.append(choose_max_class(posteriori))
    
    return prediction, prediction_cls

### Treinamento e teste

In [12]:
output_1 = []
output_2 = []
output_3 = []

for i in range(30):
    # outputs by iteration
    iter_output_1 = []
    iter_output_2 = []
    iter_output_3 = []
    
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
    
    for train_index, test_index in skf.split(X, y_numbers):
        # shape view
        view_1 = pd.DataFrame(X).iloc[:, :6].values

        # RGB view
        view_2 = pd.DataFrame(X).iloc[:, 6:].values

        # complete view
        view_3 = X

        bandwidth_1 = estimate_bandwidth(view_1[train_index], y_numbers[train_index])
        model_1 = parzen_estimation(view_1[train_index], y_numbers[train_index], bandwidth_1)
        prediction_1, _ = predict(model_1, view_1[test_index], 1. / 7.)
        hit_rate_1 = precision([z for z in y_numbers[test_index]], prediction_1)
        
        iter_output_1.append(hit_rate_1)

        bandwidth_2 = estimate_bandwidth(view_2[train_index], y_numbers[train_index])
        model_2 = parzen_estimation(view_2[train_index], y_numbers[train_index], bandwidth_2)
        prediction_2, _ = predict(model_2, view_2[test_index], 1. / 7.)
        hit_rate_2 = precision([z for z in y_numbers[test_index]], prediction_2)

        iter_output_2.append(hit_rate_2)

        bandwidth_3 = estimate_bandwidth(view_3[train_index], y_numbers[train_index])
        model_3 = parzen_estimation(view_3[train_index], y_numbers[train_index], bandwidth_3)
        prediction_3, _ = predict(model_3, view_3[test_index], 1. / 7.)
        hit_rate_3 = precision([z for z in y_numbers[test_index]], prediction_3)
        
        iter_output_3.append(hit_rate_3)
    
    output_1.append(iter_output_1)
    output_2.append(iter_output_2)
    output_3.append(iter_output_3)

with open('output/parzen_v1', 'w') as output_file_1:
    wr = csv.writer(output_file_1, quoting=csv.QUOTE_NONE)
    for row in output_1:
        wr.writerow(row)

with open('output/parzen_v2', 'w') as output_file_2:
    wr = csv.writer(output_file_2, quoting=csv.QUOTE_NONE)
    for row in output_2:
        wr.writerow(row)

with open('output/parzen_v3', 'w') as output_file_3:
    wr = csv.writer(output_file_3, quoting=csv.QUOTE_NONE)
    for row in output_3:
        wr.writerow(row)

  # Remove the CWD from sys.path while we load stuff.
  return umr_maximum(a, axis, None, out, keepdims)


KeyboardInterrupt: 

## Classificador por regra da soma

In [13]:
def train_classifiers(X_train, y_train):
    bayesian_model = BayesClassifier()
    bayesian_model.fit(X_train, y_train)
    
    h = estimate_bandwidth(X_train, y_train)
    parzen_model = parzen_estimation(X_train, y_train, h)
    
    return bayesian_model, parzen_model

In [14]:
def get_sum_classifier_hit_rate(bayesian_model, parzen_model, X_test, y_test):
    posteriori_bayesian = []
    
    for i in range(X_test.shape[0]):
        posterior_x = bayesian_model.pred(X_test[i])
        dict_posterior_x = {}
        
        for j in range(len(posterior_x)):
            dict_posterior_x[j] = np.array([posterior_x[j]])
            
        posteriori_bayesian.append(dict_posterior_x)
    
    _, posteriori_parzen = predict(parzen_model, X_test, 1. / 7.)
    
    hit = 0.
    
    sums_bayesian_parzen = []
    y_predicted = []
    
    for j in range(len(posteriori_bayesian)):
        posteriori_bayesian_j = [float(np.reshape(v, ())) for v in posteriori_bayesian[j].values()]
        posteriori_parzen_j = [float(np.reshape(v, ())) for v in posteriori_parzen[j].values()]
        
        sums_bayesian_parzen.append([sum(v) for v in zip(posteriori_bayesian_j, posteriori_parzen_j)])
    
    y_predicted = [np.argmax(s) for s in sums_bayesian_parzen]
    
    return precision(y_test, y_predicted)

### Treinamento e teste

In [15]:
output_1 = []
output_2 = []
output_3 = []

for i in range(30):
    # outputs by iteration
    iter_output_1 = []
    iter_output_2 = []
    iter_output_3 = []
    
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

    for train_index, test_index in skf.split(X, y_numbers):
        # shape view
        view_1 = pd.DataFrame(X).iloc[:, :6].values

        # RGB view
        view_2 = pd.DataFrame(X).iloc[:, 6:].values

        # complete view
        view_3 = X
        
        bayesian_model_1, parzen_model_1 = train_classifiers(view_1[train_index], y_numbers[train_index])
        hit_rate_1 = get_sum_classifier_hit_rate(bayesian_model_1, parzen_model_1, \
                                                 view_1[test_index], y_numbers[test_index])
        
        iter_output_1.append(hit_rate_1)
        
        bayesian_model_2, parzen_model_2 = train_classifiers(view_2[train_index], y_numbers[train_index])
        hit_rate_2 = get_sum_classifier_hit_rate(bayesian_model_2, parzen_model_2, \
                                                 view_2[test_index], y_numbers[test_index])
        
        iter_output_2.append(hit_rate_2)
        
        bayesian_model_3, parzen_model_3 = train_classifiers(view_3[train_index], y_numbers[train_index])
        hit_rate_3 = get_sum_classifier_hit_rate(bayesian_model_3, parzen_model_3, \
                                                 view_3[test_index], y_numbers[test_index])
        
        iter_output_3.append(hit_rate_3)

    output_1.append(iter_output_1)
    output_2.append(iter_output_2)
    output_3.append(iter_output_3)

with open('output/sum_v1', 'w') as output_file_1:
    wr = csv.writer(output_file_1, quoting=csv.QUOTE_NONE)
    for row in output_1:
        wr.writerow(row)

with open('output/sum_v2', 'w') as output_file_2:
    wr = csv.writer(output_file_2, quoting=csv.QUOTE_NONE)
    for row in output_2:
        wr.writerow(row)

with open('output/sum_v3', 'w') as output_file_3:
    wr = csv.writer(output_file_3, quoting=csv.QUOTE_NONE)
    for row in output_3:
        wr.writerow(row)

  # Remove the CWD from sys.path while we load stuff.
  return umr_maximum(a, axis, None, out, keepdims)


KeyboardInterrupt: 