### MACHINE LEARNING TECHNIQUES
------------------------------------------------------------ 

In [11]:
import numpy as np
import math

import sklearn
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics import pairwise_distances
#from sklearn.metrics import confusion_matrix

from scipy.cluster.hierarchy import dendrogram 
from scipy.cluster.hierarchy import linkage 
from scipy.cluster.hierarchy import set_link_color_palette
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
import scipy

from pylab import rcParams
import matplotlib.pyplot as plt
# Print NumPy floats with 4 digits of precision and without scientific notation.
np.set_printoptions(precision=4, suppress=True)
# Open a 10x3 figure; note this runs before the style sheet below is selected.
plt.figure(figsize=(10,3))
# NOTE(review): the 'seaborn-whitegrid' style name appears to have been renamed
# to 'seaborn-v0_8-whitegrid' in newer matplotlib releases - confirm against the
# installed version before upgrading.
plt.style.use('seaborn-whitegrid')

#%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree

### SELETOR

In [2]:
 def seletor(df, features_cols):
     """Return the feature columns that hold at least one non-zero value.

     A column is considered absent when every row of ``df`` is 0 for it; such
     columns are left out of the result. Each kept column is also reported on
     stdout together with its zero-row count and the total row count.

     Args:
         df: DataFrame containing the feature columns.
         features_cols: List of column names to inspect.

     Returns:
         List of the names in ``features_cols`` (in their original order)
         whose column is not entirely zero.
     """
     # The zero count must be compared against the number of ROWS; comparing
     # against the number of feature columns kept all-zero columns and could
     # drop columns that merely had as many zeros as there are features.
     total_rows = len(df)
     features_cols_exist = []
     for cmd in df[features_cols].keys():
         zero_rows = len(df.loc[df[cmd] == 0])
         if zero_rows != total_rows:
             print("{}: {} == {}".format(cmd, zero_rows, total_rows))
             features_cols_exist.append(cmd)
     return features_cols_exist

### NORMALIZADOR (L2)

In [3]:
def normalizador2(ls_dataset):
    """L2-normalize every vector of ``ls_dataset``.

    Each vector is divided by its Euclidean norm. When the division produces
    any NaN (for example an all-zero vector, whose norm is 0), the original
    vector is returned untouched instead of the NaN result.

    Args:
        ls_dataset: Iterable of numeric vectors (lists or NumPy arrays).

    Returns:
        List with one entry per input vector: the normalized array, or the
        original vector object when normalization is not possible.
    """
    def l2_normalize(v):
        norm = np.sqrt(np.sum(np.square(v)))
        # A zero norm makes the division yield NaN/inf; that case is handled
        # explicitly right below, so silence NumPy's warning for it here.
        with np.errstate(divide='ignore', invalid='ignore'):
            result = v / norm
        # Vectorized NaN check: also works for inputs that are not 1-D.
        if np.isnan(result).any():
            return v
        return result

    return [l2_normalize(objeto) for objeto in ls_dataset]

In [20]:
def normalizador(v):
    """Scale ``v`` by the inverse of its Euclidean (L2) norm.

    The norm is the square root of the sum of the squared entries of ``v``;
    the function returns ``v`` divided by that value. No special handling is
    applied when the norm is zero.
    """
    magnitude = np.sqrt(np.sum(np.square(v)))
    return v / magnitude

### AGRUPADOR (KMEANS + AGGLOMERATIVE HIERARCHICAL)

In [1]:
## OUTPUT: the DataFrame gains a column with the group labels (GA, GB, GC...)
#------------------------------------------------------------
def agrupador(df, features_cols, start_stud, output_cluster, num_clusters):
    """Cluster the rows of ``df`` and tag each row with a ranked group name.

    K-Means is fitted only to report its silhouette score. The labels written
    to ``df`` come from complete-linkage agglomerative clustering, and a
    dendrogram of the same complete-linkage hierarchy is saved to
    ``output_cluster``. Groups are named 'GA', 'GB', ... in descending order
    of the sum of their per-feature means.

    Args:
        df: DataFrame with the samples. It is modified in place: a 'Label'
            column is inserted after the first ``len(features_cols)``
            columns, or overwritten when it already exists.
        features_cols: List of column names used as clustering features.
        start_stud: Number shown on the first dendrogram leaf; the following
            leaves count up from it.
        output_cluster: Path handed to ``plt.savefig`` for the dendrogram.
        num_clusters: Number of clusters requested from both algorithms.

    Returns:
        Tuple ``(df, rotulos)`` where ``rotulos`` is the list of group names
        in row order.
    """
    # Every algorithm works on the same feature matrix, so extra columns
    # (such as a 'Label' left over from an earlier call) never leak into the
    # distance computations.
    features = df[features_cols].values
    cutting_height = len(df) - num_clusters

    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(features)
    silhouette = metrics.silhouette_score(features, kmeans.labels_, metric='euclidean')
    print("\n>>> K-MEANS \nSilhouette: {} \n{}".format(round(silhouette,2), kmeans.labels_))

    ## Agglomerative Hierarchical
    #------------------------------------------------------------
    # Euclidean distance is the estimator's default; the explicit `affinity`
    # keyword is omitted because recent scikit-learn releases removed it.
    cluster = AgglomerativeClustering(n_clusters=num_clusters, linkage='complete', compute_distances=True)
    cluster.fit_predict(features)

    Z = linkage(features, method='complete', metric='euclidean')
    leaf_labels = list(range(start_stud, len(df) + start_stud))
    coph_corr, _ = cophenet(Z, pdist(features))
    # Height of the merge that would reduce num_clusters groups to one fewer;
    # links below it are coloured per cluster in the dendrogram.
    cut = Z[cutting_height][2]

    ## Dendrogram
    #------------------------------------------------------------
    plt.figure(figsize=(10, 7))
    plt.grid(False)
    plt.ylabel('Euclidean Distance')
    plt.xlabel('Student clustering')

    set_link_color_palette(["blue", "green", "orange"])
    dendrogram(Z, orientation='top', labels=leaf_labels, distance_sort='descending',
               show_leaf_counts=True, color_threshold=cut, above_threshold_color='yellow',
               leaf_font_size=12, leaf_rotation=90)
    plt.axhline(y=cut, linestyle='--', color='r')
    plt.savefig(output_cluster, bbox_inches='tight')

    print("\n>>> AGGLOMERATIVE HIERARCHICAL")
    print("Cophenetic Correlation Coefficient: {} \n{}".format(round(coph_corr,2), cluster.labels_))

    #------------------------------------------------------------
    if 'Label' in df:
        # Refresh stale labels from a previous run with the new clustering.
        df['Label'] = cluster.labels_
    else:
        df.insert(len(features_cols), 'Label', cluster.labels_)

    #------------------------------------------------------------
    # The group mean is used so that smaller groups are not penalized.
    group_scores = (df.groupby('Label')[list(features_cols)].mean().sum(axis=1)).sort_values(ascending=False)

    # Best-scoring cluster becomes 'GA', the next 'GB', and so on. This works
    # for any number of clusters (names are only letters up to 26 groups).
    group_names = {
        cluster_id: 'G' + chr(ord('A') + rank)
        for rank, cluster_id in enumerate(group_scores.index.tolist())
    }

    #------------------------------------------------------------
    # Replace the whole column at once: independent of the DataFrame index
    # and free of per-cell assignments of strings into an integer column.
    rotulos = [group_names[lab] for lab in df['Label']]
    df['Label'] = rotulos

    #------------------------------------------------------------
    return (df, rotulos)

### CLASSIFICADOR (ID3)

In [None]:
import pydotplus

def classificador(df, features_cols, output_tree, printador):
    """Fit an entropy-based decision tree on the labelled data and render it.

    The tree is trained on ``df[features_cols]`` against ``df['Label']`` using
    every row (no hold-out split is made), exported through Graphviz and
    written as a PNG to ``output_tree``. Leaves and inner nodes that contain
    a single class are filled with that class's colour; mixed nodes are white.

    Args:
        df: DataFrame containing the feature columns and a 'Label' column.
        features_cols: List of column names used as predictors.
        output_tree: Path of the PNG file to write.
        printador: When truthy, print the node count, the feature
            importances and the feature order.
    """
    clf = DecisionTreeClassifier(max_depth=None, random_state=0, criterion='entropy')
    clf.fit(df[features_cols], df['Label'])

    # Class names must follow the order of clf.classes_; deriving them from
    # the fitted model keeps names aligned even when a group is missing from
    # df (a fixed ['GA', 'GB', 'GC', 'GD'] list would mislabel the rest).
    cn = [str(label) for label in clf.classes_]

    dot_data = tree.export_graphviz(clf, feature_names = features_cols, class_names = cn, out_file=None,
                                    filled = True, rounded = False, special_characters=True)

    graph = pydotplus.graph_from_dot_data(dot_data)
    colors = ('yellow', 'orange', 'green', 'lightblue', 'pink')

    for node in graph.get_node_list():
        # 'node' and 'edge' are Graphviz default-attribute entries, not tree nodes.
        if node.get_name() in ('node', 'edge'):
            continue
        values = clf.tree_.value[int(node.get_name())][0]
        if max(values) == sum(values):
            # Pure node: only one class is present. Wrap around the palette
            # so more classes than colours cannot raise an IndexError.
            node.set_fillcolor(colors[np.argmax(values) % len(colors)])
        else:
            # Mixed nodes get a neutral colour.
            node.set_fillcolor("white")

    graph.write_png(output_tree)

    if printador:
        print("Número de nós: {}".format(clf.tree_.node_count))
        print("\nImportância das características: \n{}".format(clf.feature_importances_))
        print("\nOrdem das características: \n{}".format(df[features_cols].keys()))