### MACHINE LEARNING TECHNIQUES
------------------------------------------------------------ 

In [11]:
import numpy as np
import math

import sklearn
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics import pairwise_distances
#from sklearn.metrics import confusion_matrix

from scipy.cluster.hierarchy import dendrogram 
from scipy.cluster.hierarchy import linkage 
from scipy.cluster.hierarchy import set_link_color_palette
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet, centroid
from scipy.spatial.distance import pdist
import scipy

from pylab import rcParams
import matplotlib.pyplot as plt
# Global display settings for the notebook.
np.set_printoptions(precision=4, suppress=True)
plt.figure(figsize=(10,3))
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever one is installed
# instead of failing on a purely cosmetic setting.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

#%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree

### Hierarchical Clustering

In [None]:
# Function to compute the pairwise distance matrix for the selected metric
# input: data
# output: distance matrix
#------------------------------------------------------------ 
def distances_metric(data, metric='euclidean'):
    """Return the symmetric pairwise distance matrix between the rows of *data*.

    Parameters
    ----------
    data : array-like
        One observation per row. Anything ``np.asarray`` can turn into a
        numeric array is accepted (ndarray, nested lists, DataFrame); any
        trailing dimensions of an observation are flattened.
    metric : {'euclidean', 'manhattan'}
        Distance to compute between every pair of rows.

    Returns
    -------
    numpy.ndarray
        An ``(n, n)`` float matrix with zeros on the diagonal.

    Raises
    ------
    ValueError
        If *metric* is not one of the supported names.
    """
    # Validate first: previously an unknown metric only printed a message and
    # then failed with UnboundLocalError on the return statement.
    if metric not in ('euclidean', 'manhattan'):
        raise ValueError(
            "<< Invalid Metric >>: {!r} (expected 'euclidean' or 'manhattan')".format(metric)
        )

    points = np.asarray(data, dtype=float)
    n_points = points.shape[0]
    if n_points == 0:
        return np.zeros((0, 0))
    # One flat feature vector per observation, whatever the input rank was.
    points = points.reshape(n_points, -1)

    # Broadcasting yields every pairwise difference at once: shape (n, n, d).
    diffs = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    if metric == 'euclidean':
        return np.sqrt((diffs ** 2).sum(axis=-1))
    return np.abs(diffs).sum(axis=-1)


# Function to define the linkage function
# input: distance matrix, and two clusters
# output: clusters distance
#------------------------------------------------------------ 
def single_linkage(distance_matrix, cluster1, cluster2):
    """Return the smallest point-to-point distance between two clusters.

    *cluster1* and *cluster2* are iterables of row/column positions into
    *distance_matrix*. They are converted to integer index arrays first
    because ``np.ix_`` rejects unordered containers such as sets.
    """
    rows = np.fromiter(cluster1, dtype=np.intp)
    cols = np.fromiter(cluster2, dtype=np.intp)
    return np.min(np.asarray(distance_matrix)[np.ix_(rows, cols)])

def complete_linkage(distance_matrix, cluster1, cluster2):
    """Return the largest point-to-point distance between two clusters.

    *cluster1* and *cluster2* are iterables of row/column positions into
    *distance_matrix*. They are converted to integer index arrays first
    because ``np.ix_`` rejects unordered containers such as sets.
    """
    rows = np.fromiter(cluster1, dtype=np.intp)
    cols = np.fromiter(cluster2, dtype=np.intp)
    return np.max(np.asarray(distance_matrix)[np.ix_(rows, cols)])

def average_linkage(distance_matrix, cluster1, cluster2):
    """Return the mean point-to-point distance between two clusters.

    *cluster1* and *cluster2* are iterables of row/column positions into
    *distance_matrix*. They are converted to integer index arrays first
    because ``np.ix_`` rejects unordered containers such as sets.
    """
    rows = np.fromiter(cluster1, dtype=np.intp)
    cols = np.fromiter(cluster2, dtype=np.intp)
    return np.mean(np.asarray(distance_matrix)[np.ix_(rows, cols)])


# Function to group the data
# input: data, the user-defined metric and method, and other control variables
# output: grouped data
#------------------------------------------------------------ 
def _custom_linkage_matrix(distance_matrix, linkage_fn):
    """Agglomerate every point and return a SciPy-style linkage matrix.

    Each row is ``[id_a, id_b, merge_distance, merged_size]``. The original
    observations carry ids ``0..n-1`` and the cluster created by merge number
    ``i`` gets id ``n + i``, which is the layout ``dendrogram``, ``fcluster``
    and ``cophenet`` expect.
    """
    n = distance_matrix.shape[0]
    # Cluster id -> member positions. The point-to-point distance matrix is
    # never modified, so linkage distances are always computed from the data.
    active = {i: [i] for i in range(n)}
    merges = np.zeros((n - 1, 4))

    for step in range(n - 1):
        ids = list(active)
        best_pair = None
        best_dist = np.inf
        for pos, left in enumerate(ids):
            for right in ids[pos + 1:]:
                dist = linkage_fn(distance_matrix, active[left], active[right])
                if dist < best_dist:
                    best_dist = dist
                    best_pair = (left, right)
        if best_pair is None:
            raise ValueError("Cannot cluster: the distance matrix has no finite distances")

        left, right = best_pair
        members = active.pop(left) + active.pop(right)
        active[n + step] = members
        merges[step] = (left, right, best_dist, len(members))

    return merges


def agrupador(df, features_cols, start_stud, output_cluster, num_clusters, thisMetric, thisMethod):
    """Cluster students with the hand-written agglomerative algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per student; every column is used as a clustering feature.
    features_cols : sequence
        Feature column names; only its length is used, as the position at
        which the 'Cluster' column is inserted.
    start_stud : int
        Offset added to the index so student labels start at this number.
    output_cluster : str
        Path the dendrogram figure is saved to.
    num_clusters : int
        Number of groups to cut the hierarchy into (2 <= num_clusters <= rows).
    thisMetric : {'euclidean', 'manhattan'}
        Point-to-point distance.
    thisMethod : {'single', 'complete', 'average'}
        Cluster-to-cluster linkage.

    Returns
    -------
    tuple
        ``(df, dfx, coefficient)``: the clustered rows with a 'Cluster' column
        ('GA' is the group with the highest summed feature means, then 'GB',
        ...), the rows whose features sum to zero (left unclustered), and the
        cophenetic correlation coefficient rounded to two decimals.

    Raises
    ------
    ValueError
        For an unknown metric or method, or a num_clusters outside the valid
        range.
    """
    linkage_functions = {
        'single': single_linkage,
        'complete': complete_linkage,
        'average': average_linkage,
    }
    axis_labels = {
        'euclidean': 'Euclidean Distance',
        'manhattan': 'Manhattan Distance',
    }
    if thisMetric not in axis_labels:
        raise ValueError("<< Invalid Metric >>: {!r}".format(thisMetric))
    if thisMethod not in linkage_functions:
        raise ValueError("<< Invalid Method >>: {!r}".format(thisMethod))

    # Initial Settings
    #------------------------------------------------------------
    # Students whose features sum to zero are set aside and not clustered.
    # Copies are taken so the caller's frame is never modified.
    row_totals = df.sum(axis=1)
    dfx = df[row_totals == 0].copy()
    df = df[row_totals > 0].copy()

    df.index = df.index + start_stud
    dfx.index = dfx.index + start_stud
    df.index.name = 'stud.'

    n = df.shape[0]
    if not 2 <= num_clusters <= n:
        raise ValueError(
            "num_clusters must be between 2 and the number of clustered rows "
            "({}), got {}".format(n, num_clusters)
        )

    # Calculate distances metric
    #------------------------------------------------------------
    # Plain lists of positions and a plain ndarray are passed on, so this
    # works with row-indexed helpers regardless of how df is labelled.
    points = df.to_numpy(dtype=float)
    distance_matrix = distances_metric(points, thisMetric)

    # Merge clusters
    #------------------------------------------------------------
    Z = _custom_linkage_matrix(distance_matrix, linkage_functions[thisMethod])
    # 0-based flat cluster labels obtained by cutting the hierarchy.
    labels = fcluster(Z, t=num_clusters, criterion='maxclust') - 1

    # The upper triangle in row-major order is the condensed form cophenet
    # expects, and it holds the distances the hierarchy was built from.
    condensed = distance_matrix[np.triu_indices(n, k=1)]
    coph_corr, coph_dists = cophenet(Z, condensed)

    ## Dendrogram Settings
    #------------------------------------------------------------
    # The cut line sits at the merge that would leave num_clusters - 1 groups.
    cut = Z[n - num_clusters, 2]
    labelList = df.index.tolist()

    plt.figure(figsize=(10, 7))
    plt.grid(False)
    plt.ylabel(axis_labels[thisMetric])
    plt.xlabel('Student clustering')

    set_link_color_palette(["blue", "orange", "green"])
    dendrogram(Z, orientation='top', labels=list(labelList), distance_sort='descending',
               show_leaf_counts=True, color_threshold=cut, above_threshold_color='yellow',
               leaf_font_size=12, leaf_rotation=90)
    plt.axhline(y=cut, linestyle='--', color='r')
    plt.savefig(output_cluster, bbox_inches='tight')

    print("\n>>> AGGLOMERATIVE HIERARCHICAL ({}, {})".format(thisMetric, thisMethod))
    print("Cophenetic Correlation Coefficient: {} \n{}".format(round(coph_corr, 2), labels))

    #------------------------------------------------------------
    if 'Cluster' not in df:
        df.insert(len(features_cols), 'Cluster', labels)

    #------------------------------------------------------------
    # The group mean is used so that smaller groups are not penalised.
    ranking = df.groupby(['Cluster']).mean().sum(axis=1).sort_values(ascending=False)

    # Best-ranked group becomes 'GA', the next 'GB', and so on, for however
    # many groups the cut produced.
    group_names = {
        cluster_id: 'G' + chr(ord('A') + rank)
        for rank, cluster_id in enumerate(ranking.index)
    }
    df['Cluster'] = df['Cluster'].map(group_names)
    #------------------------------------------------------------
    return (df, dfx, round(coph_corr, 2))
    

In [None]:
## OUTPUT: the DataFrame gains a 'Cluster' column holding the group labels (GA, GB, GC, ...)
#------------------------------------------------------------
def agrupador2(df, features_cols, start_stud, output_cluster, num_clusters, thisMetric, thisMethod):
    """Cluster students with scikit-learn / SciPy agglomerative clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per student; every column is used as a clustering feature.
    features_cols : sequence
        Feature column names; only its length is used, as the position at
        which the 'Cluster' column is inserted.
    start_stud : int
        Offset added to the index so student labels start at this number.
    output_cluster : str
        Path the dendrogram figure is saved to.
    num_clusters : int
        Number of groups to form (2 <= num_clusters <= rows).
    thisMetric : str
        Distance metric name as understood by scikit-learn
        (e.g. 'euclidean', 'manhattan').
    thisMethod : str
        Linkage method (e.g. 'single', 'complete', 'average', 'ward').

    Returns
    -------
    tuple
        ``(df, dfx, coefficient)``: the clustered rows with a 'Cluster' column
        ('GA' is the group with the highest summed feature means, then 'GB',
        ...), the rows whose features sum to zero (left unclustered), and the
        cophenetic correlation coefficient rounded to two decimals.

    Raises
    ------
    ValueError
        If num_clusters is outside the valid range.
    """
    # Students whose features sum to zero are set aside and not clustered.
    # Copies are taken so the caller's frame is never modified.
    row_totals = df.sum(axis=1)
    dfx = df[row_totals == 0].copy()
    df = df[row_totals > 0].copy()

    df.index = df.index + start_stud
    dfx.index = dfx.index + start_stud
    df.index.name = 'stud.'

    # define dendrogram cut line
    cutting_height = len(df) - num_clusters
    if not 0 <= cutting_height < len(df) - 1:
        raise ValueError(
            "num_clusters must be between 2 and the number of clustered rows "
            "({}), got {}".format(len(df), num_clusters)
        )

    ## Agglomerative Hierarchical
    #------------------------------------------------------------
    # create the cluster.labels_
    # Newer scikit-learn releases take 'metric'; older ones only know
    # 'affinity' and reject the unknown keyword with a TypeError.
    try:
        cluster = AgglomerativeClustering(n_clusters=num_clusters, metric=thisMetric,
                                          linkage=thisMethod, compute_distances=True)
    except TypeError:
        cluster = AgglomerativeClustering(n_clusters=num_clusters, affinity=thisMetric,
                                          linkage=thisMethod, compute_distances=True)
    cluster.fit_predict(df)

    # SciPy names some metrics differently from scikit-learn.
    scipy_metric = {'manhattan': 'cityblock', 'l1': 'cityblock', 'l2': 'euclidean'}.get(
        thisMetric, thisMetric)
    Z = linkage(df, method=thisMethod, metric=scipy_metric)
    labelList = df.index.tolist()

    # Compare against the same distances the hierarchy was built from.
    coph_corr, coph_dists = cophenet(Z, pdist(df, metric=scipy_metric))
    cut = Z[cutting_height][2]

    ## Dendrogram
    #------------------------------------------------------------
    plt.figure(figsize=(10, 7))
    plt.grid(False)
    plt.ylabel('{} Distance'.format(str(thisMetric).capitalize()))
    plt.xlabel('Student clustering')

    set_link_color_palette(["blue", "orange", "green"])
    dendrogram(Z, orientation='top', labels=list(labelList), distance_sort='descending',
               show_leaf_counts=True, color_threshold=cut, above_threshold_color='yellow',
               leaf_font_size=12, leaf_rotation=90)
    plt.axhline(y=cut, linestyle='--', color='r')
    plt.savefig(output_cluster, bbox_inches='tight')

    print("\n>>> AGGLOMERATIVE HIERARCHICAL ({}, {})".format(thisMetric, thisMethod))
    print("Cophenetic Correlation Coefficient: {} \n{}".format(round(coph_corr, 2), cluster.labels_))

    #------------------------------------------------------------
    if 'Cluster' not in df:
        df.insert(len(features_cols), 'Cluster', cluster.labels_)

    #------------------------------------------------------------
    # The group mean is used so that smaller groups are not penalised.
    ranking = df.groupby(['Cluster']).mean().sum(axis=1).sort_values(ascending=False)

    # Best-ranked group becomes 'GA', the next 'GB', and so on. A mapping is
    # used instead of unpacking into a, b, c, d: that unpacking required
    # exactly four groups and overwrote the cophenetic coefficient.
    group_names = {
        cluster_id: 'G' + chr(ord('A') + rank)
        for rank, cluster_id in enumerate(ranking.index)
    }
    df['Cluster'] = df['Cluster'].map(group_names)
    #------------------------------------------------------------
    return (df, dfx, round(coph_corr, 2))


### Decision Tree

In [None]:
import pydotplus

def classificador(df, features_cols, output_tree, printador):
    """Fit an entropy decision tree on the cluster labels and save it as a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns in *features_cols* and a 'Cluster' column.
    features_cols : list
        Names of the feature columns used as predictors.
    output_tree : str
        Path of the PNG file to write.
    printador
        Not used by this function; kept so existing callers keep working.
    """
    clf = DecisionTreeClassifier(max_depth=None, random_state=0, criterion='entropy')
    clf.fit(df[features_cols], df['Cluster'])

    # Take the class names from the fitted model so every node is labelled
    # with the class it really predicts, even when some groups are absent.
    class_names = [str(label) for label in clf.classes_]

    dot_data = tree.export_graphviz(clf, feature_names=features_cols, class_names=class_names,
                                    out_file=None, filled=True, rounded=False,
                                    special_characters=True)

    graph = pydotplus.graph_from_dot_data(dot_data)
    colors = ('yellow', 'orange', 'green', 'lightblue', 'pink')

    for node in graph.get_node_list():
        # Only numerically named nodes are tree nodes; this skips the 'node'
        # and 'edge' default entries and any other pseudo node.
        node_name = node.get_name().strip('"')
        if not node_name.isdigit():
            continue

        values = clf.tree_.value[int(node_name)][0]
        # color only nodes where only one class is present
        if max(values) == sum(values):
            # Wrap around so more classes than colors cannot raise IndexError.
            node.set_fillcolor(colors[np.argmax(values) % len(colors)])
        # mixed nodes get the default color
        else:
            node.set_fillcolor("white")

    graph.write_png(output_tree)