In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import math
import scipy.stats as st
import random
import itertools
import datetime
import time
import statsmodels.api as sm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import cluster, metrics, manifold, decomposition, preprocessing
from sklearn import manifold, decomposition
from sklearn.cluster import KMeans, MiniBatchKMeans
from scipy.optimize import linear_sum_assignment
from matplotlib.collections import LineCollection
from matplotlib import colors
from itertools import chain
from matplotlib.lines import Line2D

from PIL import Image, ImageOps 
from skimage.feature import hog, ORB, match_descriptors

In [14]:
def best_perplexity(ref, X, model, perp_min = 5, perp_max = 105, step = 5, n_clusters = 7, long = 3.5, larg = 2):
    """Scan a grid of t-SNE perplexities and return the one with the best ARI.

    For every perplexity in ``np.arange(perp_min, perp_max, step)`` the data
    are embedded in 2-D with t-SNE, the embedding is clustered with K-Means,
    and the resulting labels are compared with ``ref`` through the adjusted
    Rand index (rounded to 5 decimals). The ARI-vs-perplexity curve is drawn
    on a new matplotlib figure; the figure is not explicitly shown here.

    Parameters
    ----------
    ref : array-like
        Reference labels, passed to ``metrics.adjusted_rand_score``.
    X : array-like
        Features passed to ``TSNE.fit_transform``.
    model : str
        Name inserted in the plot title.
    perp_min, perp_max, step : number
        Bounds (``perp_max`` excluded) and step of the perplexity grid.
    n_clusters : int
        Number of K-Means clusters.
    long, larg : float
        Figure width and height, passed as ``figsize=(long, larg)``.

    Returns
    -------
    The perplexity with the highest ARI (the first one in case of a tie).

    Raises
    ------
    ValueError
        If the perplexity grid is empty.

    NOTE(review): recent scikit-learn releases rename TSNE's ``n_iter`` to
    ``max_iter`` -- confirm against the installed version.
    """
    liste_perp = np.arange(perp_min, perp_max, step).tolist()

    # Fail early with a clear message instead of an opaque ``max()`` error
    # after an empty figure has already been created.
    if not liste_perp:
        raise ValueError(
            f"Empty perplexity grid: perp_min={perp_min}, perp_max={perp_max}, step={step}"
        )

    ARI = []

    # Iterate over the same list that is plotted so that the x values and the
    # scores can never get out of sync.
    for perp in liste_perp:
        tsne = manifold.TSNE(n_components=2, perplexity = perp, init='random',
                             learning_rate = 200, n_iter = 5000, random_state = 0)

        X_tsne = tsne.fit_transform(X)

        k_means = KMeans(n_clusters = n_clusters, init = "k-means++", random_state = 0).fit(X_tsne)

        ARI.append(np.round(metrics.adjusted_rand_score(ref, k_means.labels_),5))

    plt.figure(figsize=(long, larg))

    plt.plot(liste_perp, ARI, marker = "o", color = "silver")

    plt.xlabel("Perplexité", fontsize = 10)
    plt.ylabel("Score ARI", fontsize = 10)

    plt.xticks(fontsize = 9)
    plt.yticks(fontsize = 9)

    plt.title(f"Distribution du score ARI selon \nla perplexité ({model})", fontsize = 12)

    # ``list.index`` returns the first occurrence, so ties go to the smallest
    # perplexity.
    best_p = liste_perp[ARI.index(max(ARI))]

    return best_p

In [9]:
def score_ARI(model, perp, X, ref, n_clusters = 7):
    """Embed ``X`` with t-SNE, cluster the embedding and report the ARI.

    Parameters
    ----------
    model : str
        Name printed in the report line.
    perp : number
        Perplexity passed to t-SNE.
    X : array-like
        Features passed to ``TSNE.fit_transform``.
    ref : array-like
        Reference labels, passed to ``metrics.adjusted_rand_score``.
    n_clusters : int
        Number of K-Means clusters.

    Returns
    -------
    tuple
        ``(ARI, X_tsne, lab)``: the adjusted Rand index rounded to 5 decimals,
        the 2-D t-SNE embedding, and the K-Means prediction on that embedding.
    """
    embedder = manifold.TSNE(
        n_components=2,
        perplexity=perp,
        init='random',
        learning_rate=200,
        n_iter=10000,
        random_state=0,
    )
    embedding = embedder.fit_transform(X)

    clusterer = KMeans(n_clusters=n_clusters, init="k-means++", random_state=0)
    clusterer.fit(embedding)

    ari_value = np.round(metrics.adjusted_rand_score(ref, clusterer.labels_), 5)

    # Frame the score between two separator lines.
    separator = "================================================================================================="
    print(separator)
    print(f"Pour le modèle {model}, le score ARI est : {ari_value}")
    print(separator)

    predicted = clusterer.predict(embedding)

    return ari_value, embedding, predicted

In [10]:
def graph_t_sne(model, perp, X, ref, n_clusters = 7, long = 10, larg = 2):
    """Plot the t-SNE embedding twice: by predicted cluster and by reference label.

    The embedding, the K-Means labels and the ARI come from ``score_ARI``,
    which also prints the score. The left panel colours the points by
    predicted cluster, the right panel by ``ref``.

    Parameters
    ----------
    model : str
        Name used in the panel titles and in the printed report.
    perp : number
        t-SNE perplexity forwarded to ``score_ARI``.
    X : array-like
        Features forwarded to ``score_ARI``.
    ref : array-like
        Reference labels (used for the ARI and for the right panel).
    n_clusters : int
        Number of K-Means clusters, forwarded to ``score_ARI``.
    long, larg : float
        Figure width and height, passed as ``figsize=(long, larg)``.

    Returns
    -------
    tuple
        ``(ARI, lab_model)``: the score and the predicted cluster labels.
    """
    # Forward the caller's n_clusters; it used to be hard-coded to 7 here,
    # which silently ignored the argument.
    ARI, X_tsne, lab_model = score_ARI(model, perp, X, ref, n_clusters = n_clusters)

    # (hue values, wording used in the panel title)
    panels = [(lab_model, "prédictions"), (ref, "catégories")]

    plt.figure(figsize=(long, larg))

    for position, (labels, caption) in enumerate(panels, start = 1):
        plt.subplot(1, 2, position)

        sns.scatterplot(x = X_tsne[:,0],
                        y = X_tsne[:,1],
                        hue = labels,
                        legend = 'full',
                        palette="deep")

        plt.xlabel("Variable t-SNE 1", fontsize = 10)
        plt.ylabel("Variable t-SNE 2", fontsize = 10)

        plt.xticks(fontsize = 9)
        plt.yticks(fontsize = 9)

        plt.legend(frameon = False, bbox_to_anchor=(1,1), fontsize = 9)

        plt.title(f"Segmentation avec {model} \n(labels = {caption})", fontsize = 12)

    # Wide horizontal gap so the left panel's legend does not overlap the
    # right panel.
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.99,
                        wspace=0.6,
                        hspace=0)

    plt.show()

    return ARI, lab_model

In [11]:
def confusion_matrix(ref_series, model_series):
    """Cross-tabulate reference vs. predicted labels and align clusters to classes.

    A contingency table with margins is built (rows: ``ref_series``, columns:
    ``model_series``). The rows and columns are then reordered with the
    Hungarian algorithm so that the matched (row, column) pairs, which
    maximise the total count, sit on the diagonal. The ``All`` margins stay
    in last position.

    Parameters
    ----------
    ref_series : Series or array-like
        Reference labels.
    model_series : Series or array-like
        Predicted cluster labels.

    Returns
    -------
    pandas.DataFrame
        The reordered contingency table, margins included. When the number of
        classes and clusters differ, unmatched rows/columns are kept and
        placed after the matched ones.
    """
    table = pd.crosstab(ref_series, model_series,
                        rownames = ['Données'], colnames = ['Modelisé'], margins=True)

    # Positions of the "All" margins; previously hard-coded to 7, which only
    # worked for exactly 7 classes and 7 clusters.
    margin_row = table.shape[0] - 1
    margin_col = table.shape[1] - 1

    # Negate the counts: linear_sum_assignment minimises the cost, we want
    # the assignment with the largest overlap.
    counts = table.iloc[:-1, :-1].to_numpy()
    row_idx, col_idx = linear_sum_assignment(-counts)
    row_idx = row_idx.tolist()
    col_idx = col_idx.tolist()

    # For a non-square table only min(n_rows, n_cols) pairs are matched;
    # keep the remaining rows/columns instead of dropping them.
    matched_rows = set(row_idx)
    matched_cols = set(col_idx)
    row_idx.extend(i for i in range(margin_row) if i not in matched_rows)
    col_idx.extend(j for j in range(margin_col) if j not in matched_cols)

    row_idx.append(margin_row)
    col_idx.append(margin_col)

    return table.iloc[row_idx, col_idx]

In [12]:
def graph_matrix(ref_series, model_series, model, long_fig = 10, haut_fig = 4):
    """Draw the aligned confusion matrix as a heat map of row percentages.

    The table comes from ``confusion_matrix``. Each cell is expressed as a
    percentage of its row total (the ``All`` column), so every row of the
    heat map sums to 100.

    Parameters
    ----------
    ref_series : Series or array-like
        Reference labels (rows of the matrix, drawn on the y axis).
    model_series : Series or array-like
        Predicted cluster labels (columns of the matrix, drawn on the x axis).
    model : str
        Name inserted in the figure title.
    long_fig, haut_fig : float
        Figure width and height, passed as ``figsize``.

    Notes
    -----
    The tick labels are a fixed list of 7 category names.
    NOTE(review): this assumes a 7x7 matrix whose rows follow the order of
    these names -- confirm against the caller's label encoding.
    """
    mat = confusion_matrix(ref_series, model_series)

    categ_x = ["Baby Care", "Beauty", "Computers", "Home Decor",
               "Home Furnishing", "Kitchen Dining", "Watches"]

    categ_y = ["Baby\nCare", "Beauty", "Computers", "Home\nDecor",
               "Home\nFurnishing", "Kitchen\nDining", "Watches"]

    plt.figure(figsize=(long_fig, haut_fig))

    # Row percentages of the inner table (margins excluded); same arithmetic
    # order as count * 100 / row_total.
    pct = mat.iloc[:-1, :-1].mul(100).div(mat["All"].iloc[:-1], axis=0)

    # ``plt.cm.get_cmap`` was deprecated and later removed from matplotlib;
    # ``plt.get_cmap`` takes the same (name, lut) arguments.
    cmap = plt.get_cmap('Blues', 10)
    boundaries = np.arange(0,110,10)
    norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True)

    # fignum=False draws into the figure created above.
    plt.matshow(pct, cmap = cmap, fignum = False, norm = norm)

    # matshow puts matrix rows on the y axis and columns on the x axis.
    for (row, col), value in np.ndenumerate(pct.to_numpy()):
        plt.text(col, row, f"{value:.1f}", va="center", ha="center", fontsize = 9.5,
                 bbox=dict(boxstyle='round', facecolor='white', edgecolor='1'))

    plt.xticks(range(len(categ_x)), categ_x, fontsize=10, rotation=45)
    plt.tick_params(axis="x", bottom=True, top=False, labelbottom=True, labeltop=False)
    plt.yticks(range(len(categ_y)), categ_y, fontsize=10)

    # Columns (x) are the model's clusters and rows (y) the reference data;
    # the two axis titles were previously swapped.
    plt.xlabel("Modélisation", fontsize=14, labelpad=10)
    plt.ylabel("Données", fontsize=14, labelpad=10)

    plt.colorbar(fraction=0.046, pad=0.02).ax.tick_params(labelsize=10)

    plt.title(f"Matrice de confusion\n({model})", fontsize=14)

    plt.show()