In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%load_ext autotime
import os
import sys
sys.path.insert(0, os.path.abspath('../src/'))
from difflib import SequenceMatcher
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AffinityPropagation
import distance
import random
from itertools import repeat 
from sklearn.cluster import DBSCAN
from numpy import nan
import jellyfish
from scipy.spatial import distance
import utils
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer



In [None]:
import numpy as np
from scipy.sparse import linalg, eye, csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances
from collections import defaultdict



class MarkovClustering:
    """Markov clustering of the rows of a similarity matrix.

    ``fit`` adds self loops to the matrix ``self.T`` and then repeats
    row-normalisation, expansion (matrix power) and inflation (element-wise
    power with thresholding) until an iteration leaves the matrix unchanged
    or the iteration limit is reached. Rows whose non-zero entries sit in
    the same set of columns receive the same cluster label.
    """

    def __init__(self, matrix, metric="cosine", bias=1):
        """
        Initializing similarity matrix
        Either by setting it (metric = None)
        Or by deriving it from a distance function (similarity = bias-distance)

        Parameters
        ----------
        matrix : used as the similarity matrix itself when ``metric`` is
            None, otherwise passed to ``pairwise_distances``.
        metric : metric forwarded to ``pairwise_distances``, or None.
        bias : value the pairwise distances are subtracted from.
        """
        # Cluster label of every row; stays None until fit() has run.
        self.labels_ = None
        if metric is None:
            self.T = matrix
        else:
            self.T = csr_matrix(bias - pairwise_distances(matrix, metric=metric))

    def normalize(self):
        """Rescale every row of ``self.T`` with the L1 norm (``axis=1``)."""
        self.T = normalize(self.T, norm='l1', axis=1)

    def self_loops(self, weight=0.01):
        """Add ``weight`` times the identity matrix to ``self.T``."""
        self.T = eye(self.T.shape[0]) * weight + self.T

    def expansion(self, p=2):
        """Replace ``self.T`` by its ``p``-th matrix power.

        The power is built by repeated multiplication, so ``p <= 1`` leaves
        the matrix unchanged.
        """
        ret = self.T
        for _ in range(1, p):
            # `@` is a matrix product for sparse matrices, sparse arrays and
            # ndarrays alike, whereas `*` is element-wise for the latter two.
            ret = ret @ self.T
        self.T = ret

    def inflation(self, p=2, th=1e-10):
        """Zero the stored entries below ``th`` and raise the others to ``p``.

        Works in place on ``self.T.data``. Entries that end up as zero are
        dropped from the sparse structure afterwards so that later
        multiplications do not keep carrying them.
        """
        data = self.T.data
        below = data < th  # decided before any value is modified
        data[~below] = data[~below] ** p
        data[below] = 0
        if hasattr(self.T, "eliminate_zeros"):
            self.T.eliminate_zeros()

    def fit(self, inflation_power=2, inflation_threshold=1e-10, self_loops_weight=0.01, expansion_power=2, iteration_limit=100,verbose=False):
        """Run the clustering iterations and store the labels in ``labels_``.

        Parameters
        ----------
        inflation_power, inflation_threshold : forwarded to ``inflation``.
        self_loops_weight : forwarded to ``self_loops`` (applied once,
            before the first iteration).
        expansion_power : forwarded to ``expansion``.
        iteration_limit : maximum number of iterations.
        verbose : when true, print the dense matrix after every iteration.

        Returns
        -------
        self
        """
        iterations = 0
        # Start from an all-zero matrix so the first comparison differs.
        prev_T = csr_matrix(self.T.shape)
        self.self_loops(self_loops_weight)
        # Stop once an iteration reproduces its input exactly, i.e. the
        # difference has no non-zero entry left.
        while (iterations < iteration_limit) and ((prev_T - self.T).nnz != 0):
            prev_T = self.T
            iterations += 1
            self.normalize()
            self.expansion(expansion_power)
            self.inflation(inflation_power, inflation_threshold)
            if verbose:
                print ("========Iteration #{i}=======".format(i=iterations))
                print(self.T.toarray())
        self.labels_ = self.extract_labels()
        return self

    def extract_labels(self):
        """Return one integer label per row of ``self.T``.

        Two rows share a label exactly when their non-zero entries occupy
        the same set of columns. Labels are numbered 0, 1, 2, ... in order
        of first appearance, so the result is identical on every run.
        """
        M = self.T.tocoo()
        # Row index -> set of column indices holding a non-zero value.
        rows = defaultdict(set)
        for r, c, value in zip(M.row, M.col, M.data):
            if value != 0:
                rows[int(r)].add(int(c))
        label_of = {}
        labels = []
        for i in range(M.shape[0]):
            signature = tuple(sorted(rows.get(i, ())))
            # A signature not seen before gets the next free label.
            labels.append(label_of.setdefault(signature, len(label_of)))
        return labels

    def clusters(self, labels=None):
        """Group the rows by cluster label; requires ``fit`` to have run.

        Parameters
        ----------
        labels : optional sequence indexed by row; when given, its entries
            are collected instead of the row indices.

        Returns
        -------
        defaultdict mapping each cluster label to a set of members.
        """
        ret = defaultdict(set)
        for i, c in enumerate(self.labels_):
            if labels is None:
                ret[c].add(i)
            else:
                ret[c].add(labels[i])
        return ret

In [None]:
# Location of the CSV file to analyse (a path under the user's home directory).
Data_file = "~/Downloads/echantillon.csv"

# Read the CSV into a DataFrame; the following cell uses its "MARQUE" column.
df = pd.read_csv(Data_file)

In [None]:
# Values of the "MARQUE" column; `words` holds one entry per data row.
df_names = np.asarray(df["MARQUE"])
words = np.asarray(df_names)
# Sorted distinct values and their occurrence counts, index-aligned:
# unique_counts[k] is the number of rows equal to unique_words[k].
df_unique, unique_counts = np.unique(df_names, return_counts=True)
unique_words = np.asarray(df_unique)

# Token counts per distinct value, re-weighted by TfidfTransformer with IDF
# disabled. NOTE(review): max_df / min_df are floats, which scikit-learn
# reads as document-frequency proportions -- confirm the bounds are intended.
X = CountVectorizer(max_df=10**-2, min_df=10**-7).fit_transform(unique_words)
X = TfidfTransformer(use_idf=False).fit_transform(X)

# Cluster the distinct values; every item of list_cluster is a pair
# (cluster label, set of indices into unique_words).
T = MarkovClustering(X)
list_cluster = list(T.fit().clusters().items())

# Parallel result lists: row positions in `words` whose value is a minority
# variant inside its cluster, and the majority variant proposed for each.
index_incorrect_words = []
correct_words = []

# NOTE(review): clusters are built over unique_words, yet their number is
# compared with len(words); confirm which length is meant here.
if len(list_cluster) == len(words):
    print("all is unique")
else:
    for _, members in list_cluster:
        member_idx = list(members)
        if len(member_idx) <= 1:
            continue  # a singleton cluster has nothing to reconcile
        # The cluster members are already indices into unique_words, so
        # their counts can be read directly from unique_counts.
        member_words = unique_words[member_idx]
        member_counts = unique_counts[member_idx]
        top_count = member_counts.max()
        # On a tie, argmax keeps the first member with the highest count.
        majority_word = member_words[np.argmax(member_counts)]
        for w, count_w in zip(member_words, member_counts):
            if count_w < top_count:
                row_positions = np.where(words == w)[0].tolist()
                index_incorrect_words += row_positions
                correct_words += [majority_word] * len(row_positions)



In [None]:
# Shape of the selection of `words` at the positions in index_incorrect_words.
words[index_incorrect_words].shape

In [None]:
# Number of entries collected in correct_words.
len(correct_words)

In [None]:
# Same path and load as in the earlier data-loading cell: running this cell
# reads the CSV again and rebinds `df` to the freshly loaded DataFrame.
Data_file = "~/Downloads/echantillon.csv"

df = pd.read_csv(Data_file)