In [1]:
import sys

sys.path.append("/home/jupyter/detection_doublons/src") # for adding the directory "src"
from mesures import *
from custom_processing import *
from visualization import *
from custom_hierachical_clustering import *
import pickle

import copy
import re
import os

from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import davies_bouldin_score
from sklearn.model_selection import ParameterGrid

import numpy as np
import pandas as pd

from typing import Callable, Tuple, Union
from time import time
from tqdm import tqdm

import networkx as nx
import gravis as gv
import math
import random
from google.cloud import bigquery


### Loading the data

In [2]:
# Read the raw CSV once and keep it untouched; later cells work on an
# independent copy so the original stays available for comparison.
original_data = pd.read_csv(
    "/home/jupyter/detection_doublons/data/gold_fornissor2.csv"
)
data = original_data.copy()
data.shape

(38076, 9)

In [3]:
# List the column names of the working copy.
data.columns

Index(['tiern_location_state_city', 'tiern_name', 'tiern_plant', 'id',
       'confidence_country', 'Latitude', 'Longitude', 'Country',
       'confidence_city'],
      dtype='object')

### First step processing

In [4]:
# Identifier, confidence and coordinate columns are handed to the processor
# through its `drop_cols` argument.
cols_to_drop = [
    "id",
    "confidence_country",
    "confidence_city",
    "Latitude",
    "Longitude",
]
custom_processing = Custom_Processing(dataframe=data, drop_cols=cols_to_drop)
custom_processing.fit_transform()

100%|██████████| 1/1 [00:00<00:00, 23.84it/s]
100%|██████████| 1/1 [00:00<00:00, 11.01it/s]


In [5]:
# Shape of the input frame once the first processing step has run.
data.shape

(38076, 9)

In [6]:
# Number of rows in the duplicates dataframe produced by the first processing step.
custom_processing.duplicates_dataframe.shape[0]

6723

In [7]:
# `cont_figure_dataframe` partition: the preprocessed name is used as the
# suggested name, and repeated (name, city, country) rows are tagged.
numbers1 = custom_processing.cont_figure_dataframe
numbers1["suggested_name"] = numbers1["tiern_name_preprocessed"]
numbers1.loc[
    numbers1.duplicated(subset=['suggested_name', 'tiern_location_state_city', 'Country']),
    'category',
] = 'duplicates'

# Duplicates found by the processor: only the name column is renamed.
duplication1 = custom_processing.duplicates_dataframe.rename(
    columns={"tiern_name_preprocessed": "suggested_name"}
)

# Special-character rows are kept as-is; they are translated in a later step.
special1 = custom_processing.special_char_dataframe

# `others_dataframe` partition: rename, then tag repeated rows the same way.
others1 = custom_processing.others_dataframe.rename(
    columns={"tiern_name_preprocessed": "suggested_name"}
)
others1.loc[
    others1.duplicated(subset=['suggested_name', 'tiern_location_state_city', 'Country']),
    'category',
] = 'duplicates'

# Remaining rows after the first processing step.
processed1 = custom_processing.dataframe

#### Checking
Check that no data points are lost during processing

In [8]:
# The five partitions together must account for every input row.
assert sum(
    len(part) for part in (numbers1, duplication1, special1, others1, processed1)
) == len(data), "Some data points have been losen during processing"

### Second step processing
#### Translation 
The special-characters dataset contains tiern names written in languages other than English.<br>
To use this part of the data in the training process, we translate those names into French and then concatenate the result with the previously processed dataset, which 
does not contain duplicates. 

In [9]:
# Point both HTTP and HTTPS traffic at the same proxy; the lower-case and
# upper-case spellings of each variable are all set.
os.environ.update(
    dict.fromkeys(
        ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"),
        "http://iac-proxy.cnz.renault.gcp:80",
    )
)

In [10]:
# Remove the preprocessed-name column computed before translation, then
# translate the two text columns.
special1 = special1.drop(columns=["tiern_name_preprocessed"])
special1 = translation(
    special1,
    special_char_cols=["tiern_name", "tiern_plant"],
)
special1.shape

100%|██████████| 2/2 [00:35<00:00, 17.88s/it]


(331, 4)

#### Processing2

In [11]:
# Run the same processing class on the translated rows (no `drop_cols` is passed here).
custom_processing1 = Custom_Processing(dataframe=special1)
custom_processing1.fit_transform()

100%|██████████| 1/1 [00:00<00:00, 429.48it/s]
100%|██████████| 1/1 [00:00<00:00, 293.70it/s]


#### Checking

In [12]:
# Second-pass partitions, built from the processor that ran on the translated rows.
numbers2 = custom_processing1.cont_figure_dataframe
# The duplicate check below keys on `suggested_name`. That column is created by
# hand for `numbers1` and `special2`, so create it here too; without it
# `duplicated` raises KeyError as soon as `numbers2` is non-empty.
if "suggested_name" not in numbers2.columns and "tiern_name_preprocessed" in numbers2.columns:
    numbers2["suggested_name"] = numbers2["tiern_name_preprocessed"]
numbers2.loc[numbers2.duplicated(['suggested_name', 'tiern_location_state_city', 'Country']), 'category'] = 'duplicates'

# Duplicates found by the processor: only the name column is renamed.
duplication2 = custom_processing1.duplicates_dataframe
duplication2 = duplication2.rename(columns={"tiern_name_preprocessed": "suggested_name"})

# Rows still holding special characters after translation: labelled 'other',
# except repeated (name, city, country) rows, which are labelled 'duplicates'.
special2 = custom_processing1.special_char_dataframe
special2["suggested_name"] = special2["tiern_name_preprocessed"]
special2["category"] = 'other'
special2.loc[special2.duplicated(['suggested_name', 'tiern_location_state_city', 'Country']), 'category'] = 'duplicates'

# `others_dataframe` partition: rename, then tag repeated rows.
others2 = custom_processing1.others_dataframe
others2 = others2.rename(columns={"tiern_name_preprocessed": "suggested_name"})
others2.loc[others2.duplicated(['suggested_name', 'tiern_location_state_city', 'Country']), 'category'] = 'duplicates'

# Remaining rows after the second processing step.
processed2 = custom_processing1.dataframe

In [13]:
# The five second-pass partitions must account for every translated row.
assert sum(
    len(part) for part in (numbers2, duplication2, special2, others2, processed2)
) == len(special1), "Some data points have been losen during processing"

#### Defining the reference dataset
This is the dataframe in which occurrences of the tiern names from the processed data will be looked up

In [14]:
# Stack the reference frames of both processing passes, first pass on top.
ref = pd.concat(
    [custom_processing.ref_dataframe, custom_processing1.ref_dataframe],
    axis=0,
)
ref.shape

(36688, 6)

#### Train data
Forming the training dataset. It consists of the first processed dataset and the second one (after translating it to the correct language and processing it too).

In [15]:
# Training rows: first-pass rows followed by the translated second-pass rows.
# Index labels are kept as they are in the two inputs.
train_data = pd.concat([processed1, processed2])
train_data.shape

(27746, 6)

There can be some duplicates within the training dataset after concatenation, because after translation some data points can be present in both parts.<br>
Those duplicates should therefore be dropped to obtain a clean training dataset

In [16]:
# Split the concatenated training rows into first occurrences (kept in
# `train_data`) and later repeats (moved to `duplication3`).
#
# The split uses a positional boolean mask instead of `drop(index=...)`:
# `train_data` comes from `pd.concat` without re-indexing, and dropping by label
# would remove every row sharing a label with a kept row if labels ever repeat.
dedup_keys = ['tiern_location_state_city', 'Country', 'tiern_name_preprocessed']
is_repeat = train_data.duplicated(subset=dedup_keys).to_numpy()

# `.copy()` so that tagging the repeats never writes into `train_data`.
duplication3 = train_data.loc[is_repeat].copy()
duplication3["category"] = "duplicates"

train_data = train_data.loc[~is_repeat]
train_data.shape

(27736, 6)

In [17]:
# Display the rows removed from the training data as duplicates.
duplication3

Unnamed: 0,tiern_location_state_city,tiern_name,tiern_plant,Country,tiern_name_preprocessed,chars,category
58,hubei,baowu steel,wuhan factory,china,wuhan factory baowu steel,wuhanfactorybaowusteel,duplicates
65,hubei,shiyan stands tall,shiyan stands tall,china,shiyan stands tall,shiyanstandstall,duplicates
73,hubei,anhui kairui,shiyan factory,china,shiyan factory anhui kairui,shiyanfactoryanhuikairui,duplicates
154,wuhan,wuhan blonde,wuhan blonde,china,blonde,blonde,duplicates
1814,shanghai,baosteel,baosteel,china,baosteel,baosteel,duplicates
20142,moscow,npp itelma,npp itelma,russia,npp itelma,nppitelma,duplicates
20449,magnitogorsk,mmk,mmk,russia,mmk,mmk,duplicates
12156,guangrong,bocan bocan,bocan bocan,china,bocan,bocan,duplicates
20443,magnitogorsk,techinveststroy mmk,mmk,russia,mmk techinveststroy,mmktechinveststroy,duplicates
24440,yelabuga,mmk coskunoz,mmk coskunoz,russia,mmk coskunoz,mmkcoskunoz,duplicates


#### Training
During the training process some duplicates can be detected, and there can also be clusters which contain only a single data point. <br>
For the purpose of suggesting the correct name within a cluster, all single-point clusters are saved in one dataframe, and all detected duplicates are saved in a separate dataframe.

In [None]:
# Threshold handed to the clustering model; it is also embedded in the output
# file names (with '.' replaced by '_').
distance_threshold = 0.1
model_city = Agglomerative_Hierachical_Clustering(
    distance_threshold=distance_threshold,
    linkage="average",
    metric=damerau_levenshtein_distance,
    normalize_metric=True,
)

# Two CSV paths passed to `fit_per_city`; later cells read `path1` back as the
# duplicates frame and `path2` as the singles frame.
path1 = f"/home/jupyter/detection_doublons/data/final2/tatal_{str(distance_threshold).replace('.', '_')}_doublons_.csv"
path2 = f"/home/jupyter/detection_doublons/data/final2/tatal_{str(distance_threshold).replace('.', '_')}_singles_.csv"

model_city.fit_per_city(train_data, path1=path1, path2=path2)



  0%|          | 0/2753 [00:00<?, ?it/s][A
  1%|          | 32/2753 [00:00<00:48, 56.63it/s][A
  2%|▏         | 64/2753 [00:01<01:16, 35.28it/s][A
  3%|▎         | 96/2753 [00:03<01:37, 27.36it/s][A
  5%|▍         | 128/2753 [00:04<01:27, 30.11it/s][A
  6%|▌         | 160/2753 [00:05<01:38, 26.27it/s][A
  7%|▋         | 192/2753 [00:07<01:43, 24.84it/s][A
  8%|▊         | 224/2753 [00:08<01:44, 24.18it/s][A
  9%|▉         | 256/2753 [00:09<01:45, 23.64it/s][A
 10%|█         | 288/2753 [00:11<01:46, 23.10it/s][A
 12%|█▏        | 320/2753 [00:12<01:40, 24.28it/s][A
 13%|█▎        | 352/2753 [00:13<01:39, 24.05it/s][A
 14%|█▍        | 384/2753 [00:15<01:38, 24.16it/s][A
 15%|█▌        | 416/2753 [00:16<01:34, 24.84it/s][A
 16%|█▋        | 448/2753 [00:17<01:31, 25.16it/s][A
 17%|█▋        | 480/2753 [00:18<01:31, 24.92it/s][A
 19%|█▊        | 512/2753 [00:20<01:29, 24.93it/s][A
 20%|█▉        | 544/2753 [00:21<01:27, 25.30it/s][A
 21%|██        | 576/2753 [00:22<01:27, 

In [None]:
# Read back the two CSV files whose paths were given to `fit_per_city`.
dataframe = pd.read_csv(path1)
single_dataframe = pd.read_csv(path2)

# First row of each (country, city, duplication_id) combination.
dataframe1 = dataframe.drop_duplicates(subset=['country', 'city', 'duplication_id'])

# Use the preprocessed name as the initial suggested name.
dataframe["suggested_name"] = dataframe["tiern_name_preprocessed"]
dataframe.shape

In [None]:
# Distinct sizes of the (country, city, duplication_id) groups.
# The sizes are computed here rather than read from `grouped`: that variable is
# only created in the following cell, so reading it here fails on a fresh kernel.
np.unique(dataframe.groupby(['country', 'city', 'duplication_id']).size())

In [None]:
# Size of every (country, city, duplication_id) group.
grouped = (
    dataframe
    .groupby(['country', 'city', 'duplication_id'])
    .size()
    .reset_index(name='counts')
)
max_count = grouped['counts'].max()

# Groups with 3, 4 or 5 members, then every row whose `duplication_id` appears in one of them.
# NOTE(review): the row filter matches on `duplication_id` alone, not on the
# full (country, city, duplication_id) key — confirm ids are unique across cities.
max_groups = grouped.loc[grouped['counts'].isin([3, 4, 5])]
result = dataframe.loc[dataframe['duplication_id'].isin(max_groups['duplication_id'])]
result.tail(10)

In [None]:
# Singles plus clustered duplicates must add up to the training rows.
assert len(single_dataframe) + len(dataframe) == len(train_data), (
    f"single_dataframe.shape[0] + dataframe.shape[0] = {single_dataframe.shape[0] + dataframe.shape[0]} is differente from train_data.shape[0] = {train_data.shape[0]}, whiche is abnormal"
)

#### Processing the duplication dataframe
##### Occurrence determination


In [None]:
# Run the occurrence lookup on every clustered row, not only on the 3-5 member
# subset held in `result`: the assertions further down require
# `inter_dataframe` to have exactly as many rows as `dataframe`.
inter_dataframe = occurrences(
    ref_dataframe=ref,
    processing_dataframe=dataframe,
    col='tiern_name_preprocessed',
)

In [None]:
# Peek at the last rows returned by `occurrences`.
inter_dataframe.tail(10)

In [None]:
# Column names of the reference dataframe.
ref.columns

##### Making suggestion 

In [None]:
# Apply `suggest_group_name` to each `duplication_id` group and flatten the
# result back to a plain 0..n-1 index.
suggested_dataframe = (
    inter_dataframe
    .groupby("duplication_id")
    .apply(suggest_group_name, tiern_name='tiern_name_preprocessed')
    .reset_index(drop=True)
)

In [None]:
# Last rows whose `occurrence` value is greater than 1.
suggested_dataframe.loc[suggested_dataframe['occurrence'].gt(1)].tail(10)

In [None]:
# Peek at the last rows of the suggestion result.
suggested_dataframe.tail(10)

In [None]:

# Neither the occurrence step nor the suggestion step may add or lose rows.
assert len(inter_dataframe) == len(dataframe), (
    f"The occurence dataframe must have the same size as the duplication dataframe, but one have inter_dataframe.shape[0] = {inter_dataframe.shape[0]} and  dataframe.shape[0] = { dataframe.shape[0]}"
)
assert len(suggested_dataframe) == len(dataframe), (
    f"The suggested dataframe must have the same size as the duplication dataframe, but one have suggested_dataframe.shape[0] = {suggested_dataframe.shape[0]} and  dataframe.shape[0] = { dataframe.shape[0]}"
)

### final processing

In [None]:
# Drop the two helper columns and label every remaining row as processed.
final_dataframe = suggested_dataframe.drop(columns=["duplication_id", "occurrence"])
final_dataframe["category"] = "processed"
final_dataframe

In [None]:
# Dropping columns must not change the number of rows.
assert len(final_dataframe) == len(dataframe), (
    f"final_dataframe must have the same size as the duplication dataframe, but one have final_dataframe.shape[0] = {final_dataframe.shape[0]} and  dataframe.shape[0] = { dataframe.shape[0]}"
)

In [None]:
# First occurrence of each (country, city, suggested_name) stays "processed";
# the rows left over once those are removed are labelled as duplicates.
processed_dataframe4 = final_dataframe.drop_duplicates(
    subset=["country", "city", "suggested_name"]
)
duplication4 = final_dataframe.drop(index=processed_dataframe4.index)
duplication4["category"] = "duplicates"

# Re-read the singles file and label all of its rows as processed.
single_dataframe = pd.read_csv(path2)
single_dataframe["category"] = "processed"

duplication4

In [None]:
# Bring the side partitions to the column names and column order of `single_dataframe`.
# NOTE(review): `numbers2` is not part of either frame built here — confirm it is always empty.
column_renames = {"Country": 'country', "tiern_location_state_city": 'city'}
output_columns = list(single_dataframe.columns)

others = pd.concat([others1, others2, special2]).rename(columns=column_renames)[output_columns]
numbers1 = numbers1.rename(columns=column_renames)[output_columns]


In [None]:
# Move repeated (country, city, suggested_name) rows of `numbers1` and `others`
# into `duplication5`, keeping only first occurrences in the two source frames.
#
# Positional boolean masks are used instead of `drop(index=...)`: `others` is a
# concatenation of three frames, and dropping by label would misplace rows if
# two of them ever shared an index label.
dedup_keys = ["country", "city", "suggested_name"]
numbers1_repeat = numbers1.duplicated(subset=dedup_keys).to_numpy()
others_repeat = others.duplicated(subset=dedup_keys).to_numpy()

duplication5 = pd.concat(
    (numbers1.loc[numbers1_repeat], others.loc[others_repeat]),
    axis=0,
)
duplication5['category'] = "duplicates"

# Rebind rather than mutate in place so no write ever targets a slice.
numbers1 = numbers1.loc[~numbers1_repeat]
others = others.loc[~others_repeat]
duplication5

In [None]:
# Gather every duplicates frame produced so far.
# `duplication1` / `duplication2` carry the name under `suggested_name`, while
# `duplication3` (cut out of `train_data`) only has `tiern_name_preprocessed`.
# Copy that column to `suggested_name` first; otherwise those rows end up with a
# missing suggested name once `tiern_name_preprocessed` is dropped later on.
duplication_dataframe = pd.concat(
    (
        duplication1,
        duplication2,
        duplication3.assign(suggested_name=duplication3["tiern_name_preprocessed"]),
    ),
    axis=0,
)
duplication_dataframe = duplication_dataframe.drop(columns=['chars'])
duplication_dataframe = duplication_dataframe.rename(
    columns={"Country": 'country', "tiern_location_state_city": 'city'}
)
# Same columns, in the same order, as the singles frame.
duplication_dataframe = duplication_dataframe[list(single_dataframe.columns)]
duplication_dataframe = pd.concat((duplication_dataframe, duplication4, duplication5), axis=0)

# Final table: singles, de-duplicated suggestions, side partitions, then all duplicates.
final_processed_dataframe = pd.concat(
    (single_dataframe, processed_dataframe4, others, numbers1, duplication_dataframe),
    axis=0,
)

final_processed_dataframe

In [None]:
# Attach the similarity score implied by the clustering threshold and remove
# the intermediate preprocessed-name column.
final_processed_dataframe = final_processed_dataframe.assign(
    similarity_score=round(1 - distance_threshold, 1)
).drop(columns=["tiern_name_preprocessed"])

In [None]:
# Last rows that have no category assigned.
final_processed_dataframe[final_processed_dataframe["category"].isnull()].tail(20)

In [None]:
# Two city spellings are rewritten explicitly before the generic
# lower-case / strip normalisation is applied.
city_aliases = {
    'Frankfurt (Oder)': 'frankfurt oder',
    'Charleville-Mézières': 'Charleville Mézières',
}
data["tiern_location_state_city"] = (
    data["tiern_location_state_city"]
    .replace(city_aliases)
    .str.lower()
    .str.strip()
)
data["Country"] = data["Country"].str.lower().str.strip()

# Use the same key and coordinate column names as the processed output.
data = data.rename(
    columns={
        "tiern_location_state_city": "city",
        "Country": "country",
        "Latitude": "latitude",
        "Longitude": "longitude",
    }
)

In [None]:
# One coordinate pair per (country, city), taken from the first matching input row.
city_coordinates = data[['country', 'city', 'longitude', 'latitude']].drop_duplicates(
    subset=['country', 'city']
)
# Inner join: rows whose (country, city) has no match in `data` are not kept.
final_processed_dataframe = final_processed_dataframe.merge(
    city_coordinates,
    on=['country', 'city'],
    how='inner',
)
final_processed_dataframe.shape

In [None]:
# Column names of the final table after the coordinate merge.
final_processed_dataframe.columns

In [None]:
# Number of missing values per column in the final table.
final_processed_dataframe.isnull().sum()

In [None]:
# Write the final table next to the clustering outputs; the file name embeds
# the threshold with '.' replaced by '_'.
path = (
    f"/home/jupyter/detection_doublons/data/final2/tatal_{str(distance_threshold).replace('.', '_')}_total.csv"
)
final_processed_dataframe.to_csv(path_or_buf=path, index=False)

In [None]:
# Share of rows labelled as duplicates, in percent.
duplicate_rows = final_processed_dataframe.loc[final_processed_dataframe['category'].eq('duplicates')]
len(duplicate_rows) / len(final_processed_dataframe) * 100

In [None]:
# Number of duplicate rows minus the total number of rows (zero or negative).
duplicate_rows = final_processed_dataframe.loc[final_processed_dataframe['category'].eq('duplicates')]
len(duplicate_rows) - len(final_processed_dataframe)

In [None]:
# The final table must contain exactly one row per input row.
assert len(final_processed_dataframe) == len(data), (
    f"final_processed_dataframe.shape[0] = {final_processed_dataframe.shape[0]} must be equal to data.shape[0] = {data.shape[0]}"
)
final_processed_dataframe.shape

###  Duplication percentage

In [None]:
# Percentage of rows labelled as duplicates, as a display string.
# The denominator is the actual size of the final table rather than a
# hard-coded row count, so the figure stays right whatever the input size.
duplicate_rows = final_processed_dataframe.loc[final_processed_dataframe["category"] == "duplicates", :]
str(round(len(duplicate_rows) / len(final_processed_dataframe) * 100, 2)) + "% of duplicates"

In [None]:
# Thresholds 0.1 .. 0.6 whose saved result files are stacked below.
list_of_distance = [round(0.1 * i, 1) for i in range(1, 7)]

# Dedicated loop names are used on purpose: reusing `distance_threshold`,
# `path` or `dataframe` here would overwrite the values earlier cells rely on
# and make re-running those cells give different results.
per_threshold_frames = []
for threshold in list_of_distance:
    threshold_path = f"/home/jupyter/detection_doublons/data/final/tatal_{str(threshold).replace('.', '_')}_total.csv"
    per_threshold_frames.append(pd.read_csv(threshold_path))

# Concatenate once instead of growing a frame inside the loop.
total = pd.concat(per_threshold_frames, axis=0)

print(total.shape)
# These files use `suggest` and `distance_threshold`; expose them under the
# names used by the rest of the notebook, then drop the originals.
total = total.assign(
    suggested_name=total["suggest"],
    similarity_score=(1 - total["distance_threshold"]).round(1),
).drop(columns=["suggest", "distance_threshold"])

# Sanity check: 58005 rows are expected per threshold file.
total.shape[0] == 58005 * len(list_of_distance)

In [None]:
# Distinct similarity scores present in the stacked table.
np.unique(total['similarity_score'])