In [13]:
import pandas as pd
import glob
import os
import re
import json
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Identifier of the hall whose files are processed; it is interpolated into the file names below.
hall = "1"

# Paths are assembled with os.path.join instead of hand-written Windows literals:
# the previous literals relied on invalid escape sequences ("\e", "\m", "\p"),
# which trigger a warning in recent Python versions, and they only resolved on Windows.
# On Windows the joined paths are identical to the previous ones.
evaluacion_dir = os.path.join("datasets", "evaluacion")
positional_dir = os.path.join(evaluacion_dir, "positional_information")

# Master table, read entirely as strings.
maestra = pd.read_csv(os.path.join(evaluacion_dir, "maestra_filtered.csv")
                    ,sep=","
                    ,dtype=str)

# Output files written by the three stages of this notebook.
stage1_path     = os.path.join(positional_dir, f"4_pred_stage1_hall_{hall}.json")
stage2_path     = os.path.join(positional_dir, f"4_pred_stage2_hall_{hall}.json")
stage3_path     = os.path.join(positional_dir, f"4_pred_stage3_hall_{hall}.json")

# CSV whose 'ngramas' column is used as the vectorizer vocabulary.
vocabulariopath = os.path.join(evaluacion_dir, "annotations", "vocabulario.csv")

# Clustered readings for the selected hall, read entirely as strings.
df      = pd.read_csv(os.path.join(positional_dir, f"3_eval_cluster_hall_{hall}.csv")
                    ,sep=","
                    ,dtype=str)
# Sorted list of the distinct cluster identifiers present in the readings.
clusterlist = sorted(set(df["cluster"]))

In [14]:
# Threshold variables for the readings
# Minimum confidence a reading needs to be accepted by the validate* functions.
price_conf_thres        = 0.90 # these values can be lowered
description_conf_thres  = 0.75
code_conf_thres         = 0.90
# Exact number of characters a code must have to be accepted.
CODELENGTH              = 9
# Maximum cosine distance for two descriptions to stay in the same cluster.
COSINE_DIST_THRESH      = 0.01
# Weights used to combine the three confidences into one weighted score per reading.
dictionary_weights = {"price":0.70,"description":0.25,"code":0.05}



# cluster confidences
# NOTE(review): the first two totals below do not equal the weighted sum under the
# current dictionary_weights (that gives roughly 0.958 and 0.961); they may have been
# computed with different weights - confirm. The third one does match (about 0.95).
# price: 0.983256*0.70
# description: 0.885963*0.25
# code: 0.970569*0.05
# total = 0.923569
# price: 0.979868
# description: 0.905892
# code: 0.980545
# total=0.935698
# price: 0.99999
# description: 0.99999
# code: 0.0
# total=0.95


In [15]:
# Filtering of readings

def validateprice(price,price_conf):
    """Validate and normalise a price reading.

    Args:
        price: raw price text; anything that is not a ``str`` is rejected.
        price_conf: confidence of the reading, compared against ``price_conf_thres``.

    Returns:
        The price formatted as ``"<integer>.<two decimals>"``, or ``None`` when the
        reading is rejected (non-string, low confidence, not exactly one decimal
        point, or a decimal part that does not have one or two digits).
        The integer part is not checked and may come out empty.
    """
    if type(price) is not str:
        return
    if price_conf<price_conf_thres:
        return
    # Exactly one decimal point is required. Readings with several dots used to
    # raise ValueError when unpacking the split; they are now rejected instead.
    if price.count(".") != 1:
        return
    entero,decimal=price.split(".")
    # Drop any non-digit text captured by the rectangular bounding box.
    entero = re.sub(r'\D', '', entero)
    decimal = re.sub(r'\D', '', decimal)
    if not 1 <= len(decimal) <= 2:
        return
    if len(decimal)==1:
        # A trailing zero is appended because the last digit was observed
        # to get lost in some readings.
        decimal+="0"
    return f"{entero}.{decimal}"

def validatedescription(description,description_conf):
    """Validate and clean a description reading.

    Args:
        description: raw description text; anything that is not a ``str`` is rejected.
        description_conf: confidence of the reading, compared against
            ``description_conf_thres``.

    Returns:
        The description with every run of non-word characters replaced by a single
        space, or ``None`` when the reading is rejected.
    """
    if type(description) is not str or description_conf<description_conf_thres:
        return
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    # The compiled pattern is unchanged.
    return re.sub(r'\W+',' ',description)

def validatecode(code,code_conf):
    """Validate a product-code reading.

    Args:
        code: raw code text; anything that is not a ``str`` is rejected.
        code_conf: confidence of the reading, compared against ``code_conf_thres``.

    Returns:
        ``code`` unchanged when it is a numeric string of exactly ``CODELENGTH``
        characters read with enough confidence; ``None`` otherwise.
    """
    # isnumeric must be *called*: the bare method object is always truthy, so the
    # previous check never rejected non-numeric codes.
    if type(code) is not str or not code.isnumeric():
        return
    if code_conf<code_conf_thres or len(code)!=CODELENGTH:
        return
    return code

def filtercluster(cluster_data):
    """Keep only the readings of a cluster whose price and description are valid.

    Args:
        cluster_data: iterable of entries shaped as
            ``[[price, description, code], [price_conf, description_conf, code_conf], original_cluster]``.

    Returns:
        A list with one item per accepted reading:
        ``[original_cluster, weighted_confidence, [price, conf], [description, conf], [code, conf]]``.
        An invalid code does not discard the reading; it is stored as ``None``
        with confidence ``0``.
    """
    accepted = []
    for entry in cluster_data:
        readings = entry[0]
        confidences = entry[1]

        # Price: strip spaces and the currency symbol before validating.
        raw_price = str(readings[0]).replace(" ","").replace("$","")
        price_conf = float(confidences[0])
        price = validateprice(raw_price,price_conf)
        if price is None:
            continue

        # Description.
        description_conf = float(confidences[1])
        description = validatedescription(readings[1],description_conf)
        if description is None:
            continue

        # Cluster the reading originally belonged to.
        origcluster = entry[2]

        # Code is an optional extra: a rejected code only zeroes its confidence.
        code_conf = float(confidences[2])
        code = validatecode(readings[2],code_conf)
        if code is None:
            code_conf = 0

        # Weighted confidence of the whole reading.
        weighted_conf = (price_conf*dictionary_weights["price"]
                         + description_conf*dictionary_weights["description"]
                         + code_conf*dictionary_weights["code"])
        accepted.append([
            origcluster,
            weighted_conf,
            [price,price_conf],
            [description,description_conf],
            [code,code_conf],
        ])
    return accepted

# Functions to extract descriptions from, and reinsert them into, the main data structure
def get_descs(struct):
    """Collect the description text of every reading, in iteration order.

    Args:
        struct: mapping of cluster id to a list of readings shaped as
            ``[origcluster, cw, priceinfo, descinfo, codeinfo]``.

    Returns:
        A flat list with ``descinfo[0]`` of each reading.
    """
    return [
        descinfo[0]
        for readings in struct.values()
        for origcluster, cw, priceinfo, descinfo, codeinfo in readings
    ]

def insert_desc(struct,descs):
    """Return a copy of ``struct`` with the description texts replaced by ``descs``.

    Args:
        struct: mapping of cluster id to a list of readings shaped as
            ``[origcluster, cw, priceinfo, descinfo, codeinfo]``.
        descs: replacement description texts, one per reading, in the same order
            in which the readings are iterated.

    Returns:
        A new dict with the same keys; each reading keeps its description
        confidence but takes its text from ``descs``.
    """
    rebuilt = {}
    position = 0
    for cluster, readings in struct.items():
        updated = []
        for origcluster, cw, priceinfo, descinfo, codeinfo in readings:
            new_descinfo = [descs[position], descinfo[1]]
            updated.append([origcluster, cw, priceinfo, new_descinfo, codeinfo])
            position += 1
        rebuilt[cluster] = updated
    return rebuilt

# Functions to compute similarity
def preprocessing(lista_descripciones, stop_words=['DE','EN', 'LA', 'Y', 'POR', 'EL']):
    """Normalise descriptions before they are vectorised.

    Each description has every run of non-alphanumeric characters replaced by a
    space, is split on single spaces, loses the words listed in ``stop_words``
    (exact, case-sensitive match) and is re-joined WITHOUT any separator.

    Args:
        lista_descripciones: iterable of description strings.
        stop_words: words to drop.

    Returns:
        A list with one processed string per input description.
    """
    lista_terminos = []
    for descripcion in lista_descripciones:
        limpia = re.sub('[^A-Za-z0-9]+', ' ', descripcion)
        palabras = limpia.split(' ')
        lista_terminos.append(''.join(p for p in palabras if p not in stop_words))
    return lista_terminos

def cosine_distance(matrix_rows, matriz_cols):
    """Pairwise cosine distance between the rows of two 2-D matrices.

    Args:
        matrix_rows: array-like of shape (n, d).
        matriz_cols: array-like of shape (m, d).

    Returns:
        A float array of shape (n, m) where entry ``[i, j]`` is
        ``1 - cos(matrix_rows[i], matriz_cols[j])``.
        An all-zero row has no direction; it is treated as having similarity 0
        (distance 1) to everything instead of producing NaN.
    """
    def normalize(matrix):
        matrix = np.asarray(matrix, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Avoid 0/0: a zero row stays zero after "normalisation".
        norms[norms == 0] = 1.0
        return matrix / norms
    matrix_cos = normalize(matrix_rows).dot(normalize(matriz_cols).transpose())
    return 1.0 - matrix_cos

# utils
def save_json(data,path):
    """Write ``data`` to ``path`` as JSON, indented by 4 spaces with sorted keys.

    Args:
        data: JSON-serialisable object.
        path: destination file; it is created or overwritten.
    """
    with open(path, "w") as handle:
        json.dump(data, handle, sort_keys=True, indent=4)

def get_new_cluster(index):
    """Build the identifier for a newly created cluster.

    The identifier is ``len(clusterlist) + index`` rendered as text and left-padded
    with zeros to at least 6 characters.

    Args:
        index: running counter of clusters created so far.

    Returns:
        A tuple ``(cluster_id, index + 1)``; the caller is expected to pass the
        returned counter to the next call.
    """
    number = len(clusterlist) + index
    cluster_id = "{:0>6}".format(str(number))
    return cluster_id, index + 1

In [16]:
# Build the main structure: cluster id -> list of validated readings (stage 1).
D = {}
for cluster in sorted(clusterlist):
    cluster_rows = df[df["cluster"]==cluster]
    # Reshape every row into [values, confidences, original_cluster],
    # which is the layout filtercluster expects.
    raw_readings = []
    for row_index, row in cluster_rows.iterrows():
        raw_readings.append([
            list(row[["price","description","code"]]),
            list(row[["price_conf","description_conf","code_conf"]]),
            row["original_cluster"],
        ])
    data = filtercluster(raw_readings)
    # Clusters left without any valid reading are dropped.
    if data:
        D[cluster]=data
save_json(D,stage1_path)

In [17]:
# Projection of the read descriptions onto the descriptions of the master table (stage 2).
# TODO: readings could be filtered out when the similarity is below some value (changes in D).
# TODO: a Levenshtein check on the code, when present, could be added as a filter.
# str() is applied to every n-gram because some entries are read as NaN
# (presumably an n-gram such as "nan" - confirm).
ngram_column    = pd.read_csv(vocabulariopath,dtype=str)['ngramas']
vocabulario     = [str(ngram) for ngram in ngram_column]
Mdesc           = list(maestra["DESCRIPCION"])
# Character-trigram counter restricted to the fixed vocabulary; the same
# configuration is used for the master table and for the readings.
trigram_counter = CountVectorizer(ngram_range=(3, 3),analyzer='char',vocabulary=vocabulario)
maestradesc     = preprocessing(Mdesc)
maestradescvec  = trigram_counter.fit_transform(maestradesc).toarray()
datadesc        = preprocessing(get_descs(D))
datadescvec     = trigram_counter.fit_transform(datadesc).toarray()
# One row per reading, one column per master description.
matrix_distance = cosine_distance(datadescvec,maestradescvec)
# Each reading takes the master description at the smallest distance.
DESCPROYECTED = []
for distances in matrix_distance:
    nearest = distances.argsort()[0]
    DESCPROYECTED.append(Mdesc[nearest])
D = insert_desc(D,DESCPROYECTED)
save_json(D,stage2_path)


In [18]:
# Cluster splitting by cosine distance (stage 3).
# Within every cluster the first reading acts as the reference: readings whose
# description is within COSINE_DIST_THRESH of it stay in the cluster, the rest are
# moved to a newly created cluster, which is examined again on the next pass.
# NOTE(review): this cell rewrites D in place (lists of readings become
# {"best", "others"} dicts), so it looks like it cannot be re-run without first
# re-running the cell that produces the stage-2 D - confirm.
ind=0
bandera = True
repeticiones=0
while bandera:
    # First pass: work on every cluster. Later passes: only on the leftovers
    # produced by the previous pass.
    E = D.copy() if repeticiones==0 else F.copy()
    F = {}
    # print(len(E))
    CV  = CountVectorizer(ngram_range=(3, 3),analyzer='char',vocabulary=vocabulario)
    for cluster,data in E.items():
        if len(data)>1:
            listdesc=[ desc for _,_,_,[desc,desc_conf],_ in data]
            # Reference vector (first description) versus all the remaining ones.
            row = CV.fit_transform(preprocessing(listdesc[:1])).toarray()
            cols = CV.fit_transform(preprocessing(listdesc[1:])).toarray()
            vector = cosine_distance(row,cols)
            # Column indices within the threshold, shifted by one so they index
            # `data`; index 0 (the reference itself) is always kept.
            valids= np.nonzero(np.array(vector) <= COSINE_DIST_THRESH)[1]+1
            valids = np.append(valids, 0)
            next_data = [d for i,d in enumerate(data) if i not in valids]
            keep_data = [d for i,d in enumerate(data) if i in valids]
            if len(keep_data)==1:
                E[cluster] = {"best":keep_data[0],"others":[]}
            else:
                # "best" is the kept reading with the highest weighted confidence
                # (second element of each reading); it is removed from "others".
                values=[v for _,v,_,_,_ in keep_data]
                best=keep_data.pop(np.array(values).argmax())
                E[cluster] = {"best":best,"others":keep_data}
            if len(next_data)>0:
                # print(cluster)
                # Readings that did not match the reference go to a new cluster id.
                c,ind=get_new_cluster(ind)
                F[c]=next_data
        else:
            # A single reading is trivially the best one.
            E[cluster] = {"best":data[0],"others":[]}
    # for c,d in F.items():
    #     print(c,[ desc for _,_,_,[desc,desc_conf],_ in d])
    # Merge the results. Entries coming from F are still raw lists here; they are
    # overwritten by their processed form on the next pass, because E is merged
    # after D.
    if repeticiones==0:
        D = E | F
    else:
        D= D|E|F
    # Stop once a pass produces no leftover readings.
    if len(F)==0:
        bandera=False
    repeticiones+=1
save_json(D,stage3_path)