In [35]:
import argparse
import codecs
import csv
import glob
import json
import math
import os
import re
import datetime

import numpy as np
import pandas as pd
import tqdm
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

import helper as pp

In [36]:
tdf = pd.DataFrame({"A": [1,2,3,4,5,'',''], "B": [1,2,np.nan,4,np.nan,np.nan,np.nan]})
tdf2 = pd.DataFrame({"A": [1,2,'',4,5,6,7], "B": [1234234,2234234324,3223423,4232323,523323,1212,np.nan]})
tdf3 = pd.DataFrame({"A": [1,2,3,4,'',6,7], "B": [1,2,3,4,5,6,7]})

In [48]:
def create_unique_id(head, tail):
    r = ''
    try:
        if head:
            r = str(head) + ''.join(tail)
    except TypeError:
        r = head
    return r

In [37]:
def save_string_join(x):
    try:
        joined = ''.join(x)
    except TypeError:
        joined = x
    return joined

In [38]:
def check_if_list_or_tuple(object):
    return isinstance(object, (list, tuple))

In [39]:
def check_if_string(object):
    return isinstance(object, str)

In [40]:
def non_or_empty(x):
    def bitwise_non_or_empty(x):
        boo = False
        if pd.isna(x):
            boo = True
        if x == None:
            boo = True
        if x == '':
            boo = True
        return boo
    boo = x.apply(
        lambda x: bitwise_non_or_empty(x))
    return boo

In [41]:
def check_threshold(x, y, threshold):
    x = x.copy()
    y = y.copy()
    x[abs(x - y) > threshold] = np.nan
    return x

In [43]:
def join_and_update(left, right, left_on, right_on,
                    left_update, right_update, joined_on=""):
    
    assert(check_if_list_or_tuple(
        left_update) == check_if_list_or_tuple(
        right_update))
    
    if right_on == 'Index':
        right['Index'] = right.index
        
    if left_on == 'Index':
        left['Index'] = left.index

    if check_if_string(left_update):
        left_update = [left_update]
        
    if check_if_string(right_update):
        right_update = [right_update]
        
    left = left.copy()
    righ = right.copy()
            
    if 'Joined_on' not in left.columns:
        left['Joined_on'] = np.nan
        
    left[left_on] = left[left_on].replace('', np.nan)
    
    left_to_match = left[non_or_empty(left[left_update[0]])]
    left_matched = left[~non_or_empty(left[left_update[0]])]
    
    mb = left_to_match.shape[0]
    
    cols = [right_on]

    for i in right_update:
        cols.append(i)

    right = right[cols].copy()
    right[right_on] = right[right_on].replace('', np.nan)

    df_join = left_to_match.pipe(save_join, right=right, left_on=left_on,
                        right_on=right_on,
                        suffixes=['', '___y'])

    right_update = [i + '___y' for i in right_update]
    
    check = (non_or_empty(
        df_join[left_update[0]])) & (~non_or_empty(
            df_join[right_update[0]]))

    df_join.loc[check, 'Joined_on'] = joined_on
    
    for i, j in zip(left_update, right_update):
        df_join.loc[check, i] = df_join[j]
    
    df_join = df_join[[i for i in df_join.columns if not i.endswith('___y')]]
    
    ma = df_join[~non_or_empty(df_join[left_update[0]])].shape[0]
    d = mb - ma
    mb += 1.0e-10
    print('Matched {} articles from {} ({} %)'.format(d, round(mb,0),
                                                      round(d/mb*100, 2)))
    
    left = pd.concat([left_matched, df_join], axis=0).reset_index(drop=True)
    return left

In [9]:
def save_join(left, right, left_on, right_on, *args, **kwargs): 

    if right_on == 'Index':
        right['Index'] = right.index
        
    if left_on == 'Index':
        left['Index'] = left.index

    noe = non_or_empty(right[right_on])
    
    df_join = left.merge(right[~noe], how="left",
                         left_on=left_on, right_on=right_on,
                         *args, **kwargs)
    return df_join

In [49]:
def prep_dataframe(df):
    
    df = df.drop_duplicates()
    
    for i in ['FarbId', 'AusführungsId']:
        df.loc[non_or_empty(df[i]), i] = '' 
    
    df['Farbe'] = df['AF_Txt'].fillna('')
    df['Ausführung'] = df['AFZ_Txt'].fillna('')
    df['FarbId'] = df['FarbId'].replace('', '000')
    df['Preis'] = df['Preis_Pos'].astype(float)
    df['Art_Nr_Hersteller'].astype(str, inplace=True)
    df['Art_Nr_Hersteller'].fillna('', inplace=True)
        
    df['UID'] = df[['ArtikelId',
                   'FarbId',
                   'AusführungsId']].apply(
        lambda x: create_unique_id(x), axis=1)
    
    df['Art_Nr_Hersteller_UID'] = df[['Art_Nr_Hersteller',
                                      'FarbId',
                                      'AusführungsId']].apply(
        lambda x: create_unique_id(x), axis=1)
    
    if 'Konkurrenzummer' in df.columns:
        df['Konkurrenznummer'].astype(str, inplace=True)
        df['Konkurrenznummer'].fillna('', inplace=True)
        df['Konkurrenznummer'] = df[['Konkurrenznummer',
                                      'FarbId',
                                      'AusführungsId']].apply(
        lambda x: create_unique_id(x), axis=1)
    
    df['Art_Nr_Hersteller'].replace('', np.nan, inplace=True)
    df['Art_Nr_Hersteller_UID'].replace('', np.nan, inplace=True)
    
    df.loc[df['Art_Nr_Hersteller'].str.len() < 5, 'Art_Nr_Hersteller'] = np.nan
    df.loc[df['Art_Nr_Hersteller_UID'].str.len() < 5, 'Art_Nr_Hersteller_UID'] = np.nan
    
    df['EAN'] = df['Preis_EAN'].fillna(df['Art_Nr_EAN'])
    return df

In [50]:
def join_dotdat(df, dotdat, right_on, left_on):
    df = df.pipe(save_join, right = dotdat,
                 right_on=right_on, left_on=left_on,
                 suffixes=['', '___y'])
    
    def clean_number(x):
        try:
            c = re.sub("\s|\D", "", x)
        except TypeError:
            c = ''
        return c[:6]
    
    dotdat['Konkurrenznummer'] = dotdat['Konkurrenznummer'].apply(
        lambda x: clean_number(x))
    
    try:
        df['Konkurrenznummer'] =  df['Konkurrenznummer___y']
    except KeyError:
        pass
    df = df[[i for i in df.columns if not i.endswith('___y')]]
    return df

In [51]:
def get_closest(left, right, chunksize=5000,
                threshold=0.5, n_jobs=1, method='cosine',
                columns = ['Art_Txt_Lang', 'Art_Txt_Kurz',
                          'Farbe', 'Ausführung']):
   
    n_jobs = max(1, n_jobs)
    vec = CountVectorizer()
    
    ix = left.index
    
    X = vec.fit_transform(left[columns].fillna('').astype(
        str).apply(lambda x: ' '.join(x), axis=1))
    Y = vec.transform(right[columns].fillna('').astype(
        str).apply(lambda x: ' '.join(x), axis=1))
    
    arr = np.empty((X.shape[0], 2))
    print('Remaining Columns to match = {} ({} Batches)\n'.format(
        X.shape[0], math.ceil(X.shape[0] / chunksize)))
    
    for i, a in tqdm.tqdm(zip(pp.batch(X, chunksize),
                              pp.batch(arr, chunksize))):
        distance = pairwise_distances(i, Y, metric='cosine', n_jobs=n_jobs)
        distance_min = distance.min(axis=1)
        distance_argmin = distance.argmin(axis=1)
        a[:, 0] = distance_min
        a[:, 1] = distance_argmin
        
    distance_df = pd.DataFrame(
        arr,
        columns=['Distance', 'Closest'],
        index=ix)
    
    distance_df['Closest'].astype(np.int, inplace=True)
    distance_df = distance_df[distance_df['Distance'] < threshold]
    return distance_df

In [52]:
def join_prices(left, right, UID=False):
    for i in [('UID', 'UID'),
              ('EAN', 'EAN'),
              ('Art_Nr_Hersteller_UID', 'Art_Nr_Hersteller_UID'),
              ('Art_Nr_Hersteller', 'Art_Nr_Hersteller'),
              ('Konkurrenznummer', 'UID')]:
        if i[0] == 'UID' and not UID:
            print("Not joining on UID")
            continue
        else:
            print('Joining on {} and {}'.format(i[0], i[1]))
            left = left.pipe(join_and_update, right,
                             left_on=i[0], right_on=i[1],
                             left_update=['Preis_Konkurrenz','Txt_Kurz_Konkurrenz','Txt_Lang_Konkurrenz'],
                             right_update=['Preis','Art_Txt_Kurz', 'Art_Txt_Lang'],
                             joined_on=i[0])
    return left

In [53]:
def join_on_distance(left, right, distance):
    left = left.join(distance, rsuffix='___y')
    left = left.pipe(join_and_update, right,
                     left_on="Closest",
                     right_on='Index',
                     left_update=['Preis_Konkurrenz','Txt_Kurz_Konkurrenz','Txt_Lang_Konkurrenz'],
                     right_update=['Preis','Art_Txt_Kurz', 'Art_Txt_Lang'],
                     joined_on='Text_Similarity')
    return left

In [15]:
richner = pp.csv_to_pandas(os.path.join("Output", "Richner-6150.csv"))

In [16]:
richner['Preis_Konkurrenz'] = np.nan
richner['Konkurrenz'] = 'Sanitas'
dotdat = pp.csv_to_pandas(os.path.join("Files", "Dotdat", "2018-01-19_SGVSB_Dotdat_File.csv"))
richner = richner.pipe(join_dotdat, dotdat, left_on="ArtikelId", right_on="Artikelnummer")

richner = richner.pipe(prep_dataframe)

In [17]:
def get_files_dict(companies):
    files = os.listdir(os.path.join('Output'))
    if check_if_string(companies):
        companies = [companies]
    compiler = re.compile(
        r'.*(' + '|'.join(companies).lower() + ')(?!Badmoebel).+')
    files_to_match = [f for f in files if compiler.match(f.lower())]

    files_to_match = {os.path.split(
        i)[-1].split('-')[0]: j for i, j in zip(
        files_to_match, files_to_match)}
    return files_to_match

In [18]:
files_to_match = get_files_dict(['Sabag', 'TeamSaniDusch'])

In [19]:
files_to_match

{'Sabag': 'Sabag-6160.csv', 'TeamSaniDusch': 'TeamSaniDusch-6180.csv'}

In [20]:
def get_price_distance(df):
    if "Preisdifferenz" not in df.columns:
        df['Preisdifferenz'] = np.nan
    df['Preisdifferenz'] = df['Preis'] - df['Preis_Konkurrenz']
    return df

In [21]:
def get_rank(df):
    assert("Preisdifferenz" in df.columns)
    rnk = df.sort_values(
        ['UID', 'Preisdifferenz']).groupby(
        'UID')['Preisdifferenz'].rank(
        method='first')
    
    df['Rank'] = rnk
    return df

In [22]:
def delete_with_threshold(df, to_delete, replace=np.nan, threshold=0.5):
    assert("Preisdifferenz" in df.columns)
    assert("Preis" in df.columns)
    check = abs(df['Preisdifferenz'] / df['Preis']) > threshold
    if check_if_list_or_tuple(to_delete):
        for i in to_delte:
            df.loc[check, i] = replace
    else:
        df.loc[check, to_delete] = replace
    return df

In [23]:
def delete_by_rank(df):
    df = df[df['Rank'] == 1]
    return df

In [24]:
def loop_companies(df, dictionary, concat_df=pd.DataFrame(), UID=False):
    for i in dictionary:
        df_ = df.copy()
        print('Preparing {}'.format(i))
        compare_df = pp.csv_to_pandas(os.path.join("Output", dictionary[i]))
        df_['Preis_Konkurrenz'] = np.nan
        df_['Txt_Kurz_Konkurrenz'] = np.nan
        df_['Txt_Lang_Konkurrenz'] = np.nan
        df_['Konkurrenz'] = i
        compare_df = compare_df.pipe(prep_dataframe)    
        df_ = df_.pipe(join_prices, right=compare_df, UID=UID)
        distance = df_[pd.isna(df_['Preis_Konkurrenz'])].pipe(get_closest,
                                                            compare_df, threshold=0.3)      
        df_ = (df_.pipe(join_on_distance, compare_df, distance)
                  .pipe(get_price_distance)
                  .pipe(delete_with_threshold, 'Preis_Konkurrenz')
                  .pipe(get_rank)
                  .pipe(delete_by_rank))        
        concat_df = pd.concat([concat_df, df_], axis=0)
    return concat_df

In [44]:
def to_pivot(df):
    pvt = pd.DataFrame()
    cols = ['Preis_Konkurrenz',
            'Txt_Kurz_Konkurrenz',
            'Txt_Lang_Konkurrenz']
    for i in cols:
        pvt_ = final.pivot(index='UID', columns='Konkurrenz', values=i)
        pvt = pd.concat([pvt, pvt_], axis=1)
    pvt = final.merge(
        pvt, left_on='UID',
        right_index=True)
    pvt.drop(cols, axis=1, inplace=True)
    pvt.drop('Konkurrenz', axis=1, inplace=True)
    pvt.drop_duplicates(inplace=True)
    return pvt

In [25]:
companies = get_files_dict(['Sabag','Saneo',
                            'SaniDusch','TeamHug'])
final = pd.concat([richner.pipe(loop_companies,
                        companies,
                        UID=True),
                   richner.pipe(loop_companies,
                        get_files_dict('Sanitas'),
                        UID=False)], axis=0)

Preparing Sabag
Joining on UID and UID
Matched 8984 articles from 159236.0000000001 (5.64 %)
Joining on EAN and EAN
Matched 8973 articles from 8984.0000000001 (99.88 %)
Joining on Art_Nr_Hersteller_UID and Art_Nr_Hersteller_UID
Matched 8939 articles from 8973.0000000001 (99.62 %)
Joining on Art_Nr_Hersteller and Art_Nr_Hersteller
Matched 8936 articles from 8939.0000000001 (99.97 %)
Joining on Konkurrenznummer and UID
Matched 8936 articles from 8936.0000000001 (100.0 %)
Remaining Columns to match = 8936 (2 Batches)



2it [00:57, 30.71s/it]


Matched 4927 articles from 8936.0000000001 (55.14 %)
Preparing Saneo
Joining on UID and UID
Matched 11504 articles from 159236.0000000001 (7.22 %)
Joining on EAN and EAN
Matched 11493 articles from 11504.0000000001 (99.9 %)
Joining on Art_Nr_Hersteller_UID and Art_Nr_Hersteller_UID
Matched 11465 articles from 11493.0000000001 (99.76 %)
Joining on Art_Nr_Hersteller and Art_Nr_Hersteller
Matched 11462 articles from 11465.0000000001 (99.97 %)
Joining on Konkurrenznummer and UID
Matched 11462 articles from 11462.0000000001 (100.0 %)
Remaining Columns to match = 11462 (3 Batches)



3it [01:29, 30.60s/it]


Matched 6921 articles from 11462.0000000001 (60.38 %)
Preparing TeamHug
Joining on UID and UID
Matched 10691 articles from 159236.0000000001 (6.71 %)
Joining on EAN and EAN
Matched 10680 articles from 10691.0000000001 (99.9 %)
Joining on Art_Nr_Hersteller_UID and Art_Nr_Hersteller_UID
Matched 10641 articles from 10680.0000000001 (99.63 %)
Joining on Art_Nr_Hersteller and Art_Nr_Hersteller
Matched 10646 articles from 10649.0000000001 (99.97 %)
Joining on Konkurrenznummer and UID
Matched 10646 articles from 10646.0000000001 (100.0 %)
Remaining Columns to match = 10646 (3 Batches)



3it [01:12, 25.00s/it]


Matched 5433 articles from 10646.0000000001 (51.03 %)
Preparing TeamSaniDusch
Joining on UID and UID
Matched 17812 articles from 159236.0000000001 (11.19 %)
Joining on EAN and EAN
Matched 17801 articles from 17812.0000000001 (99.94 %)
Joining on Art_Nr_Hersteller_UID and Art_Nr_Hersteller_UID
Matched 17757 articles from 17801.0000000001 (99.75 %)
Joining on Art_Nr_Hersteller and Art_Nr_Hersteller
Matched 17762 articles from 17765.0000000001 (99.98 %)
Joining on Konkurrenznummer and UID
Matched 17762 articles from 17762.0000000001 (100.0 %)
Remaining Columns to match = 17762 (4 Batches)



4it [02:00, 28.72s/it]


Matched 10465 articles from 17762.0000000001 (58.92 %)
Preparing Sanitas
Not joining on UID
Joining on EAN and EAN
Matched 149595 articles from 159236.0000000001 (93.95 %)
Joining on Art_Nr_Hersteller_UID and Art_Nr_Hersteller_UID
Matched 148963 articles from 150560.0000000001 (98.94 %)
Joining on Art_Nr_Hersteller and Art_Nr_Hersteller
Matched 148862 articles from 149070.0000000001 (99.86 %)
Joining on Konkurrenznummer and UID
Matched 148868 articles from 148869.0000000001 (100.0 %)
Remaining Columns to match = 148868 (30 Batches)



30it [04:54,  8.44s/it]


Matched 98648 articles from 148868.0000000001 (66.27 %)


In [26]:
get_files_dict('Sanitas')

{'Sanitas': 'Sanitas-6130.csv'}

In [27]:
print(
    """{}{}{}Article Matching for Price Comparison
    \n\u00a9 Dominik Peter{}{}{}""".format(
        "\n"*2, "#"*80, "\n"*2, "\n"*2, "#"*80, "\n"*2))



################################################################################

Article Matching for Price Comparison
    
© Dominik Peter

################################################################################




In [45]:
final = final[['ArtikelId', 'FarbId', 'AusführungsId', 'UID',
               'Art_Txt_Kurz', 'Art_Txt_Lang', 'Ausführung', 'Farbe', 'EAN',
               'Konkurrenz', 'Konkurrenznummer', 'Warengruppe', 'Preis', 'Preis_Konkurrenz',
               'Txt_Kurz_Konkurrenz', 'Txt_Lang_Konkurrenz', 'Joined_on',
               'Preisdifferenz', 'Art_Nr_Hersteller_Firma',
               'Category_Level_1', 'Category_Level_2', 'Category_Level_3',
               'Category_Level_4', 'Closest', 'Distance']].fillna('')

In [46]:
dt = datetime.datetime.now()

In [30]:
p = os.path.join("Matched", dt.strftime("%Y-%m-%d")+"_Output.csv")

In [31]:
final.to_csv(p, sep="\t", index=False)

In [33]:
pvt = final.pipe(to_pivot)

In [34]:
pvt.head()

Unnamed: 0,ArtikelId,FarbId,AusführungsId,UID,Art_Txt_Kurz,Art_Txt_Lang,Ausführung,Farbe,EAN,Konkurrenz,...,Txt_Lang_Konkurrenz,Joined_on,Preisdifferenz,Art_Nr_Hersteller_Firma,Category_Level_1,Category_Level_2,Category_Level_3,Category_Level_4,Closest,Distance
0,14671,0,,14671000,Tauchrohr TECEdrainline zu Ablauf flach 146 771,Tauchrohr TECEdrainline zu Ablauf flach 146 771,,,,Sabag,...,Tauchrohr TECEdrainline zu Ablauf flach 146 771,UID,-2.5,,Ersatzteile,zu Duschenelemente,TECEdrainline,TECEdrainline,,
1,14672,0,,14672000,Tauchrohr TECEdrainline zu Ablauf Norm 146 772,Tauchrohr TECEdrainline zu Ablauf Norm 146 772,,,,Sabag,...,Tauchrohr TECEdrainline zu Ablauf Norm 146 772,UID,-2.5,,Ersatzteile,zu Duschenelemente,TECEdrainline,TECEdrainline,,
2,14673,0,,14673000,Tauchrohr TECEdrainline zu Ablauf max und senk...,Tauchrohr TECEdrainline zu Ablauf max und senk...,,,,Sabag,...,Tauchrohr TECEdrainline zu Ablauf max und senk...,UID,-2.5,,Ersatzteile,zu Duschenelemente,TECEdrainline,TECEdrainline,,
3,14674,0,,14674000,Tauchrohr TECEdrainline zu Ablauf superflach 1...,Tauchrohr TECEdrainline zu Ablauf superflach 1...,,,,Sabag,...,Tauchrohr TECEdrainline zu Ablauf superflach 1...,UID,-2.5,,Ersatzteile,zu Duschenelemente,TECEdrainline,TECEdrainline,,
4,14675,0,,14675000,Membran-Geruchsverschluss für Ablauf TECEdrain...,Membran-Geruchsverschluss für Ablauf TECEdrain...,,,,Sabag,...,Membran-Geruchsverschluss für Ablauf TECEdrain...,UID,-3.5,,Ersatzteile,zu Duschenelemente,TECEdrainline,TECEdrainline,,
