In [1]:
import pandas as pd
import fuzzymatcher
import os
import re
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Directory that getSource() reads `<name>.csv` source files from.
path = "../../res"
# Directory that getMapping() reads `<name>.txt` column-mapping files from.
mapping_path = "../../res/matching"

In [79]:
# Load the current warehouse tables from their semicolon-separated CSV files.
db_table_path = "../../Database/table"
df_phones = pd.read_csv(os.path.join(db_table_path, "phones.csv"), sep=";")
# NOTE(review): "souces.csv" looks like a misspelling of "sources.csv"; it is kept
# unchanged because it has to match the file name on disk -- confirm.
df_sources = pd.read_csv(os.path.join(db_table_path, "souces.csv"), sep=";")
# Column dtypes stay as inferred by read_csv. DataFrame.astype returns a new frame,
# so a bare `df.astype(str)` statement has no effect and is intentionally not used here.

# The warehouse is a plain dict: table name -> DataFrame.
warehouse = {
    'phones': df_phones,
    'sources': df_sources,
}

# Schema fields that nomalize_data() reduces to their digit characters.
# NOTE(review): the "int" entry does not look like a schema field name -- confirm.
int_field = ["ram", "rom", "battery", "monitor_frequency", "phone_id", "int", 'price']

In [43]:
def getSource(souce_name, sep=";"):
    """Read one source table from ``<path>/<souce_name>.csv``.

    Parameters
    ----------
    souce_name : str
        File name without the ``.csv`` extension, resolved against the
        module-level ``path`` directory. The (misspelled) parameter name is
        kept so existing keyword callers keep working.
    sep : str, default ";"
        Field delimiter handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file plus an ``id`` column holding each row's index label.
        Column dtypes are whatever ``read_csv`` inferred.
    """
    df = pd.read_csv(os.path.join(path, souce_name + ".csv"), sep=sep)
    # The index label doubles as the per-source record id ("id" is the id
    # column that data_matching() passes to the fuzzy join).
    df['id'] = df.index
    # No `df.astype(str)` here: astype returns a new frame, so calling it
    # without using the result never changed anything.
    return df
def getMapping(source_name, sep=";"):
    """Load the schema-to-source column mapping for one source.

    Reads ``<mapping_path>/<source_name>.txt`` as a delimited table and pairs
    each value of its ``schema`` column with the value on the same row of the
    column named after the source.

    Parameters
    ----------
    source_name : str
        Base name of the mapping file and name of the column holding the
        source-side column names.
    sep : str, default ";"
        Field delimiter handed to ``pandas.read_csv``.

    Returns
    -------
    dict
        ``{schema_field: source_column}``; a repeated schema field keeps the
        value from its last row.
    """
    mapping_file = os.path.join(mapping_path, source_name + ".txt")
    table = pd.read_csv(mapping_file, sep=sep)
    return dict(zip(table['schema'], table[source_name]))

In [66]:
# Each source gets its raw table (with an `id` column) and its schema mapping.
# NOTE(review): cellphones gets no 'source' label here, unlike hhm and nk --
# confirm whether its CSV already carries that column.
cellphones = getSource("cellphones")
mapping_cellphones = getMapping("cellphones")

# Tag every hhm row with the shop name.
hhm = getSource("hhm").assign(source='Hoàng Hà mobile')
mapping_hhm = getMapping("hhm")

# Tag every nk row with the shop name.
nk = getSource("nk").assign(source='Nguyễn Kim mobile')
mapping_nk = getMapping("nk")


In [6]:
def load_data(source, target, mapping):
    """Copy mapped source columns straight into copies of the target tables.

    Parameters
    ----------
    source : pandas.DataFrame
        Source rows; must contain an ``id`` column and every source column
        referenced by a mapping entry that is actually used.
    target : dict
        Table name -> DataFrame; must contain ``'phones'`` and ``'sources'``.
        Neither the dict nor its frames are modified.
    mapping : dict
        Schema field -> source column name.

    Returns
    -------
    dict
        New dict with a copy of every table. Each mapping key is written to
        the last table (in dict order) that already has a column of that name;
        keys that no table has are skipped. ``sources.phone_id`` and
        ``phones.id`` are both set to ``source['id']``.

    Notes
    -----
    Columns are assigned as Series, so pandas aligns them on the index: with
    empty target tables the result takes on the source's index, but rows that
    already exist in a table would be overwritten at matching index labels
    rather than appended to.
    """
    # Copy every table once so the caller's frames are never touched.
    tables = {name: frame.copy() for name, frame in target.items()}

    for key, source_col in mapping.items():
        owner = None
        for frame in tables.values():
            if key in frame.columns:
                owner = frame  # last match wins
        if owner is None:
            # No table has this schema field: skip it instead of failing with
            # an unbound variable or writing into a previously matched table.
            continue
        owner[key] = source[source_col]

    tables['sources']['phone_id'] = source['id']
    tables['phones']['id'] = source['id']
    return tables


In [45]:
def load_match_data(source, target, id_mapping, mapping):
    """Merge one source into the warehouse using precomputed id matches.

    Parameters
    ----------
    source : pandas.DataFrame
        Source rows; must have an ``id`` column.
    target : dict
        Must hold ``'phones'`` and ``'sources'`` DataFrames. They are copied,
        never modified.
    id_mapping : dict
        Source ``id`` -> warehouse ``phones.id`` for rows that were matched.
        May be empty, in which case every source row is treated as new.
    mapping : dict
        Schema field -> source column name.

    Returns
    -------
    dict
        ``{'phones': ..., 'sources': ...}`` where

        * matched rows fill blank cells (NaN or ``""``) of the matched phone
          for every mapped field and add one ``sources`` row pointing at it;
        * unmatched rows add one ``phones`` row and one ``sources`` row. New
          phone ids continue from the current number of phone rows.
          NOTE(review): this assumes existing ids are 0..n-1 -- confirm.

        Fields a table has but the mapping lacks are written as ``""``.
    """
    phones = target['phones'].copy()
    sources = target['sources'].copy()

    is_matched = source['id'].isin(list(id_mapping))
    df_matched = source[is_matched]
    df_not_matched = source[~is_matched]

    # Rows are collected and appended once at the end; concatenating inside
    # the loops re-copies the whole table for every row.
    new_phone_rows = []
    new_source_rows = []

    # Source rows whose phone already exists in the warehouse.
    for _, src_row in df_matched.iterrows():
        row_data = {field: src_row[field] for field in source.columns}
        phone_id = id_mapping[row_data['id']]
        hit_labels = phones.index[(phones['id'] == phone_id).to_numpy()]
        for label in hit_labels:
            for field in phones.columns:
                # Only fill gaps; values already in the warehouse win.
                if field in mapping and _is_blank(phones.at[label, field]):
                    phones.at[label, field] = row_data[mapping[field]]
        new_source_rows.append(
            _build_record(sources.columns, row_data, mapping, 'phone_id', phone_id)
        )

    # Source rows with no counterpart in the warehouse yet.
    next_id = phones.shape[0]
    for _, src_row in df_not_matched.iterrows():
        row_data = {field: src_row[field] for field in source.columns}
        # The source-local id is replaced by the new warehouse id, so a mapping
        # that points at the source 'id' column also receives the new id.
        row_data['id'] = next_id
        next_id += 1
        new_phone_rows.append(
            _build_record(phones.columns, row_data, mapping, 'id', row_data['id'])
        )
        new_source_rows.append(
            _build_record(sources.columns, row_data, mapping, 'phone_id', row_data['id'])
        )

    if new_phone_rows:
        phones = pd.concat([phones, pd.DataFrame(new_phone_rows)], axis=0, ignore_index=True)
    if new_source_rows:
        sources = pd.concat([sources, pd.DataFrame(new_source_rows)], axis=0, ignore_index=True)

    return {'phones': phones, 'sources': sources}


def _build_record(columns, row_data, mapping, id_field, id_value):
    """Project one source row onto a table: mapped value, the id, or ""."""
    record = {}
    for field in columns:
        if field in mapping:
            record[field] = row_data[mapping[field]]
        elif field == id_field:
            record[field] = id_value
        else:
            record[field] = ""
    return record


def _is_blank(value):
    """True for a cell with no data yet: NaN, or the "" placeholder written for unmapped fields."""
    return str(value) in ('nan', '')


In [8]:
def nomalize_data(df, mapping, int_fields=None):
    """Reduce text-typed numeric fields to their digit characters.

    For every schema field of ``mapping`` that is listed as numeric, the
    mapped source column is rewritten, when it holds text, so that each cell
    becomes the concatenation of the digit characters of ``str(cell)``
    (``"4 GB"`` -> ``"4"``; a missing value becomes ``""`` because ``"nan"``
    has no digits). Columns with a numeric dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; not modified.
    mapping : dict
        Schema field -> source column name.
    int_fields : collection of str, optional
        Schema fields to treat as numeric. Defaults to the module-level
        ``int_field`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the affected columns replaced.
    """
    numeric_keys = int_field if int_fields is None else int_fields
    df = df.copy()
    for key, column in mapping.items():
        if key not in numeric_keys:
            continue
        values = df[column]
        is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
        if not is_text:
            continue
        # One whole-column assignment: works for any index and does not rely on
        # chained assignment, which is ineffective under pandas Copy-on-Write.
        df[column] = values.map(lambda cell: ''.join(filter(str.isdigit, str(cell))))
    return df

In [88]:
def data_matching(left_df, left_col, right_df, right_col):
    """Fuzzy left-join ``left_df`` onto ``right_df``.

    Delegates to ``fuzzymatcher.fuzzy_left_join``, comparing ``left_col``
    against ``right_col`` and using the ``id`` column of each frame as its
    record id. Returns whatever that call returns.
    """
    return fuzzymatcher.fuzzy_left_join(
        left_df,
        right_df,
        left_col,
        right_col,
        left_id_col="id",
        right_id_col="id",
    )

In [101]:
def get_data_match(df_source, target, mapping, min_accept=0.5):
    """Return fuzzy matches between a source and the warehouse phones.

    Rows are matched on the schema fields ``name``, ``ram`` and ``rom`` (the
    source side uses the columns ``mapping`` assigns to them). The result of
    ``data_matching`` is sorted by ``best_match_score`` in descending order
    and only rows scoring strictly above ``min_accept`` are kept. An empty
    DataFrame is returned when the warehouse has no phones.
    """
    warehouse_phones = target['phones']
    if len(warehouse_phones) == 0:
        # Nothing to match against.
        return pd.DataFrame()

    target_cols = ["name", "ram", "rom"]
    source_cols = [mapping[col] for col in target_cols]
    scored = data_matching(df_source, source_cols, warehouse_phones, target_cols)
    ranked = scored.sort_values(by=['best_match_score'], ascending=False)
    return ranked[ranked['best_match_score'] > min_accept]

    

In [11]:
def show_data_match(data_match, mapping):
    """Trim a match table to the score, id and compared columns.

    Always asks for ``best_match_score``, ``__id_left`` and ``__id_right``.
    For each of ``name``, ``ram`` and ``rom``: when the source uses the same
    column name as the schema, the ``<col>_left`` / ``<col>_right`` pair is
    requested; otherwise the schema column and the mapped source column are.
    Requested columns that ``data_match`` does not have are ignored.
    """
    show_col = ['best_match_score', '__id_left', '__id_right']
    for col in ("name", "ram", "rom"):
        source_col = mapping[col]
        if source_col == col:
            show_col.extend([col + '_left', col + '_right'])
        else:
            show_col.extend([col, source_col])

    wanted = data_match.columns.intersection(show_col)
    return data_match[wanted]

In [55]:
def data_maping(df_source, target, mapping, min_accept=0.5):
    """Normalise one source and merge it into the warehouse.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Source rows as returned by ``getSource``.
    target : dict
        Warehouse tables (``'phones'`` and ``'sources'``); not modified.
    mapping : dict
        Schema field -> source column name.
    min_accept : float, default 0.5
        Matches must score strictly above this value to be accepted.

    Returns
    -------
    dict
        The updated warehouse tables.

    Notes
    -----
    * Empty warehouse: the source is bulk-loaded with ``load_data``.
    * Otherwise ``load_match_data`` merges it. If no match clears
      ``min_accept`` the id mapping is empty, so every source row is appended
      as a new phone. ``load_data`` is not used here because its index-aligned
      column assignment would overwrite existing warehouse rows.
    """
    df_source = nomalize_data(df_source, mapping)
    data = get_data_match(df_source, target, mapping, min_accept)

    if data.empty:
        if target['phones'].shape[0] == 0:
            # First load: nothing to merge with.
            return load_data(df_source, target, mapping)
        id_mapping = {}
    else:
        # Source id -> matched warehouse phone id.
        id_mapping = dict(zip(data['__id_left'].to_list(), data['__id_right'].to_list()))

    return load_match_data(df_source, target, id_mapping, mapping)

# MAPPING DATA

In [80]:
# Merge the cellphones source into the warehouse.
# NOTE: `warehouse` is rebound to the merged result, so this cell is not idempotent --
# running it again merges the same source a second time.
warehouse = data_maping(cellphones, warehouse, mapping_cellphones)


In [81]:
# Merge the hhm source into the warehouse (same rebinding caveat: re-running repeats the merge).
# NOTE(review): nothing in this call prints, so the match table saved as this cell's
# output is presumably left over from an earlier version of the code -- confirm.
warehouse = data_maping(hhm, warehouse, mapping_hhm)

      best_match_score  __id_left  __id_right  \
126           1.043750         20          68   
127           0.889878         21          69   
118           0.889184         16          65   
116           0.849841         14          63   
3484          0.808972        111         300   
115           0.808545         13          62   
107           0.806733          8          50   
106           0.787827          7          49   
120           0.735312         17          66   
130           0.725221         23          71   
128           0.724373         22          55   
131           0.686750         24          78   
111           0.685649         11          61   
3145          0.652095         98         293   
108           0.648881          9          51   
99            0.647605          3          46   
109           0.644353         10          60   
117           0.643888         15          64   
104           0.642563          6          58   
102           0.6425

In [109]:
# Merge the nk source into the warehouse with min_accept lowered from the default 0.5 to 0.1.
# NOTE(review): the reason for the lower threshold is not documented -- confirm it is intended.
warehouse = data_maping(nk, warehouse, mapping_nk, 0.1)

In [110]:
# Display the merged phones table (a bare last expression is rendered as the cell output).
warehouse['phones']

Unnamed: 0,id,name,display_size,display_tech,camera,camera_selfie,ram,rom,battery,sim,operating_system,resolution,display_feature,cpu_type,weight,monitor_frequency,cpu,bluetooth,image
0,0,Apple iPhone 8 64GB Chính hãng,4.7 inches,...,12 MP (f/1.8 28mm) tự động lấy nét nhận diệ...,7 MP f/2.2 1080p@30fps 720p@240fps nhận d...,,64,1821,Nano-SIM ...,11 ...,750 x 1334 pixels ...,...,2x 2.39 GHz Monsoon & 4x 2.39 GHz Mistral ...,148 g (5.22 oz) ...,,Apple A11 Bionic APL1W72,5.0 A2DP LE,
1,1,iPhone XR 128GB I Chính hãng VN/A,6.1 inches,IPS LCD ...,12MP ...,7MP ...,3,128,2942,Nano-SIM ...,iOS 12 ...,1792 x 828 pixel ...,True-tone ...,Hexa-core ...,194 g ...,60,Apple A12 Bionic,v5.0 A2DP LE,
2,2,iPhone XR 64GB I Chính hãng VN/A,6.1 inches,IPS LCD ...,12MP ...,7MP ...,3,64,2942,Nano-SIM ...,iOS 14 ...,1792 x 828 pixel ...,True-tone tần số quét 60Hz mật độ điểm ảnh ...,Hexa-core ...,194 g ...,60,Apple A12 Bionic,v5.0 A2DP LE,
3,3,ASUS 8z,5.9 inches,AMOLED ...,Camera chính: 64 MP f/1.73 16 MP ...,12 MP ...,8,128,4000,2 SIM (Nano-SIM) ...,Android 11 ...,1080 x 2400 pixels (FullHD+) ...,Độ sáng tối đa 1100 nits 112% DCI-P3 107% N...,...,169 g ...,120,Qualcomm® Snapdragon™ 888 5G,5.2 (EDR + A2DP) hỗ trợ LDAC Qualco,
4,4,ASUS ROG Phone 2 512GB,6.59 inches,...,48 MP + 13 MP ...,24 MP ...,12,512,600040,Nano-SIM ...,Android 9.0 (Pie) ...,1080 x 2340 pixels (FullHD+) ...,...,64-bit Octa-core ...,240 g ...,120,Snapdragon 855 Plus Mobile Platform,V 5.0 (BR/EDR+LE) hỗ trợ Qualcomm® a,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,657,Điện thoại iPhone 12 Pro 256GB Xám,"6.1 """,,12 MP 12 MP 12 MP,12 MP,6,256,,,iOS 14.1,,,,,,Apple A14 Bionic,,https://www.nguyenkim.com/dien-thoai-iphone-12...
658,658,Điện thoại iPhone 12 128GB Đỏ,"6.1 inch """,Super Retina XDR,12 MP (kép),12 MP,4,128,,,iOS 14.1,2532 x 1170 pixels,,,,,Apple A14 Bionic,Bluetooth 5.0,https://www.nguyenkim.com/dien-thoai-iphone-12...
659,659,Điện thoại iPhone 12 Pro 256GB Xanh,"6.1 """,,12 MP 12 MP 12 MP,12 MP,6,256,,,iOS 14.1,,,,,,Apple A14 Bionic,,https://www.nguyenkim.com/dien-thoai-iphone-12...
660,660,Điện thoại iPhone 12 64GB Xanh lá,"6.1 inch """,Super Retina XDR,12 MP (kép),12MP,4,64,,,iOS 14.1,2532 x 1170 pixels,,,,,Apple A14 Bionic,Bluetooth 5.0,https://www.nguyenkim.com/dien-thoai-iphone-12...
