In [None]:
import json
import csv
import os
import unicodedata as ud
from pprint import pprint

import requests
from bs4 import BeautifulSoup
import io


In [None]:
def norm(input: str) -> str:
    """Return ``input`` converted to Unicode NFC (canonical composition).

    Text that looks identical but is stored decomposed (for example Hangul
    jamo sequences) is composed here so it compares equal to its
    precomposed form.
    """
    composed = ud.normalize('NFC', input)
    return composed


In [None]:
# Build `mapping` from matched_names.csv: the cleaned value of column 0 maps to
# the cleaned value of column 2. "Cleaned" means NFC-normalised with every
# underscore and space removed, then stripped of surrounding whitespace.
mapping = {}
# newline='' is what the csv module requires for correct quoted-newline
# handling; the explicit encoding keeps the read independent of the platform
# default and matches the utf-8 output written later in this notebook.
with open('matched_names.csv', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if len(row) < 3:
            # Blank or truncated line: columns 0 and 2 are not both present.
            continue
        row = [norm(x) for x in row]
        # Rows where either side is empty carry no usable match.
        if row[0] and row[2]:
            source_key = row[0].replace('_', '').replace(' ', '').strip()
            target_name = row[2].replace('_', '').replace(' ', '').strip()
            mapping[source_key] = target_name



In [None]:
# Build `pricing_data` from pricing_data.csv: for every row whose cleaned
# column-0 name is a key of `mapping`, store the integer from column 1 under
# the mapped name. Thousands separators (commas) are removed before parsing.
pricing_data = {}
# newline='' is what the csv module requires; the explicit encoding keeps the
# read independent of the platform default.
with open('pricing_data.csv', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if len(row) < 2:
            # Blank or truncated line: there is no price column to read.
            continue
        encar_name = norm(row[0].replace('_', '').replace(' ', ''))
        if encar_name in mapping:
            # A non-numeric price still raises ValueError on purpose, so bad
            # data in a matched row is noticed rather than dropped.
            pricing_data[mapping[encar_name]] = int(row[1].replace(',', ''))
        
                  
                  

In [None]:
def get_col(col) -> int:
    """Return ``col`` as an int, or 1 when ``col`` is falsy (empty, None, 0)."""
    return int(col) if col else 1

def to_int(input: str) -> int:
    """Parse an integer from text such as ``'1,234mm'`` or ``'1,500KG'``.

    The text is lower-cased, then commas and the substrings ``mm`` and ``kg``
    are removed before the remainder is handed to ``int``. Raises
    ``ValueError`` if what is left is not a valid integer literal.
    """
    cleaned = input.lower()
    for token in (',', 'mm', 'kg'):
        cleaned = cleaned.replace(token, '')
    return int(cleaned)


In [None]:
# Parse specs.csv into:
#   spec_data          jsm name -> dict of URLs, column number and display names
#   jsm_to_encar       jsm name -> encar name
#   encar_to_jsm       encar name -> list of unique jsm names
#   encarbroad_to_jsm  encarbroad name -> list of unique jsm names
# Columns read (0-based): 1 = encar name, 2 = jsm name, 3 and 4 = URLs,
# 5 = column number (passed through get_col). Rows with fewer than 6 columns
# or an empty column 2, 3 or 4 are ignored.
spec_data = {}
jsm_to_encar = {}
encar_to_jsm = {}
encarbroad_to_jsm = {}
# newline='' is what the csv module requires; the explicit encoding keeps the
# read independent of the platform default.
with open('specs.csv', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header record; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    for row in csv_reader:
        if not (len(row) >= 6 and row[2] and row[3] and row[4]):
            continue

        encar_name = norm(row[1].replace('_', '').replace(' ', ''))
        normalized = norm(row[2])

        # Reset for every row. Without this a row that contains neither
        # marker would silently reuse the previous row's date part (or raise
        # NameError if it is the first row).
        date_part = ''
        if '현재' in normalized:
            i_ref = normalized.find('현재')
            # 8-character window: 5 before the marker through 3 from its start.
            # max(..., 0) stops a negative start from wrapping to the end.
            date_part = normalized[max(i_ref - 5, 0):i_ref + 3]
        elif '년)' in normalized:
            i_ref = normalized.find('년)')
            # 8-character window: 6 before the marker through 2 from its start.
            date_part = normalized[max(i_ref - 6, 0):i_ref + 2]

        # Remove the date part (when one was found) before splitting the name
        # into its underscore-separated words.
        if date_part:
            words = normalized.replace(date_part, '').split('_')
        else:
            words = normalized.split('_')

        # NOTE(review): this is derived from row[1] exactly like encar_name,
        # so the two values are always equal — confirm whether a different
        # column was intended.
        encarbroad = norm(row[1].replace('_', '').replace(' ', ''))

        jsm_name = norm(row[2].replace('_', '').replace(' ', ''))
        spec_data[jsm_name] = {
            'url1': row[3],
            'url2': row[4],
            'col': get_col(row[5]),
            'encar': encar_name,
            'encarbroad': encarbroad,
            # First word plus words 2..second-to-last; words[1] is left out.
            'major_name': norm(' '.join([words[0]] + words[2:-1])),
            'minor_name': norm(words[-1]),
            'date_part': date_part,
        }
        jsm_to_encar[jsm_name] = encar_name

        # Append each jsm name once, keeping first-seen order so the lists
        # (and anything serialised from them) are stable between runs.
        for lookup, lookup_key in ((encar_to_jsm, encar_name), (encarbroad_to_jsm, encarbroad)):
            jsm_names = lookup.setdefault(lookup_key, [])
            if jsm_name not in jsm_names:
                jsm_names.append(jsm_name)



In [30]:
# Scrape two pages per car (spec_data[name]['url1'] and ['url2']) and merge the
# scraped values with the price and display names into full_data[name].
# Only names that have a price in pricing_data are fetched.
#
# A car whose pages cannot be fetched or do not have the expected structure is
# recorded in scrape_failures and skipped, so one bad page does not discard the
# results already collected. Later cells only use names present in full_data.
REQUEST_TIMEOUT_SECONDS = 30  # requests has no default timeout and could hang forever
STAT_KEYS = ('fuelEfficiency', 'fuel', 'horsepower')
DIMENSION_KEYS = (
    'sideOuter', 'frontOuter', 'frontHeight',
    'sideInner', 'frontInner', 'backInner', 'weight',
)

already_scraped = {}
full_data = {}
scrape_failures = {}
for name in spec_data:
    if name not in pricing_data or name in already_scraped:
        continue

    url = spec_data[name]['url1']
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        page.raise_for_status()  # do not parse an HTTP error page as if it were data
        soup = BeautifulSoup(page.content, 'html.parser')
        # Text of every <dd> inside the first <dl class="detail_lst">.
        stats = [x.get_text() for x in soup.find_all('dl', class_='detail_lst')[0].find_all('dd')]
        if len(stats) < len(STAT_KEYS):
            raise IndexError(f'expected at least {len(STAT_KEYS)} stats, found {len(stats)}')

        car_spec = dict(zip(STAT_KEYS, stats))

        url = spec_data[name]['url2']
        page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        # 'col' is 1-based; pick the matching <div class="lineup_btm_td">.
        col = soup.find_all('div', class_='lineup_btm_td')[spec_data[name]['col'] - 1]
        list_items = [x.get_text().strip() for x in col.find_all('li')]
        # Keep '정보없음' placeholders and items that mention mm or kg, but drop
        # items containing 'kg.' or 'inch'.
        list_items = [x for x in list_items if (x == '정보없음') or (('mm' in x or 'kg' in x) and ('kg.' not in x and 'inch' not in x))]
        if len(list_items) < len(DIMENSION_KEYS):
            raise IndexError(f'expected at least {len(DIMENSION_KEYS)} dimension items, found {len(list_items)}')

        # The first seven remaining items are assigned positionally.
        car_spec.update(zip(DIMENSION_KEYS, list_items))
    except (requests.RequestException, IndexError) as error:
        scrape_failures[name] = f'{url}: {error!r}'
        print(f'skipped {name}: {scrape_failures[name]}')
        continue

    car_spec['price'] = pricing_data[name]
    car_spec['name'] = name
    car_spec['majorName'] = spec_data[name]['major_name']
    car_spec['minorName'] = spec_data[name]['minor_name']
    car_spec['datePart'] = spec_data[name]['date_part']

    full_data[name] = car_spec
    already_scraped[name] = True

if scrape_failures:
    print(f'{len(scrape_failures)} car(s) could not be scraped; see scrape_failures')



In [31]:
# Directory of image files named '<class>xxxxx<year>xxxxx<image name>'.
# The ENCAR_IMAGE_DIR environment variable overrides the location; the original
# absolute path remains the default.
src = os.environ.get(
    'ENCAR_IMAGE_DIR',
    '/Volumes/TriveStorage/code/trive-image-recognition/complete_manual/encar_latest/images',
)

# data: file name -> {'original', 'label_1', 'label_2'} where label_1 is the
# class and label_2 is class + trimmed year, both with '_' and ' ' removed.
data = {}

# sorted() makes the iteration (and therefore the output) order deterministic;
# os.listdir returns entries in arbitrary order.
for img in sorted(os.listdir(src)):
    if img == '.DS_Store' or 'CARMODOO' in img:
        continue

    parts = img.split('xxxxx')
    if len(parts) != 3:
        # Not in the expected three-part form; unpacking it would raise
        # ValueError and abort the whole listing.
        continue
    vehicle_class, vehicle_year, image_name = parts
    vehicle_class = norm(vehicle_class)
    vehicle_year = norm(vehicle_year)

    if '(' in vehicle_year:
        # Drop the text between the first '년' and the first '('.
        # NOTE(review): when '년' is absent, find() returns -1, so the removed
        # span starts at index 0 — confirm that is intended.
        to_remove = vehicle_year[vehicle_year.find('년')+1:vehicle_year.find('(')]
        vehicle_year = vehicle_year.replace(to_remove, '')
    elif '년' in vehicle_year:
        # No parenthesis: keep everything up to and including the first '년'.
        vehicle_year = vehicle_year[:vehicle_year.find('년')+1]

    label_1 = vehicle_class.replace('_', '').replace(' ', '')
    label_2 = f'{vehicle_class}{vehicle_year}'.replace('_', '').replace(' ', '')
    data[img] = {
        'original': img,
        'label_1': label_1,
        'label_2': label_2,
    }
    
    

In [32]:
# Attach a 'mapping' entry to every image record whose label is a key of
# `mapping`. The more specific label_2 is tried first, then label_1; records
# matching neither are left without a 'mapping' key.
for record in data.values():
    for label_field in ('label_2', 'label_1'):
        label = record[label_field]
        if label in mapping:
            record['mapping'] = mapping[label]
            break



In [33]:
# For each image record that has a mapping known to jsm_to_encar, collect the
# scraped specs of every jsm name sharing the same encar name. The result is
# keyed by the NFC-normalised file name with spaces removed.
images_mapped = {}
for record in data.values():
    if 'mapping' not in record:
        continue
    mapped_name = record['mapping']
    if mapped_name not in jsm_to_encar:
        continue
    related_names = encar_to_jsm[jsm_to_encar[mapped_name]]
    specs = [full_data[related] for related in related_names if related in full_data]
    image_key = norm(record['original']).replace(' ', '')
    images_mapped[image_key] = specs



In [34]:

# Write images_mapped to images_mapped.json as UTF-8, keeping non-ASCII
# characters unescaped in the file.
with open('images_mapped.json', mode='w', encoding='utf-8') as out_file:
    serialized = json.dumps(images_mapped, ensure_ascii=False)
    out_file.write(serialized)
    





In [35]:
# Optional sanity check (left disabled): number of spec records matched per image.
# for key in images_mapped:
#     print(key, len(images_mapped[key]))

