# 1. Defining features

In [1]:
# Cache switches (1 = load the pickled result from Data/, anything else = recompute
# and overwrite the cache file).
# Gates the raw table load; its complement (`^ 1`) is also passed to data_tools as 'readdb'.
use_cached_tables = 1
# Gates the transformed-data load (Data/data_cache.dat).
use_cached_data = 1
# Gates the per-batch engine score files (Data/engine_scores_<start>_<end>.dat).
use_cached_engine_scores = 1
# Gates the per-batch vehicle score files (Data/vehicle_scores_<start>_<end>.dat).
use_cached_vehicle_scores = 1

In [2]:
# Vehicle feature name -> source field, handed to tools.transform_data_from_tables.
# NOTE(review): the values look like column names (some table-qualified) — confirm
# against data_tools.
vehicle_features = {
    'make': 'make_id',
    'model': 'model_id',
    'year_from': 'from',
    'year_to': 'to',
    'model_code': 'uvdb_model_codes.id',
    'body': 'body_type_id',
    'drive': 'drive_type_id',
    'engine': 'defined_engine_id',
}

In [3]:
# Feature names for the supercheap vehicle side, handed to
# tools.transform_data_from_tables.
supercheap_vehicle_features = [
    'make',
    'year',
    'model',
    'model_code',
    'body',
    'drive_id',
]

In [4]:
# Engine feature name -> source field, handed to tools.transform_data_from_tables.
# NOTE(review): the values look like column names (one table-qualified) — confirm
# against data_tools.
engine_features = {
    'capacity': 'cc',
    'designation': 'uvdb_defined_engine_designations.engine_designation_id',
    'block': 'block_type',
    'cylinders': 'cylinders',
    'valves': 'valves_id',
    'head': 'cylinder_head_type_id',
    'aspiration': 'aspiration_id',
    'delivery': 'fuel_delivery_subtype_id',
    'power': 'power_output_id',
}

In [5]:
# Declared type per engine feature, handed to tools.calculate_engine_scores.
# Only a subset of the engine features is listed here.
# NOTE(review): how unlisted features (designation, block, head, aspiration) are
# typed is decided inside data_tools — confirm.
engine_feature_types = {
    'capacity': 'float',
    'cylinders': 'int',
    'valves': 'int',
    'power': 'float',
    'delivery': 'string',
}

In [6]:
# Declared type per vehicle feature, handed to tools.calculate_vehicle_scores.
vehicle_feature_types = {
    'make': 'string',
    'year': 'int',
    'from': 'int',
    'to': 'int',
    'model': 'string',
    'model_code': 'string',
    'body': 'string',
    'drive_id': 'int',
    'defined_engine_id': 'string',
}

In [7]:
from collections import OrderedDict
from sqlalchemy import create_engine
from Levenshtein import distance
import numpy as np
import pandas as pd
import numba as nb
import scipy as sp
import pymysql, time, re, math, pickle, h5py, os
from scipy.sparse import csr_matrix
from IPython.core.debugger import Tracer
# import warnings
# warnings.filterwarnings('ignore')

In [8]:
from data_tools import data_tools

In [9]:
# Connection settings handed to data_tools.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set PARTLY_DB_PASSWD (and optionally PARTLY_DB_USER / PARTLY_DB_HOST /
# PARTLY_DB_PORT) before starting the kernel.
# SECURITY: a password was previously committed in this cell — rotate it.
config = {
    'user': os.environ.get('PARTLY_DB_USER', 'staging'),
    'passwd': os.environ.get('PARTLY_DB_PASSWD', ''),
    'host': os.environ.get('PARTLY_DB_HOST', '10.106.48.3'),
    'port': os.environ.get('PARTLY_DB_PORT', '3306'),
    'dbs': ["ebay.supercheap_data_2", "partly_staging"],
    # 1 when the table cache is disabled (flag is 0), 0 when it is enabled (flag is 1).
    'readdb': use_cached_tables ^ 1
}

# Fail early with a clear message instead of a driver-level authentication error.
# NOTE(review): assumes data_tools only needs the password when 'readdb' is truthy — confirm.
if config['readdb'] and not config['passwd']:
    raise RuntimeError(
        "PARTLY_DB_PASSWD is not set; export it or set use_cached_tables = 1 "
        "to work from the table cache."
    )

In [10]:
# Project helper used below for table loading, data transformation, scoring
# and top-k selection.
tools = data_tools(config)

# 2. Loading tables

In [11]:
# Load the raw tables and print how long it took (seconds).
# With the cache flag set to 1 the tables come from Data/table_cache.dat;
# otherwise they are read through data_tools and the cache file is rewritten.
# NOTE: pickle.load can execute arbitrary code — only load cache files you produced.
t0 = time.time()
if use_cached_tables != 1:
    df = tools.read_df_from_table()
    with open('Data/table_cache.dat', 'wb') as cache_file:
        pickle.dump(df, cache_file)
else:
    with open('Data/table_cache.dat', 'rb') as cache_file:
        df = pickle.load(cache_file)
t1 = time.time()
total = t1 - t0
print(total)

0.5572447776794434


# 3. Loading data

In [12]:
# Build the transformed data set and print how long it took (seconds).
# With the cache flag set to 1 the data comes from Data/data_cache.dat;
# otherwise it is derived from the loaded tables and the cache file is rewritten.
# NOTE: pickle.load can execute arbitrary code — only load cache files you produced.
t0 = time.time()
if use_cached_data != 1:
    data = tools.transform_data_from_tables(
        df, vehicle_features, supercheap_vehicle_features, engine_features
    )
    with open('Data/data_cache.dat', 'wb') as cache_file:
        pickle.dump(data, cache_file)
else:
    with open('Data/data_cache.dat', 'rb') as cache_file:
        data = pickle.load(cache_file)
t1 = time.time()
total = t1 - t0
print(total)

0.2353827953338623


# 4. Batch processing

In [13]:
# Weight applied to each per-feature engine score before the scores are summed
# in calculate_one_batch.
engine_factors = dict(
    designation=20,
    capacity=9,
    power=3,
    block=9,
    cylinders=10,
    valves=8,
    head=9,
    aspiration=6,
    delivery=5,
)

In [14]:
# Weight applied to each per-feature vehicle score before the scores are summed
# in calculate_one_batch.
# NOTE(review): the key here is 'drive', while supercheap_vehicle_features and
# vehicle_feature_types use 'drive_id' — confirm the vehicle scores are keyed by 'drive'.
vehicle_factors = dict(
    make=3,
    year=7,
    model=10,
    model_code=12,
    body=7,
    drive=10,
)

In [16]:
def get_batch_data(data, start, end, white_list):
    """Return a new dict holding the batch view of every entry in ``data``.

    Args:
        data: Mapping of name -> sliceable object.
        start: Inclusive start position of the batch.
        end: Exclusive end position of the batch.
        white_list: Names whose entries are cut down to ``[start:end]``.

    Returns:
        A dict with the same keys as ``data``. Whitelisted entries are sliced
        to ``[start:end]``; every other entry is taken whole via ``[:]``.
    """
    return {
        name: (table[start:end] if name in white_list else table[:])
        for name, table in data.items()
    }

In [17]:
def calculate_one_batch(start, end, full_data, engine_factors, vehicle_factors):
    """Score one batch of rows and return its top-5 engine and vehicle matches.

    Relies on the notebook globals ``tools``, ``use_cached_engine_scores``,
    ``use_cached_vehicle_scores``, ``engine_feature_types`` and
    ``vehicle_feature_types``.

    Args:
        start: Inclusive start position of the batch (also part of the cache file names).
        end: Exclusive end position of the batch (also part of the cache file names).
        full_data: Dict of data sets; 'super_vehicle' and 'super_engine' are
            sliced to the batch, every other entry is used whole.
        engine_factors: Feature name -> weight for the engine scores.
        vehicle_factors: Feature name -> weight for the vehicle scores.

    Returns:
        Dict with keys 'engine' and 'vehicle', each the result of
        ``tools.get_top_k`` with k=5.
    """

    def _load_or_compute(cache_path, use_cache, compute):
        # Read the pickled scores when caching is on; otherwise compute them
        # and refresh the cache file.
        if use_cache == 1:
            with open(cache_path, 'rb') as cache_file:
                return pickle.load(cache_file)
        fresh = compute()
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(fresh, cache_file)
        return fresh

    def _weighted_sum(feature_scores, weights):
        # Sum of feature_scores[name] * weight over all weights; None when
        # weights is empty.
        combined = None
        for feature, weight in weights.items():
            weighted = feature_scores[feature] * weight
            if combined is None:
                combined = weighted
            else:
                combined += weighted
        return combined

    batch_len = end - start
    results = {}

    # Batch view: only the supercheap tables are sliced.
    batch = get_batch_data(full_data, start, end, ['super_vehicle', 'super_engine'])

    # Engine scores (timed, cached per batch).
    began = time.time()
    engine_scores = _load_or_compute(
        f'Data/engine_scores_{start}_{end}.dat',
        use_cached_engine_scores,
        lambda: tools.calculate_engine_scores(batch, batch_len, engine_feature_types),
    )
    elapsed = time.time() - began
    print(f"engine: {elapsed}")

    # Weighted total of the engine scores, then the top 5 per row.
    engine_total = _weighted_sum(engine_scores, engine_factors)
    results['engine'] = tools.get_top_k(
        engine_total, 5, batch['super_engine'], batch['defined_engine']
    )

    # Attach the matched engine information to the supercheap vehicles so the
    # vehicle scoring below sees it.
    batch['super_vehicle'] = tools.add_engine_info(results['engine'], batch['super_vehicle'])

    # Vehicle scores (timed, cached per batch).
    began = time.time()
    vehicle_scores = _load_or_compute(
        f'Data/vehicle_scores_{start}_{end}.dat',
        use_cached_vehicle_scores,
        lambda: tools.calculate_vehicle_scores(batch, batch_len, vehicle_feature_types),
    )
    elapsed = time.time() - began
    print(f"vehicle engine: {elapsed}")

    # Weighted total of the vehicle scores, then the top 5 per row.
    vehicle_total = _weighted_sum(vehicle_scores, vehicle_factors)
    results['vehicle'] = tools.get_top_k(
        vehicle_total, 5, batch['super_vehicle'], batch['full_vehicle']
    )

    return results

In [20]:
# Run the matching batch by batch and store each batch's top-k results in an HDF5 file.
path = './Data/csr_matrix.h5'

# Debug limit on the number of rows processed; set to None to process every row.
debug_row_limit = 20
available_rows = data['super_vehicle'].shape[0]
# Never ask for more rows than exist, otherwise the last batch is shorter than
# the dataset slice it is written into.
full_size = available_rows if debug_row_limit is None else min(debug_row_limit, available_rows)

# Rows per batch. Must be an int: it is used both for slicing and as the HDF5
# chunk length. (A float 1e3 assignment previously preceded this value and was
# immediately overwritten.)
batch_size = 5

ascii_type = h5py.string_dtype('ascii', 70)

# option_context restores the pandas setting even if a batch raises.
# NOTE(review): the chained-assignment warning is presumably triggered inside
# data_tools while batches are processed — confirm it is safe to silence.
# Mode 'w' creates the file and truncates any previous run's output.
with pd.option_context('mode.chained_assignment', None), h5py.File(path, "w") as f:
    engine_scores_dset = f.create_dataset('engine_scores', (full_size, 2), maxshape=(None, 2),
                                          dtype=ascii_type, chunks=(batch_size, 2))
    vehicle_scores_dset = f.create_dataset('vehicle_scores', (full_size, 2), maxshape=(None, 2),
                                           dtype=ascii_type, chunks=(batch_size, 2))

    for start in range(0, full_size, batch_size):
        # The final batch may be shorter than batch_size.
        end = min(start + batch_size, full_size)
        print(f"processing {start} : {end}")
        each_result = calculate_one_batch(start, end, data, engine_factors, vehicle_factors)
        # Write this batch's rows into their slot of the pre-sized datasets.
        engine_scores_dset[start:end] = each_result['engine'].astype(ascii_type)
        vehicle_scores_dset[start:end] = each_result['vehicle'].astype(ascii_type)

processing 0 : 5
0.6702170372009277
cal engine: 0.674724817276001
1.5322656631469727
vehicle engine: 1.5369303226470947
processing 5 : 10
0.6581687927246094
cal engine: 0.6608150005340576
1.5108158588409424
vehicle engine: 1.5151381492614746
processing 10 : 13
0.39811158180236816
cal engine: 0.39987826347351074
0.9407675266265869
vehicle engine: 0.9432008266448975
