In [2]:
# import dependencies here
import csv
import os
import pandas as pd
from pprint import pprint
from typing import Dict, List

In [60]:
# list all constants here
# Directory that is listed for the individually scraped ByeCar csv files.
# The original absolute path stays as the default; set the BYECAR_DATA_PATH
# environment variable to point the notebook elsewhere without editing it.
DATA_PATH = os.environ.get(
    'BYECAR_DATA_PATH',
    '/Volumes/TriveStorage/support_vector_machine/scraper-carsdotcom/byecar_data',
)
# Name of the combined csv; it is a relative path, so it resolves against the
# current working directory.
COMBINED_CSV_NAME = 'byecar_combined.csv'

def is_csv(file_path: str) -> bool:
    """Return True when the extension of ``file_path`` is exactly ``.csv``.

    The comparison is case-sensitive, and a name that consists only of
    ``.csv`` has no extension according to ``os.path.splitext``, so it is
    rejected.
    """
    _, extension = os.path.splitext(file_path)
    return extension == '.csv'

def get_csv_files(file_paths: List[str]) -> List[str]:
    """Return the entries of ``file_paths`` that ``is_csv`` accepts, in their original order."""
    return [path for path in file_paths if is_csv(path)]

def csv_to_dict(row: List[str]) -> Dict:
    """Turn one positional csv row into a keyed record.

    Columns 0-5 are copied unchanged under the keys ``car_id``, ``name``,
    ``description``, ``badges``, ``bids`` and ``price``; column 6 is converted
    with ``int`` and stored as ``timestamp``. Any further columns are ignored.

    Raises:
        IndexError: if ``row`` has fewer than seven columns.
        ValueError: if column 6 is not an integer literal.
    """
    text_fields = ('car_id', 'name', 'description', 'badges', 'bids', 'price')
    record = {key: row[position] for position, key in enumerate(text_fields)}
    record['timestamp'] = int(row[6])
    return record

def extract_month(row) -> int:
    """Parse the month number out of ``row['description']``.

    The text before the first ``'월'`` is taken, and within it the piece
    right after the first ``'년 '`` is converted to an integer, so
    ``'2012년 5월 ...'`` yields ``5``.

    Returns:
        The month as an ``int``. If the description is missing or cannot be
        parsed, the error is printed and ``0`` is returned.
    """
    try:
        # int() makes the success path agree with the declared return type and
        # with the 0 fallback; previously the raw substring (a str) leaked out.
        return int(row['description'].split('월')[0].split('년 ')[1])
    except Exception as e:
        print(e)
        return 0

def extract_year(row) -> int:
    """Parse the year from ``row['description']``.

    Everything before the first ``'년'`` is converted with ``int``. If that
    fails for any reason the error is printed and ``0`` is returned.
    """
    try:
        year_text = row['description'].split('년')[0]
        year = int(year_text)
    except Exception as error:
        print(error)
        return 0
    return year

def extract_bid_count(row) -> int:
    """Parse the number of bids from ``row['bids']``.

    The text before the first ``'회'`` is taken, and the piece right after
    the first ``'입찰횟수 '`` within it is converted with ``int``. On any
    failure the error is printed and ``0`` is returned.
    """
    try:
        before_suffix = row['bids'].split('회')[0]
        count_text = before_suffix.split('입찰횟수 ')[1]
        bid_count = int(count_text)
    except Exception as error:
        print(error)
        return 0
    return bid_count
    
def extract_dealer_count(row) -> int:
    """Parse the number of participating dealers from ``row['bids']``.

    The text before the first ``'명'`` is taken, and the piece right after
    the first ``'참여딜러 '`` within it is converted with ``int``. On any
    failure the error is printed and ``0`` is returned.
    """
    try:
        before_suffix = row['bids'].split('명')[0]
        count_text = before_suffix.split('참여딜러 ')[1]
        dealer_count = int(count_text)
    except Exception as error:
        print(error)
        return 0
    return dealer_count
    
def extract_accident(row) -> bool:
    """Report whether ``row['badges']`` contains the badge text '무사고'.

    Returns:
        ``True`` when '무사고' occurs in the badges, otherwise ``False``.

    NOTE(review): '무사고' reads as "accident-free", so ``True`` appears to
    mean the car has *no* accident history even though the notebook stores
    this value in a column named 'accident' — confirm the intended polarity
    before using the column as a feature.
    """
    # The membership test yields a bool, so the annotation says bool, not int.
    return '무사고' in row['badges']

def extract_km(row) -> int:
    """Parse the kilometre figure from ``row['description']``.

    The description is split on ``' / '``; the second part has every ``'km'``
    removed and is converted with ``int``. On any failure the error is
    printed and ``0`` is returned.
    """
    try:
        km_field = row['description'].split(' / ')[1]
        kilometres = int(km_field.replace('km', ''))
    except Exception as error:
        print(error)
        return 0
    return kilometres

def extract_location(row) -> str:
    """Return the third ``' / '``-separated part of ``row['description']``.

    Returns:
        The location text. If the description is missing or has fewer than
        three parts, the error is printed and an empty string is returned,
        following the print-and-default pattern of the other extractors, so
        one malformed row does not abort a whole ``DataFrame.apply``.
    """
    try:
        return row['description'].split(' / ')[2]
    except Exception as e:
        print(e)
        return ''

def convert_price(row) -> int:
    """Convert the price text in ``row['price']`` to an integer amount.

    The text before the first ``' 만원'`` is taken; the piece right after the
    first ``'현재가 '`` within it has its commas removed, is converted with
    ``int`` and multiplied by 10,000. On any failure the error is printed and
    ``0`` is returned.
    """
    try:
        before_unit = row['price'].split(' 만원')[0]
        digits = before_unit.split('현재가 ')[1].replace(',', '')
        amount = int(digits) * 10_000
    except Exception as error:
        print(error)
        return 0
    return amount


In [45]:
# combine all separately scraped ByeCar data into a single csv
csv_names = get_csv_files(os.listdir(DATA_PATH))

# Mode 'w' already truncates an existing file, so no separate delete is needed.
# The with-block closes the output even when a source file fails part-way, and
# newline='' is what the csv module asks for on files it reads or writes.
with open(COMBINED_CSV_NAME, 'w', newline='') as combined_byecar:
    writer_combined_byecar = csv.writer(combined_byecar, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for csv_name in csv_names:
        # The file stem must be an integer; it is appended to every row of
        # that file as its timestamp column.
        timestamp = int(os.path.splitext(csv_name)[0])
        with open(os.path.join(DATA_PATH, csv_name), newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                row.append(timestamp)
                writer_combined_byecar.writerow(row)

In [20]:
# load all csv data into an intermediate list of dictionaries (for Pandas)
byecar_data_for_dataframe = []

# Every row of the combined csv becomes one keyed record via csv_to_dict.
with open(COMBINED_CSV_NAME) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    byecar_data_for_dataframe.extend(map(csv_to_dict, csv_reader))

In [61]:
# convert the combined csv into a pandas dataframe, and format columns
def _per_row(extractor):
    """Wrap ``extractor`` so ``assign`` applies it to every row of the frame."""
    return lambda frame: frame.apply(extractor, axis=1)


# The derived columns are added in one chain, in the same order as before
# ('price' is overwritten in place), and the raw text columns they were parsed
# from are dropped at the end.
df_byecar = (
    pd.DataFrame(byecar_data_for_dataframe)
    .assign(
        accident=_per_row(extract_accident),
        dealer_count=_per_row(extract_dealer_count),
        bid_count=_per_row(extract_bid_count),
        year=_per_row(extract_year),
        month=_per_row(extract_month),
        km=_per_row(extract_km),
        location=_per_row(extract_location),
        price=_per_row(convert_price),
    )
    .drop(columns=['badges', 'bids', 'description'])
)

df_byecar.head()
# To inspect a whole column without display truncation, e.g.:
#     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#         print(df_byecar['month'])

Unnamed: 0,car_id,name,price,timestamp,accident,dealer_count,bid_count,year,month,km,location
0,69226,기아 레이 프레스티지,2900000,1547797152,True,8,8,2012,5,39579,광주 북구 동림동
1,69224,포드 익스플로러 2.3 리미티드 4WD,39000000,1547797152,True,2,2,2018,2,17500,서울 강북구 미아동
2,69223,기아 올 뉴 카니발 9인승 프레스티지,16100000,1547797152,True,2,2,2015,7,88105,부산 기장군 정관면
3,69222,렉서스 CT200h 1.8 컴팩트 럭셔리,8600000,1547797152,False,2,2,2011,9,78000,경기 수원시 영통구 영통동
4,69221,르노삼성 SM5 뉴 임프레션 LE,450000,1547797152,False,3,3,2009,12,39000,경기 성남시 수정구 태평동
