In [4]:
import boto3
import pandas as pd
from os import listdir
import re

In [5]:
# Name of the DynamoDB table that the cleaned listings are uploaded to.
table_name = "car_listings"

# Low-level DynamoDB client; not referenced again in the visible cells.
client = boto3.client('dynamodb')
# Resource-level handle, used only to obtain the Table object below.
DB = boto3.resource('dynamodb')
# Table object the upload cell writes to through table.batch_writer().
table = DB.Table(table_name)

In [7]:
def capitalize(make):
    """Turn a file-style make name into a display name.

    Underscores are replaced by spaces, then every space-separated word is
    passed through ``str.capitalize`` (first character upper-cased, the rest
    lower-cased), e.g. ``"alfa_romeo"`` -> ``"Alfa Romeo"``.
    """
    words = re.sub("_", " ", make).split(" ")
    return " ".join(word.capitalize() for word in words)

def extract_date(name):
    """Return the first run of four consecutive digits in ``name`` as an int.

    Used to pull the model year out of a listing title such as
    ``"2020 Acura TLX FWD"``. If ``name`` contains no four-digit run,
    ``re.search`` returns ``None`` and the attribute access raises
    ``AttributeError``.
    """
    year_match = re.search(r"(\d{4})", name)
    return int(year_match.group(1))

def clean_mileage(miles):
    """Convert a mileage string such as ``"20,051 mi."`` to an int.

    Thousands separators are removed and a trailing ``mi`` / ``mi.`` unit
    (with any surrounding whitespace) is stripped before conversion.

    Raises:
        ValueError: if what remains is not a valid integer literal.
    """
    digits = miles.replace(',', '')
    # The period is escaped and optional, and the match is anchored to the
    # end of the string: an unescaped '.' would match *any* character after
    # " mi" and would fail to strip a bare " mi" suffix.
    digits = re.sub(r'\s*mi\.?\s*$', '', digits)

    return int(digits)

def clean_rating_count(rating_count):
    """Convert a review-count label such as ``"(1,234 reviews)"`` to an int.

    Parentheses, thousands separators and the `` review`` / `` reviews``
    suffix are removed before conversion.

    Raises:
        ValueError: if what remains is not a valid integer literal.
    """
    # One raw-string pattern replaces the former list of patterns: '\(' and
    # '\)' in plain (non-raw) literals are invalid escape sequences, which
    # Python warns about. ' reviews?' matches the plural greedily, so no
    # stray 's' is left behind.
    cleaned = re.sub(r'[(),]| reviews?', '', rating_count)

    return int(cleaned)

def clean_price(price):
    """Convert a price label such as ``"$32,568"`` to an int.

    Commas are removed and leading/trailing ``$`` characters are stripped.
    The placeholder ``'Not Priced'`` yields ``None`` instead of a number.
    """
    bare = re.sub(',', '', price).strip('$')
    return None if bare == 'Not Priced' else int(bare)


# Aggregating and Cleaning the data
#
# Every CSV under data/ is read into one frame; the file name without its
# four-character extension is stored as that file's 'Make'.

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the index) of df_all reproducible between runs.
files = sorted(listdir('data'))

data_all = []

for file_name in files:

    # Only CSV files are read: the [:-4] slice below assumes a ".csv" suffix,
    # and anything else in the folder cannot be parsed as listings.
    if not file_name.endswith('.csv'):
        continue

    file_path = "data/{}".format(file_name)

    df = pd.read_csv(file_path, on_bad_lines='skip')
    df['Make'] = file_name[:-4]
    # "Unnamed: 0" is dropped when present; a file without it is accepted
    # as-is instead of aborting the whole load.
    df = df.drop(columns=["Unnamed: 0"], errors='ignore')

    data_all.append(df)

df_all = pd.concat(data_all, axis=0, ignore_index=True)

# Drop incomplete rows first so the cleaning helpers only ever receive
# strings, never NaN.
df_transform = df_all.dropna().copy()

# All derived columns are computed from df_transform itself, so the helpers
# never run on rows that were just dropped.
df_transform['Make'] = df_transform['Make'].apply(capitalize)
df_transform['Year'] = df_transform['Name'].apply(extract_date)
df_transform['Mileage'] = df_transform['Mileage'].apply(clean_mileage)
df_transform['Rating Count'] = df_transform['Rating Count'].apply(clean_rating_count)
df_transform['Price'] = df_transform['Price'].apply(clean_price)

# clean_price returns None for 'Not Priced' listings; remove those rows too.
df_transform = df_transform.dropna()

In [8]:
# Show the first rows of the cleaned frame to sanity-check the transforms.
df_transform.head()

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Rating Count,Price,Make,Year
0,2020 Acura TLX FWD,20051,Niello Acura,4.6,132,32568.0,Acura,2020
1,2022 Acura MDX Technology,19061,Ed Voyles Acura,3.9,86,58991.0,Acura,2022
2,2020 Acura TLX FWD,30131,Hiley Acura,3.7,61,31445.0,Acura,2020
3,2019 Acura RDX,12097,Acura of Boston,4.7,600,40595.0,Acura,2019
4,2021 Acura RDX A-Spec,16250,Fountain Acura,4.4,314,43997.0,Acura,2021


In [9]:
# Summarise column dtypes and non-null counts of the cleaned frame.
df_transform.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119643 entries, 0 to 122244
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Name          119643 non-null  object 
 1   Mileage       119643 non-null  int64  
 2   Dealer Name   119643 non-null  object 
 3   Rating        119643 non-null  float64
 4   Rating Count  119643 non-null  int64  
 5   Price         119643 non-null  float64
 6   Make          119643 non-null  object 
 7   Year          119643 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.2+ MB


In [10]:
from datetime import datetime
import json
from decimal import Decimal

# Calendar date of this run as an ISO-8601 string (YYYY-MM-DD), taken from
# the local clock; str() of a date object is its isoformat().
today = str(datetime.now().date())
today

'2022-08-06'

In [11]:
# Build one sample record the same way the upload cell does, to inspect the
# payload before writing anything to DynamoDB.
item = {**df_transform.iloc[0].to_dict(), 'Date': today}

# Round-trip through JSON so every float is re-parsed as Decimal.
item = json.loads(json.dumps(item), parse_float=Decimal)
item

{'Name': '2020 Acura TLX FWD',
 'Mileage': 20051,
 'Dealer Name': 'Niello Acura',
 'Rating': Decimal('4.6'),
 'Rating Count': 132,
 'Price': Decimal('32568.0'),
 'Make': 'Acura',
 'Year': 2020,
 'Date': '2022-08-06'}

### Uploading Records to DynamoDB


In [None]:
from botocore.exceptions import ValidationError

# Upload every cleaned listing to the DynamoDB table.
#
# Errors raised by put_item are recorded and reported after the loop instead
# of being silently discarded, so a partially failed upload is visible.
# NOTE(review): batch_writer buffers puts and sends them in batches, so an
# error raised here may concern other items of the same batch, and a failure
# while flushing the last batch is raised by the `with` statement itself.
# NOTE(review): two listings with identical name, price and mileage get the
# same ID - confirm this is acceptable for the table's key schema.
failed_items = []

with table.batch_writer() as batch:

    for _, row in df_transform.iterrows():

        item = row.to_dict()
        item['Date'] = today
        item['ID'] = "{}-{}-{}".format(row['Name'], row['Price'], row['Mileage'])

        # Round-trip through JSON so every float is re-parsed as Decimal.
        item = json.loads(json.dumps(item), parse_float=Decimal)

        try:
            batch.put_item(Item=item)
        except Exception as error:
            # Best-effort upload: keep going, but remember what went wrong.
            failed_items.append((item['ID'], error))

if failed_items:
    print("{} put_item call(s) failed; first error for ID {!r}: {!r}".format(
        len(failed_items), failed_items[0][0], failed_items[0][1]))

        