In [1]:
import boto3

In [2]:
table_name = "car_listings"

client = boto3.client('dynamodb')
DB = boto3.resource('dynamodb')
table = DB.Table(table_name)

### Describe Table

In [3]:
response = client.describe_table(TableName=table_name)
# Getting number of items, could be potential ID
response['Table']['ItemCount']

0

In [4]:
import pandas as pd
from os import listdir
import re

def capitalize(make):
    
    make = re.sub("_", " ", make)
    make = make.split(" ")
    
    make = [i.capitalize() for i in make]
    
    make = " ".join(make)
    
    return make

def extract_date(name):
    return int(re.search(r"(\d{4})", name).group(1))

def clean_mileage(miles):
    
    mileage = re.sub(',', '', miles)
    mileage = re.sub(' mi.', '', mileage)
    
    return int(mileage)

def clean_rating_count(rating_count):
    
    words_to_remove = ['\(', '\)', ' reviews', ',', ' review']
    
    for i in words_to_remove:
        rating_count = re.sub(i, '', rating_count)
    
    return int(rating_count)

def clean_price(price):
    
    price = re.sub(',', '', price)
    price = price.strip('$')
    
    if price == 'Not Priced':
        return None
    
    return int(price)


# Aggregating and Cleaning the data

files = listdir('data')

data_all = []

for i in files:

    if '.png' in i:
        continue

    file_path = "data/{}".format(i)
    
    df = pd.read_csv(file_path, on_bad_lines = 'skip')
    df['Make'] = i[:-4]
    df = df.drop(columns=["Unnamed: 0"])
    
    data_all.append(df)
    
df_all = pd.concat(data_all, axis = 0, ignore_index=True)

df_transform = df_all.copy()
df_transform = df_transform.dropna()

df_transform['Make'] = df_all['Make'].apply(lambda x: capitalize(x))
df_transform['Name'].apply(lambda x: extract_date(x))
df_transform['Year'] = df_all['Name'].apply(lambda x: extract_date(x))
df_transform['Mileage'] = df_transform['Mileage'].apply(lambda x: clean_mileage(x))
df_transform['Rating Count'] = df_transform['Rating Count'].apply(lambda x: clean_rating_count(x))
df_transform['Price'] = df_transform['Price'].apply(lambda x: clean_price(x))

df_transform = df_transform.dropna()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

In [None]:
df_transform.head()

NameError: name 'df_transform' is not defined

In [None]:
df_transform.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117404 entries, 0 to 119943
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Name          117404 non-null  object 
 1   Mileage       117404 non-null  int64  
 2   Dealer Name   117404 non-null  object 
 3   Rating        117404 non-null  float64
 4   Rating Count  117404 non-null  int64  
 5   Price         117404 non-null  float64
 6   Make          117404 non-null  object 
 7   Year          117404 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.1+ MB


In [None]:
from datetime import datetime
import json
from decimal import Decimal

today = datetime.now().date().isoformat()
today

'2022-08-06'

In [None]:
item = df_transform.iloc[0].to_dict()
item['Date'] = today

item = json.loads(json.dumps(item), parse_float = Decimal)
item

{'Name': '2020 Acura TLX Technology',
 'Mileage': 13214,
 'Dealer Name': 'Piazza Acura of Ardmore',
 'Rating': Decimal('4.6'),
 'Rating Count': 138,
 'Price': Decimal('31999.0'),
 'Make': 'Acura',
 'Year': 2020,
 'Date': '2022-08-06'}

In [None]:
### Base Syncronous Code

In [None]:
primary_key = 0 

with table.batch_writer() as batch:
    
    for i, row in df_transform.iterrows():

        item = row.to_dict()
        item['Date'] = today
        item['ID'] = primary_key
        primary_key += 1
        
        item = json.loads(json.dumps(item), parse_float = Decimal)


        batch.put_item(
            Item = item
        )

In [None]:
scan = table.scan()
with table.batch_writer() as batch:
    for each in scan['Items']:
        batch.delete_item(
            Key={
                'ID': each['ID']
            }
        )