# I will ingest a CSV into postgresql and normalize it.
# Or will I normalize it partly in python and then ingest and do finishing touches?
First creating and initiating our venv:

In [253]:
# python3 -m venv venv


In [254]:
# . venv/bin/activate

Let's ingest to a df

In [255]:
# pip install pandas

In [256]:
import pandas as pd

df = pd.read_csv("/Users/bfaris96/Desktop/turing-proj/cars_db/FINAL_SPINNY_900.csv")

In [257]:
df.shape

(976, 20)

In [258]:
# df.head(15)

Checking for any nulls:

In [259]:
print(df[df.isnull().any(axis=1)])

Empty DataFrame
Columns: [Car_Name, Make, Model, Make_Year, Color, Body_Type, Mileage_Run, No_of_Owners, Seating_Capacity, Fuel_Type, Fuel_Tank_Capacity(L), Engine_Type, CC_Displacement, Transmission, Transmission_Type, Power(BHP), Torque(Nm), Mileage(kmpl), Emission, Price]
Index: []


Stripping extra whitespace:

In [260]:
df = df.applymap(lambda x: ' '.join(x.split()) if isinstance(x, str) else x)

Forcing lowercase:

In [261]:
df = df.applymap(lambda s: s.lower() if type(s) == str else s)



Eliminating duplicate entries:

In [262]:
df = df.drop_duplicates()

In [263]:
df.shape

(914, 20)

In [264]:
df["Engine_Type"].value_counts()

Engine_Type
kappa vtvt petrol engine       77
petrol engine                  41
k10b                           38
1.2l vvt engine                28
front wheel drive              26
                               ..
tdci                            1
1.5 l turbocharged revotorq     1
multijet/ddis                   1
1.5l petrol                     1
1.5 litre diesel engine         1
Name: count, Length: 131, dtype: int64

Removing all non-numeric characters from mileage and price columns

In [265]:
df['Mileage_Run'] = df['Mileage_Run'].str.replace(r'\D', '', regex=True)
df['Price'] = df['Price'].str.replace(r'\D', '', regex=True)



Removing year range from name field:

In [266]:
import re

def remove_date_name(row):
    # Split the car name field into individual words/phrases
    name_list = list(row["Car_Name"].split())

    # Create regex to match [2000-2018] type strings in name
    year_pattern = re.compile(r"\[(\d{4})-(\d{4})\]")
    year_match = re.search(year_pattern, row["Car_Name"])

    # Remove from name list if match found
    if year_match is not None and year_match.group(0) in name_list:
        name_list.remove(year_match.group(0))

    return " ".join(name_list)
    
df["Car_Name"] = df.apply(remove_date_name, axis=1)




In [267]:
import re

def clean_engine(df):
    engine_string = df["Engine_Type"]
    # Remove the pattern of number followed by the word "speed"
    engine_string = re.sub(r'\b\d+\s*speed\b', '', engine_string)

    # Now we split the string into a list of words
    engine_list = engine_string.split()

    # Define the list of unwanted words
    unwanted_words = ["petrol", "diesel", "cng", "lpg", "electric", "petrol+cng", "petrol+electric", "engine", "automatic", "manual", "transmission"]

    # Create a new list with only the words that are not in unwanted_words
    engine_list = [word for word in engine_list if word not in unwanted_words]

    # Join the words back together into a string
    new_engine = " ".join(engine_list)
    return new_engine

df["Engine_Type"] = df.apply(clean_engine, axis=1)


Inserting engine_litres columns:

In [268]:
df['engine_litres'] = None

2023-7-12 we have to chang ethis to split liters out into the new column from name or engine type. also update db schema

In [269]:

def move_liters(row):

    # Create a regular expression to match engine sizes in litres (e.g., 2.5l)
    liter_pattern = re.compile(r"\b\d+\.\d+[lL]\b")
    
    # Create a regular expression to match numeric values (e.g., 2.5)
    no_l_liter_pattern = re.compile(r"\b\d+\.\d+\b")

    fields = [row["Car_Name"], row["Engine_Type"]]

    for pattern in [liter_pattern, no_l_liter_pattern]:
        for field in fields:
            match = re.search(pattern, field)
            if match is not None:
                if pattern == liter_pattern:
                    return match.group(0)[:-1]
                elif pattern == no_l_liter_pattern:
                    return match.group(0)
    return None

df['engine_litres'] = df.apply(move_liters, axis=1)



More patterns to be removed left over from litre information:

In [270]:
# Regular expression patterns to be removed
patterns = [
    r"\b\d+\.\d+\s[lL]\b", # Matches '1.2 l' or '2.2 l' with a space before the 'l'
    r"\b\d+\.\d+[lL]?\b", # Matches '1.6l' or '1.9l' with no space before the 'l' and 'l' is optional
    r"( litre)|(-litre)", # Matches ' litre' or '-litre'
    r"gasoline", # Matches 'gasoline'
]

# Apply each pattern to each column
for column in ["Car_Name", "Engine_Type"]:
    for pattern in patterns:
        df[column] = df[column].apply(lambda x: re.sub(pattern, "", x))

Remove redundant words from car name field:

In [271]:
def clean_name(row):

    # Initialize a list to store the cleaned name
    new_name_list = list()

    # Split the name, engine type, car make and model into individual words
    name_list = row["Car_Name"].split()
    make_words = row["Make"].split()
    model_words = row["Model"].split()
    engine_type_words = row["Engine_Type"].split()

    # For each word in the car name, if it is not in the engine type, make, model or other row values, add it to the new name
    for word in name_list:
        if word not in engine_type_words and word not in model_words and word not in make_words:
            new_name_list.append(word)

    return " ".join(new_name_list)

df["Car_Name"] = df.apply(clean_name, axis=1)

Remove drive train info from engine type, put in new drive train column:

In [272]:
df["drive_train"] = None

In [273]:
def move_drive_train(row):
    # Create a regular expression to match drive train types
    drive_train_pattern = re.compile(r"\b\w+\s+wheel\s+drive\b")

    match = re.search(drive_train_pattern, row["Engine_Type"])
    if match is not None:
        return match.group(0)

df["drive_train"] = df.apply(move_drive_train, axis=1)



Remove drive train from engine type: 

In [274]:
drive_train_pattern = re.compile(r"\b\w+\s+wheel\s+drive\b")
df["Engine_Type"] = df["Engine_Type"].apply(lambda x: re.sub(drive_train_pattern, "", x))

In [275]:
df[["Engine_Type", "drive_train"]].head(50)

Unnamed: 0,Engine_Type,drive_train
0,tdi,
1,kappa,
2,i-vtec,
3,,
4,kappa vtvt,
5,kappa vtvt,
6,4 cylinder inline,
7,kryotec turbocharge,
8,kappa,
9,,


To standardize and constrain columns, I'll get all the unique values for relevant columns:

In [238]:
columns = ['Car_Name', 'Make', 'Model', 'Color', 'Body_Type','No_of_Owners','Fuel_Type', 'Engine_Type','Transmission','Transmission_Type', 'Emission']
for column in columns:
    print(df[column].value_counts())

Car_Name
vxi                             73
sportz                          36
asta                            28
vx                              25
sx                              25
                                ..
rxt mt                           1
sharp petrol turbo dct 6-str     1
titanium +                       1
magna corporate edition          1
flair edition diesel             1
Name: count, Length: 241, dtype: int64
Make
hyundai          293
maruti suzuki    285
honda             73
renault           66
ford              46
toyota            30
volkswagen        27
tata              23
mg motors         23
mahindra          18
kia                8
chevrolet          6
skoda              5
nissan             4
jeep               4
datsun             3
Name: count, dtype: int64
Model
elite i20        64
i10              36
grand i10        35
baleno           32
verna            32
                 ..
a-star            1
zest              1
corolla altis     1
safari        

Now I'll convert owner ordinals to an int:

In [276]:
def owner_to_int(df):
    if df['No_of_Owners'] == '1st':
        return 1
    elif df['No_of_Owners'] == '2nd':
        return 2
    elif df['No_of_Owners'] == '3rd':
        return 3
    elif df['No_of_Owners'] == '4th':
        return 4
    elif df['No_of_Owners'] == '5th':
        return 5


df['No_of_Owners'] = df.apply(owner_to_int, axis=1)



Now I'll convert emission type to 1 character:

In [277]:
def emission_to_int(df):
    if df['Emission'] == 'bs iv':
        return 4
    elif df['Emission'] == 'bs iii':
        return 3
    elif df['Emission'] == 'bs vi':
        return 6
    elif df['Emission'] == 'bs v':
        return 5
    elif df['Emission'] == 'bs ii':
        return 2

df['Emission'] = df.apply(emission_to_int, axis=1)

Now I will shorten fuel type to 1-2 characters 

In [278]:
def shorten_fuel_type(df):
    if df['Fuel_Type'] == 'diesel':
        return 'd'
    elif df['Fuel_Type'] == 'petrol':
        return 'p'
    elif df['Fuel_Type'] == 'petrol+cng':
        return 'pc'
    elif df['Fuel_Type'] == 'cng':
        return 'c'
    elif df['Fuel_Type'] == 'lpg':
        return 'l'
    elif df['Fuel_Type'] == 'electric':
        return 'e'
    elif df['Fuel_Type'] == 'petrol+electric':
        return 'pe'

df['Fuel_Type'] = df.apply(shorten_fuel_type, axis=1)

Now I will shorten the transmission type to 1 character:

In [279]:
def shorten_transmission_type(df):
    if df['Transmission_Type'] == 'manual':
        return 'm'
    elif df['Transmission_Type'] == 'automatic':
        return 'a'

df['Transmission_Type'] = df.apply(shorten_transmission_type, axis=1)

Now I will shorten the transmission gears field to 1 character:

In [280]:
def shorten_transmission_gears(df):
    if df['Transmission'] == '7-speed':
        return 7
    elif df['Transmission'] == '6-speed':
        return 6
    elif df['Transmission'] == '5-speed':
        return 5
    elif df['Transmission'] == '4-speed':
        return 4
    elif df['Transmission'] == 'cvt':
        return "c"

df['Transmission'] = df.apply(shorten_transmission_gears, axis=1)

In [292]:
df["Engine_Type"] = df["Engine_Type"].str.rstrip()
df["Engine_Type"] = df["Engine_Type"].str.lstrip()
# Strip commas
df["Engine_Type"] = df["Engine_Type"].str.replace(",", "")
df["Engine_Type"] = df["Engine_Type"].str.replace("  ", " ")

In [293]:
import pandas as pd

# Set the max number of rows to None (no limit)
pd.set_option('display.max_rows', None)

print(df["Engine_Type"].value_counts())

Engine_Type
                                                                                                   119
kappa vtvt                                                                                          77
k10b                                                                                                60
i-vtec                                                                                              33
vvt                                                                                                 33
kappa                                                                                               32
f8d                                                                                                 25
k series                                                                                            24
fwd                                                                                                 23
mpi                                                          

In [245]:
pip install python-dotenv



You should consider upgrading via the '/Users/bfaris96/Desktop/turing-proj/cars_db/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [246]:
pip install sqlalchemy

You should consider upgrading via the '/Users/bfaris96/Desktop/turing-proj/cars_db/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [247]:
from sqlalchemy import create_engine
import psycopg2
import os
from dotenv import load_dotenv

load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

conn = psycopg2.connect(f"host=localhost dbname=car_db user={DB_USER} password={DB_PASSWORD}")

I have to just use strings for most of these fields right now because they are so drastically denormalized and inconsistent. I'm not going to put in a primary key yet, bc I want to delete duplicates first. 

In [248]:
try:
    cur = conn.cursor()
    cur.execute("""DROP TABLE IF EXISTS cars""")
    cur.execute("""CREATE TABLE cars(
        name VARCHAR(255),
        make VARCHAR(64),
        model VARCHAR(64),
        year DATE,
        color VARCHAR(64),
        body_style VARCHAR(64),
        mileage INT,
        num_owners INT,
        seating_capacity INT,
        fuel_type VARCHAR(8),
        fuel_capacity INT,
        engine_type VARCHAR(255),
        cc_displacement INT,
        transmission_gears VARCHAR(3),
        transmission_type VARCHAR(3),
        bhp FLOAT,
        torque FLOAT,
        fuel_economy FLOAT,
        emission_class INT,
        price INT)
    """)
except Exception as e:
    print("An error occurred:", e)
    conn.rollback()  # rollback transaction
else:
    conn.commit()  # commit transaction

cur.close()


Loading df into postgres

In [249]:
# rename dataframe columns to match with the table column names
df = df.rename(columns={
    'Car_Name': 'name',
    'Make': 'make',
    'Model': 'model',
    'Make_Year': 'year',
    'Color': 'color',
    'Body_Type': 'body_style',
    'Mileage_Run': 'mileage',
    'No_of_Owners': 'num_owners',
    'Seating_Capacity': 'seating_capacity',
    'Fuel_Type': 'fuel_type',
    'Fuel_Tank_Capacity(L)': 'fuel_capacity',
    'Engine_Type': 'engine_type',
    'CC_Displacement': 'cc_displacement',
    'Transmission': 'transmission_gears',
    'Transmission_Type': 'transmission_type',
    'Power(BHP)': 'bhp',
    'Torque(Nm)': 'torque',
    'Mileage(kmpl)': 'fuel_economy',
    'Emission': 'emission_class',
    'Price': 'price'
})

# create sqlalchemy engine
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost/car_db")

# Use 'replace' if you want to replace an existing table with the same name.
df.to_sql('cars', engine, if_exists='replace', index=False)


914

In [250]:
from sqlalchemy import create_engine, text
# Confirm that the data has been loaded successfully
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

('highline at (d)', 'volkswagen', 'ameo', 2017, 'silver', 'sedan', '44611', None, 5, None, 45, ' tdi', 1498, None, None, 109.0, 250.0, '21.66', None, '657000', '1.5', None)
('sx', 'hyundai', 'i20 active', 2016, 'red', 'crossover', '20305', None, 5, None, 45, ' kappa', 1197, None, None, 82.0, 115.0, '17.19', None, '682000', '1.2', None)
('vx', 'honda', 'wr-v', 2019, 'white', 'suv', '29540', None, 5, None, 40, 'i-vtec', 1199, None, None, 88.5, 110.0, '16.5', None, '793000', None, None)
('rxt amt', 'renault', 'kwid', 2017, 'bronze', 'hatchback', '35680', None, 5, None, 28, '', 999, None, None, 67.0, 91.0, '21.7', None, '414000', '1.0', None)
('asta', 'hyundai', 'grand i10', 2017, 'orange', 'hatchback', '25126', None, 5, None, 43, 'kappa vtvt', 1197, None, None, 81.86, 113.75, '18.9', None, '515000', '1.2', None)
('sportz', 'hyundai', 'elite i20', 2016, 'red', 'hatchback', '52261', None, 5, None, 45, 'kappa vtvt', 1197, None, None, 81.83, 114.7, '18.6', None, '604000', '1.2', None)
('v mt'

In [251]:
with engine.connect() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS pgcrypto"))
    conn.execute(text("ALTER TABLE cars ADD COLUMN id UUID PRIMARY KEY DEFAULT uuid_generate_v4()"))
    conn.commit()



In [252]:
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

('highline at (d)', 'volkswagen', 'ameo', 2017, 'silver', 'sedan', '44611', None, 5, None, 45, ' tdi', 1498, None, None, 109.0, 250.0, '21.66', None, '657000', '1.5', None, UUID('a4fa0409-3225-41ae-9459-229ce61f6e3f'))
('sx', 'hyundai', 'i20 active', 2016, 'red', 'crossover', '20305', None, 5, None, 45, ' kappa', 1197, None, None, 82.0, 115.0, '17.19', None, '682000', '1.2', None, UUID('61c0f1fa-b369-492a-b205-88abec0ce03e'))
('vx', 'honda', 'wr-v', 2019, 'white', 'suv', '29540', None, 5, None, 40, 'i-vtec', 1199, None, None, 88.5, 110.0, '16.5', None, '793000', None, None, UUID('3ce0d3fa-f078-4634-8b90-72552ef76c66'))
('rxt amt', 'renault', 'kwid', 2017, 'bronze', 'hatchback', '35680', None, 5, None, 28, '', 999, None, None, 67.0, 91.0, '21.7', None, '414000', '1.0', None, UUID('edcc71ad-1cea-4e45-84a0-5a54cfbdc63f'))
('asta', 'hyundai', 'grand i10', 2017, 'orange', 'hatchback', '25126', None, 5, None, 43, 'kappa vtvt', 1197, None, None, 81.86, 113.75, '18.9', None, '515000', '1.2', N

2023-7-11

We're pretty close on the DB just need to rename columns, validate data types, and put in ID columns

1. The db is in 1NF because each cell contains a single value

* ACTUALLY THIS IS NOT TRUE. YOU NEED TO SPLIT OUT LITERS