# I will ingest a CSV into postgresql and normalize it.
# Or will I normalize it partly in python and then ingest and do finishing touches?
First creating and initiating our venv:

In [None]:
python3 -m venv venv


In [None]:
. venv/bin/activate

Let's ingest to a df

In [None]:
pip install pandas

In [171]:
import pandas as pd

df = pd.read_csv("/Users/bfaris96/Desktop/turing-proj/cars_db/FINAL_SPINNY_900.csv")

In [172]:
df.shape

(976, 20)

Checking for any nulls:

In [173]:
print(df[df.isnull().any(axis=1)])

Empty DataFrame
Columns: [Car_Name, Make, Model, Make_Year, Color, Body_Type, Mileage_Run, No_of_Owners, Seating_Capacity, Fuel_Type, Fuel_Tank_Capacity(L), Engine_Type, CC_Displacement, Transmission, Transmission_Type, Power(BHP), Torque(Nm), Mileage(kmpl), Emission, Price]
Index: []


Stripping extra whitespace:

In [174]:
df = df.applymap(lambda x: ' '.join(x.split()) if isinstance(x, str) else x)

In [175]:
df = df.applymap(lambda s: s.lower() if type(s) == str else s)



Eliminating duplicate entries:

In [176]:
df = df.drop_duplicates()

In [177]:
df.shape

(914, 20)

Removing all non-numeric characters from mileage and price columns

In [178]:
df['Mileage_Run'] = df['Mileage_Run'].str.replace(r'\D', '', regex=True)
df['Price'] = df['Price'].str.replace(r'\D', '', regex=True)



In [179]:
import re

def clean_name(row):
    # Split the car name field into individual words/phrases
    name_list = list(row["Car_Name"].split())

    # Create regex to match [2000-2018] type strings in name
    year_pattern = re.compile(r"\[(\d{4})-(\d{4})\]")
    year_match = re.search(year_pattern, row["Car_Name"])

    # Remove from name list if match found
    if year_match is not None and year_match.group(0) in name_list:
        name_list.remove(year_match.group(0))
    
    # Create a regular expression to match engine sizes in litres (e.g., 2.5l)
    liter_pattern = re.compile(r"\b\d+\.\d+[lL]\b")
    
    # Create a regular expression to match numeric values (e.g., 2.5)
    no_l_liter_pattern = re.compile(r"\b\d+\.\d+\b")
    
    # Search the car name for matches to the numeric regular expression
    no_l_liter_match = re.search(no_l_liter_pattern, row["Car_Name"])
    
    # Initialize a list to store the cleaned name
    new_name_list = list()

    # Split the engine type into individual words
    engine_type_words = row["Engine_Type"].split()
    
    # Search the engine type for matches to the engine size regular expression
    liter_match = re.search(liter_pattern, row["Engine_Type"])
    
    # If a match was found, add it to the engine type words (minus the 'l' character)
    if liter_match is not None:
        engine_type_words.append(liter_match.group(0)[:-1])

    # Split the car make and model into individual words
    make_words = row["Make"].split()
    model_words = row["Model"].split()
    
    # For each word in the car name, if it is not in the engine type, make, model or other row values, add it to the new name
    for word in name_list:
        if word not in row.values[1:] and word not in engine_type_words and word not in model_words and word not in make_words:
            new_name_list.append(word)

    # Search the cleaned car name for numeric matches
    no_l_liter_match = re.search(no_l_liter_pattern, row["Car_Name"])

    # Remove the numeric match from the cleaned name
    if no_l_liter_match is not None and no_l_liter_match.group(0) in new_name_list:
        new_name_list.remove(no_l_liter_match.group(0))
    
    # Join the cleaned name words into a single string and assign it back to the row's car name
    new_name = " ".join(new_name_list)

    # Create a regular expression to match long litre labels (e.g., 2.5-litre)
    long_liter_pattern = re.compile(r"\b\d+\.\d+\b-litre")
    
    # Search the engine type for long litre label matches
    long_liter_match = re.search(long_liter_pattern, row["Engine_Type"])

    # If a numeric match was found in the car name
    if no_l_liter_match is not None and no_l_liter_match.group(0) not in engine_type_words and f"{no_l_liter_match.group(0)}l" not in engine_type_words:
        # Add the match (with an 'l' appended) to the engine type words
        engine_type_words.append(f"{no_l_liter_match.group(0)}l")
        
        # If a long litre label match was found in the engine type, remove it
        if long_liter_match is not None:
            engine_type_words.remove(long_liter_match.group(0))
        
        # Join the engine type words into a single string and assign it back to the row's engine type
        new_engine_type = " ".join(engine_type_words)
    else:
        new_engine_type = row["Engine_Type"]
    return new_name, new_engine_type

# Apply the clean_name function to each row of the DataFrame
df["Car_Name"], df["Engine_Type"] = zip(*df.apply(clean_name, axis=1))


In [180]:
import re

def clean_engine(df):
    engine_string = df["Engine_Type"]
    # Remove the pattern of number followed by the word "speed"
    engine_string = re.sub(r'\b\d+\s*speed\b', '', engine_string)

    # Now we split the string into a list of words
    engine_list = engine_string.split()

    # Define the list of unwanted words
    unwanted_words = ["petrol", "diesel", "cng", "lpg", "electric", "petrol+cng", "petrol+electric", "engine", "automatic", "manual", "transmission"]

    # Create a new list with only the words that are not in unwanted_words
    engine_list = [word for word in engine_list if word not in unwanted_words]

    # Join the words back together into a string
    new_engine = " ".join(engine_list)
    return new_engine

df["Engine_Type"] = df.apply(clean_engine, axis=1)


To standardize and constrain columns, I'll get all the unique values for relevant columns:

In [181]:
columns = ['Car_Name', 'Make', 'Model', 'Color', 'Body_Type','No_of_Owners','Fuel_Type', 'Engine_Type','Transmission','Transmission_Type', 'Emission']
for column in columns:
    print(df[column].value_counts())

Car_Name
vxi                      73
sportz                   51
sx                       30
asta                     28
alpha                    27
                         ..
rxz turbo cvt             1
limited                   1
rxt mt                    1
sharp turbo dct 6-str     1
flair edition             1
Name: count, Length: 218, dtype: int64
Make
hyundai          293
maruti suzuki    285
honda             73
renault           66
ford              46
toyota            30
volkswagen        27
tata              23
mg motors         23
mahindra          18
kia                8
chevrolet          6
skoda              5
nissan             4
jeep               4
datsun             3
Name: count, dtype: int64
Model
elite i20        64
i10              36
grand i10        35
baleno           32
verna            32
                 ..
a-star            1
zest              1
corolla altis     1
safari            1
hector plus       1
Name: count, Length: 87, dtype: int64
Color
white  

Now I'll convert owner ordinals to an int:

In [182]:
def owner_to_int(df):
    if df['No_of_Owners'] == '1st':
        return 1
    elif df['No_of_Owners'] == '2nd':
        return 2
    elif df['No_of_Owners'] == '3rd':
        return 3
    elif df['No_of_Owners'] == '4th':
        return 4
    elif df['No_of_Owners'] == '5th':
        return 5


df['No_of_Owners'] = df.apply(owner_to_int, axis=1)



Now I'll convert emission type to 1 character:

In [183]:
def emission_to_int(df):
    if df['Emission'] == 'bs iv':
        return 4
    elif df['Emission'] == 'bs iii':
        return 3
    elif df['Emission'] == 'bs vi':
        return 6
    elif df['Emission'] == 'bs v':
        return 5
    elif df['Emission'] == 'bs ii':
        return 2

df['Emission'] = df.apply(emission_to_int, axis=1)

Now I will shorten fuel type to 1-2 characters 

In [184]:
def shorten_fuel_type(df):
    if df['Fuel_Type'] == 'diesel':
        return 'd'
    elif df['Fuel_Type'] == 'petrol':
        return 'p'
    elif df['Fuel_Type'] == 'petrol+cng':
        return 'pc'
    elif df['Fuel_Type'] == 'cng':
        return 'c'
    elif df['Fuel_Type'] == 'lpg':
        return 'l'
    elif df['Fuel_Type'] == 'electric':
        return 'e'
    elif df['Fuel_Type'] == 'petrol+electric':
        return 'pe'

df['Fuel_Type'] = df.apply(shorten_fuel_type, axis=1)

Now I will shorten the transmission type to 1 character:

In [185]:
def shorten_transmission_type(df):
    if df['Transmission_Type'] == 'manual':
        return 'm'
    elif df['Transmission_Type'] == 'automatic':
        return 'a'

df['Transmission_Type'] = df.apply(shorten_transmission_type, axis=1)

Now I will shorten the transmission gears field to 1 character:

In [186]:
def shorten_transmission_gears(df):
    if df['Transmission'] == '7-speed':
        return 7
    elif df['Transmission'] == '6-speed':
        return 6
    elif df['Transmission'] == '5-speed':
        return 5
    elif df['Transmission'] == '4-speed':
        return 4
    elif df['Transmission'] == 'cvt':
        return "c"

df['Transmission'] = df.apply(shorten_transmission_gears, axis=1)

In [187]:
df.head(40)

Unnamed: 0,Car_Name,Make,Model,Make_Year,Color,Body_Type,Mileage_Run,No_of_Owners,Seating_Capacity,Fuel_Type,Fuel_Tank_Capacity(L),Engine_Type,CC_Displacement,Transmission,Transmission_Type,Power(BHP),Torque(Nm),Mileage(kmpl),Emission,Price
0,highline at (d),volkswagen,ameo,2017,silver,sedan,44611,1,5,d,45,1.5l tdi,1498,7,a,109.0,250.0,21.66,4,657000
1,sx,hyundai,i20 active,2016,red,crossover,20305,1,5,p,45,1.2l kappa,1197,5,m,82.0,115.0,17.19,5,682000
2,vx,honda,wr-v,2019,white,suv,29540,2,5,p,40,i-vtec,1199,5,m,88.5,110.0,16.5,4,793000
3,rxt amt,renault,kwid,2017,bronze,hatchback,35680,1,5,p,28,1.0l,999,5,m,67.0,91.0,21.7,4,414000
4,asta,hyundai,grand i10,2017,orange,hatchback,25126,1,5,p,43,kappa vtvt 1.2l,1197,5,m,81.86,113.75,18.9,5,515000
5,sportz,hyundai,elite i20,2016,red,hatchback,52261,1,5,p,45,kappa vtvt 1.2l,1197,5,m,81.83,114.7,18.6,4,604000
6,v mt,honda,brio,2012,grey,hatchback,28108,2,5,p,35,4 cylinder inline,1198,5,m,86.8,109.0,19.4,3,316000
7,xz,tata,harrier,2019,grey,suv,92603,1,5,d,50,kryotec 2.0 l turbocharge,1956,6,a,138.0,350.0,17.0,4,1419000
8,sportz amt vtvt,hyundai,grand i10 nios,2021,blue,hatchback,16304,1,5,p,37,1.2 l kappa,1197,5,m,81.86,113.75,20.07,4,710000
9,rxt opt,renault,kwid,2019,bronze,hatchback,26350,2,5,p,28,1.0l,999,5,m,67.0,91.0,22.0,4,392000


In [188]:
pip install python-dotenv

You should consider upgrading via the '/Users/bfaris96/Desktop/turing-proj/cars_db/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [189]:
pip install sqlalchemy

You should consider upgrading via the '/Users/bfaris96/Desktop/turing-proj/cars_db/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [190]:
from sqlalchemy import create_engine
import psycopg2
import os
from dotenv import load_dotenv

load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

conn = psycopg2.connect(f"host=localhost dbname=car_db user={DB_USER} password={DB_PASSWORD}")

I have to just use strings for most of these fields right now because they are so drastically denormalized and inconsistent. I'm not going to put in a primary key yet, bc I want to delete duplicates first. 

In [191]:
try:
    cur = conn.cursor()
    cur.execute("""DROP TABLE IF EXISTS cars""")
    cur.execute("""CREATE TABLE cars(
        name VARCHAR(255),
        make VARCHAR(64),
        model VARCHAR(64),
        year DATE,
        color VARCHAR(64),
        body_style VARCHAR(64),
        mileage INT,
        num_owners INT,
        seating_capacity INT,
        fuel_type VARCHAR(8),
        fuel_capacity INT,
        engine_type VARCHAR(255),
        cc_displacement INT,
        transmission_gears VARCHAR(3),
        transmission_type VARCHAR(3),
        bhp FLOAT,
        torque FLOAT,
        fuel_economy FLOAT,
        emission_class INT,
        price INT)
    """)
except Exception as e:
    print("An error occurred:", e)
    conn.rollback()  # rollback transaction
else:
    conn.commit()  # commit transaction

cur.close()


Loading df into postgres

In [192]:
# rename dataframe columns to match with the table column names
df = df.rename(columns={
    'Car_Name': 'name',
    'Make': 'make',
    'Model': 'model',
    'Make_Year': 'year',
    'Color': 'color',
    'Body_Type': 'body_style',
    'Mileage_Run': 'mileage',
    'No_of_Owners': 'num_owners',
    'Seating_Capacity': 'seating_capacity',
    'Fuel_Type': 'fuel_type',
    'Fuel_Tank_Capacity(L)': 'fuel_capacity',
    'Engine_Type': 'engine_type',
    'CC_Displacement': 'cc_displacement',
    'Transmission': 'transmission_gears',
    'Transmission_Type': 'transmission_type',
    'Power(BHP)': 'bhp',
    'Torque(Nm)': 'torque',
    'Mileage(kmpl)': 'fuel_economy',
    'Emission': 'emission_class',
    'Price': 'price'
})

# create sqlalchemy engine
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost/car_db")

# Use 'replace' if you want to replace an existing table with the same name.
df.to_sql('cars', engine, if_exists='replace', index=False)


914

In [193]:
from sqlalchemy import create_engine, text
# Confirm that the data has been loaded successfully
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

('highline at (d)', 'volkswagen', 'ameo', 2017, 'silver', 'sedan', '44611', 1, 5, 'd', 45, '1.5l tdi', 1498, '7', 'a', 109.0, 250.0, '21.66', 4, '657000')
('sx', 'hyundai', 'i20 active', 2016, 'red', 'crossover', '20305', 1, 5, 'p', 45, '1.2l kappa', 1197, '5', 'm', 82.0, 115.0, '17.19', 5, '682000')
('vx', 'honda', 'wr-v', 2019, 'white', 'suv', '29540', 2, 5, 'p', 40, 'i-vtec', 1199, '5', 'm', 88.5, 110.0, '16.5', 4, '793000')
('rxt amt', 'renault', 'kwid', 2017, 'bronze', 'hatchback', '35680', 1, 5, 'p', 28, '1.0l', 999, '5', 'm', 67.0, 91.0, '21.7', 4, '414000')
('asta', 'hyundai', 'grand i10', 2017, 'orange', 'hatchback', '25126', 1, 5, 'p', 43, 'kappa vtvt 1.2l', 1197, '5', 'm', 81.86, 113.75, '18.9', 5, '515000')
('sportz', 'hyundai', 'elite i20', 2016, 'red', 'hatchback', '52261', 1, 5, 'p', 45, 'kappa vtvt 1.2l', 1197, '5', 'm', 81.83, 114.7, '18.6', 4, '604000')
('v mt', 'honda', 'brio', 2012, 'grey', 'hatchback', '28108', 2, 5, 'p', 35, '4 cylinder inline', 1198, '5', 'm', 86

In [200]:
with engine.connect() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS pgcrypto"))
    conn.execute(text("ALTER TABLE cars ADD COLUMN id UUID PRIMARY KEY DEFAULT uuid_generate_v4()"))
    conn.commit()



In [201]:
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

('highline at (d)', 'volkswagen', 'ameo', 2017, 'silver', 'sedan', '44611', 1, 5, 'd', 45, '1.5l tdi', 1498, '7', 'a', 109.0, 250.0, '21.66', 4, '657000', UUID('1018ce76-de85-442c-b1d7-9d5955eb1a27'))
('sx', 'hyundai', 'i20 active', 2016, 'red', 'crossover', '20305', 1, 5, 'p', 45, '1.2l kappa', 1197, '5', 'm', 82.0, 115.0, '17.19', 5, '682000', UUID('82fa9810-f5eb-469d-a391-50590e65e7a7'))
('vx', 'honda', 'wr-v', 2019, 'white', 'suv', '29540', 2, 5, 'p', 40, 'i-vtec', 1199, '5', 'm', 88.5, 110.0, '16.5', 4, '793000', UUID('cec4c15c-02e1-4ae7-bd71-b9b52ac01407'))
('rxt amt', 'renault', 'kwid', 2017, 'bronze', 'hatchback', '35680', 1, 5, 'p', 28, '1.0l', 999, '5', 'm', 67.0, 91.0, '21.7', 4, '414000', UUID('901019da-8b6b-4dca-888e-b47f13be6680'))
('asta', 'hyundai', 'grand i10', 2017, 'orange', 'hatchback', '25126', 1, 5, 'p', 43, 'kappa vtvt 1.2l', 1197, '5', 'm', 81.86, 113.75, '18.9', 5, '515000', UUID('06f01826-0c51-4fcb-af39-c6c08d30f072'))
('sportz', 'hyundai', 'elite i20', 2016, 

2023-7-11

We're pretty close on the DB just need to rename columns, validate data types, and put in ID columns

1. The db is in 1NF because each cell contains a single value

* ACTUALLY THIS IS NOT TRUE. YOU NEED TO SPLIT OUT LITERS