# I will ingest a CSV into postgresql and normalize it.
# Or will I normalize it partly in python and then ingest and do finishing touches?
First creating and initiating our venv:

In [170]:
# python3 -m venv venv


In [171]:
# . venv/bin/activate

Let's ingest to a df

In [172]:
# pip install pandas

In [173]:
import pandas as pd

df = pd.read_csv("/Users/bfaris96/Desktop/turing-proj/cars_db/FINAL_SPINNY_900.csv")

In [174]:
df.shape

(976, 20)

In [175]:
# df.head(15)

Checking for any nulls:

In [176]:
print(df[df.isnull().any(axis=1)])

Empty DataFrame
Columns: [Car_Name, Make, Model, Make_Year, Color, Body_Type, Mileage_Run, No_of_Owners, Seating_Capacity, Fuel_Type, Fuel_Tank_Capacity(L), Engine_Type, CC_Displacement, Transmission, Transmission_Type, Power(BHP), Torque(Nm), Mileage(kmpl), Emission, Price]
Index: []


Stripping extra whitespace:

In [177]:
df = df.applymap(lambda x: ' '.join(x.split()) if isinstance(x, str) else x)

Forcing lowercase:

In [178]:
df = df.applymap(lambda s: s.lower() if type(s) == str else s)



Eliminating duplicate entries:

In [179]:
df = df.drop_duplicates()

In [180]:
df.shape

(914, 20)

In [181]:
df["Engine_Type"].value_counts()

Engine_Type
kappa vtvt petrol engine                                                                                       77
petrol engine                                                                                                  41
k10b                                                                                                           38
1.2l vvt engine                                                                                                28
front wheel drive                                                                                              26
k series petrol engine                                                                                         24
fwd                                                                                                            23
1.2 litre petrol engine                                                                                        23
i-vtec petrol engine                                                        

Removing all non-numeric characters from mileage and price columns

In [182]:
df['Mileage_Run'] = df['Mileage_Run'].str.replace(r'\D', '', regex=True)
df['Price'] = df['Price'].str.replace(r'\D', '', regex=True)



Removing year range from name field:

In [183]:
import re

def remove_date_name(row):
    # Split the car name field into individual words/phrases
    name_list = list(row["Car_Name"].split())

    # Create regex to match [2000-2018] type strings in name
    year_pattern = re.compile(r"\[(\d{4})-(\d{4})\]")
    year_match = re.search(year_pattern, row["Car_Name"])

    # Remove from name list if match found
    if year_match is not None and year_match.group(0) in name_list:
        name_list.remove(year_match.group(0))

    return " ".join(name_list)
    
df["Car_Name"] = df.apply(remove_date_name, axis=1)




In [184]:
import re

def clean_engine(df):
    engine_string = df["Engine_Type"]
    # Remove the pattern of number followed by the word "speed"
    engine_string = re.sub(r'\b\d+\s*speed\b', '', engine_string)

    # Now we split the string into a list of words
    engine_list = engine_string.split()

    # Define the list of unwanted words
    unwanted_words = ["petrol", "(petrol)", "diesel", "(diesel)", "cng", "(cng)", "lpg", "electric", "petrol+cng", "petrol+electric", "engine", "automatic", "manual", "transmission"]

    # Create a new list with only the words that are not in unwanted_words
    engine_list = [word for word in engine_list if word not in unwanted_words]

    # Join the words back together into a string
    new_engine = " ".join(engine_list)
    return new_engine

df["Engine_Type"] = df.apply(clean_engine, axis=1)


Inserting engine_litres columns:

In [185]:
df['engine_litres'] = None

2023-7-12 we have to chang ethis to split liters out into the new column from name or engine type. also update db schema

In [186]:

def move_liters(row):

    # Create a regular expression to match engine sizes in litres (e.g., 2.5l)
    liter_pattern = re.compile(r"\b\d+\.\d+[lL]\b")
    
    # Create a regular expression to match numeric values (e.g., 2.5)
    no_l_liter_pattern = re.compile(r"\b\d+\.\d+\b")

    fields = [row["Car_Name"], row["Engine_Type"]]

    for pattern in [liter_pattern, no_l_liter_pattern]:
        for field in fields:
            match = re.search(pattern, field)
            if match is not None:
                if pattern == liter_pattern:
                    return match.group(0)[:-1]
                elif pattern == no_l_liter_pattern:
                    return match.group(0)
    return None

df['engine_litres'] = df.apply(move_liters, axis=1)



More patterns to be removed left over from litre information:

In [187]:
# Regular expression patterns to be removed
patterns = [
    r"\b\d+\.\d+\s[lL]\b", # Matches '1.2 l' or '2.2 l' with a space before the 'l'
    r"\b\d+\.\d+[lL]?\b", # Matches '1.6l' or '1.9l' with no space before the 'l' and 'l' is optional
    r"( litre)|(-litre)", # Matches ' litre' or '-litre'
    r"gasoline", # Matches 'gasoline'
]

# Apply each pattern to each column
for column in ["Car_Name", "Engine_Type"]:
    for pattern in patterns:
        df[column] = df[column].apply(lambda x: re.sub(pattern, "", x))

Remove redundant words from car name field:

In [189]:
def clean_name(row):

    # Initialize a list to store the cleaned name
    new_name_list = list()

    # Split the name, engine type, car make and model into individual words
    name_list = row["Car_Name"].split()
    make_words = row["Make"].split()
    model_words = row["Model"].split()
    engine_type_words = row["Engine_Type"].split()

    # For each word in the car name, if it is not in the engine type, make, model or other row values, add it to the new name
    for word in name_list:
        if word not in engine_type_words and word not in model_words and word not in make_words and word not in row["Fuel_Type"]:
            new_name_list.append(word)

    return " ".join(new_name_list)

df["Car_Name"] = df.apply(clean_name, axis=1)

Remove drive train info from engine type, put in new drive train column:

In [190]:
df["drive_train"] = None

In [191]:
def move_drive_train(row):
    # Create a regular expression to match drive train types
    drive_train_pattern = re.compile(r"\b\w+\s+wheel\s+drive\b")

    match = re.search(drive_train_pattern, row["Engine_Type"])
    if match is not None:
        return match.group(0)

df["drive_train"] = df.apply(move_drive_train, axis=1)



Remove drive train from engine type: 

In [192]:
drive_train_pattern = re.compile(r"\b\w+\s+wheel\s+drive\b")
df["Engine_Type"] = df["Engine_Type"].apply(lambda x: re.sub(drive_train_pattern, "", x))

To standardize and constrain columns, I'll get all the unique values for relevant columns:

In [193]:
columns = ['Car_Name', 'Make', 'Model', 'Color', 'Body_Type','No_of_Owners','Fuel_Type', 'Engine_Type','Transmission','Transmission_Type', 'Emission']
for column in columns:
    print(df[column].value_counts())

Car_Name
vxi                           74
sportz                        51
sx                            30
asta                          28
alpha                         27
vx                            25
titanium                      23
magna                         23
vxi amt                       19
v                             17
s                             16
asta (o)                      16
lxi                           13
sharp                         12
vx cvt                        12
zxi                           11
rxt opt                       11
zxi amt                       11
sx plus auto                   9
sx plus                        9
zxi plus                       9
vdi                            9
g                              8
sharp dct at                   8
rxz                            8
vxi+                           8
zeta                           8
highline1.2l (p)               8
sx (o)                         7
vxi (o)                        7
a

Now I'll convert owner ordinals to an int:

In [194]:
def owner_to_int(df):
    if df['No_of_Owners'] == '1st':
        return 1
    elif df['No_of_Owners'] == '2nd':
        return 2
    elif df['No_of_Owners'] == '3rd':
        return 3
    elif df['No_of_Owners'] == '4th':
        return 4
    elif df['No_of_Owners'] == '5th':
        return 5


df['No_of_Owners'] = df.apply(owner_to_int, axis=1)



Now I'll convert emission type to 1 character:

In [195]:
def emission_to_int(df):
    if df['Emission'] == 'bs iv':
        return 4
    elif df['Emission'] == 'bs iii':
        return 3
    elif df['Emission'] == 'bs vi':
        return 6
    elif df['Emission'] == 'bs v':
        return 5
    elif df['Emission'] == 'bs ii':
        return 2

df['Emission'] = df.apply(emission_to_int, axis=1)

Now I will shorten fuel type to 1-2 characters 

In [196]:
def shorten_fuel_type(df):
    if df['Fuel_Type'] == 'diesel':
        return 'd'
    elif df['Fuel_Type'] == 'petrol':
        return 'p'
    elif df['Fuel_Type'] == 'petrol+cng':
        return 'pc'
    elif df['Fuel_Type'] == 'cng':
        return 'c'
    elif df['Fuel_Type'] == 'lpg':
        return 'l'
    elif df['Fuel_Type'] == 'electric':
        return 'e'
    elif df['Fuel_Type'] == 'petrol+electric':
        return 'pe'

df['Fuel_Type'] = df.apply(shorten_fuel_type, axis=1)

Now I will shorten the transmission type to 1 character:

In [197]:
def shorten_transmission_type(df):
    if df['Transmission_Type'] == 'manual':
        return 'm'
    elif df['Transmission_Type'] == 'automatic':
        return 'a'

df['Transmission_Type'] = df.apply(shorten_transmission_type, axis=1)

Now I will shorten the transmission gears field to 1 character:

In [198]:
def shorten_transmission_gears(df):
    if df['Transmission'] == '7-speed':
        return 7
    elif df['Transmission'] == '6-speed':
        return 6
    elif df['Transmission'] == '5-speed':
        return 5
    elif df['Transmission'] == '4-speed':
        return 4
    elif df['Transmission'] == 'cvt':
        return "c"

df['Transmission'] = df.apply(shorten_transmission_gears, axis=1)

Shorten the drive train field:

In [199]:
def shorten_drive_train(df):
    if df['drive_train'] == 'front wheel drive':
        return 'fwd'
    elif df['drive_train'] == 'rear wheel drive':
        return 'rwd'
    elif df['drive_train'] == 'all wheel drive':
        return 'awd'
    elif df['drive_train'] == 'four wheel drive':
        return '4wd'
    
df['drive_train'] = df.apply(shorten_drive_train, axis=1)

Strip whitespace and commas:

In [200]:
df["Engine_Type"] = df["Engine_Type"].str.rstrip()
df["Engine_Type"] = df["Engine_Type"].str.lstrip()
# Strip commas
df["Engine_Type"] = df["Engine_Type"].str.replace(",", "")
df["Engine_Type"] = df["Engine_Type"].str.replace("  ", " ")

In [201]:
import pandas as pd

# Set the max number of rows to None (no limit)
pd.set_option('display.max_rows', None)

for column in df.columns:
    print(f"\nValue counts for {column}:")
    print(df[column].value_counts())


Value counts for Car_Name:
Car_Name
vxi                           74
sportz                        51
sx                            30
asta                          28
alpha                         27
vx                            25
titanium                      23
magna                         23
vxi amt                       19
v                             17
s                             16
asta (o)                      16
lxi                           13
sharp                         12
vx cvt                        12
zxi                           11
rxt opt                       11
zxi amt                       11
sx plus auto                   9
sx plus                        9
zxi plus                       9
vdi                            9
g                              8
sharp dct at                   8
rxz                            8
vxi+                           8
zeta                           8
highline1.2l (p)               8
sx (o)                         7
vxi (o

In [202]:
df[df["Mileage(kmpl)"] == "bs iv"]

Unnamed: 0,Car_Name,Make,Model,Make_Year,Color,Body_Type,Mileage_Run,No_of_Owners,Seating_Capacity,Fuel_Type,...,CC_Displacement,Transmission,Transmission_Type,Power(BHP),Torque(Nm),Mileage(kmpl),Emission,Price,engine_litres,drive_train
322,alpha,maruti suzuki,baleno,2019,blue,hatchback,30420,1,5,d,...,1248,5,a,74.0,190.0,bs iv,5,883000,,
446,alpha,maruti suzuki,baleno,2019,blue,hatchback,37942,1,5,d,...,1248,5,a,74.0,190.0,bs iv,5,847000,,


In [203]:
# pip install python-dotenv

In [204]:
# pip install sqlalchemy

In [205]:
from sqlalchemy import create_engine
import psycopg2
import os
from dotenv import load_dotenv

load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

conn = psycopg2.connect(f"host=localhost dbname=car_db user={DB_USER} password={DB_PASSWORD}")

Changing df col names to be more apporpriate for db:

In [206]:
# rename dataframe columns to match with the table column names
df = df.rename(columns={
    'Car_Name': 'name',
    'Make': 'make',
    'Model': 'model',
    'Make_Year': 'year',
    'Color': 'color',
    'Body_Type': 'body_style',
    'Mileage_Run': 'mileage',
    'No_of_Owners': 'num_owners',
    'Seating_Capacity': 'seating_capacity',
    'Fuel_Type': 'fuel_type',
    'Fuel_Tank_Capacity(L)': 'fuel_capacity',
    'Engine_Type': 'engine_type',
    'CC_Displacement': 'cc_displacement',
    'Transmission': 'transmission_gears',
    'Transmission_Type': 'transmission_type',
    'Power(BHP)': 'bhp',
    'Torque(Nm)': 'torque',
    'Mileage(kmpl)': 'fuel_economy',
    'Emission': 'emission_class',
    'Price': 'price',
    'engine_litres': 'engine_litres',
    'drive_train': 'drive_train'
})

changing data types in the df: 

In [207]:
# These are string columns
df['name'] = df['name'].astype('string')
df['make'] = df['make'].astype('string')
df['model'] = df['model'].astype('string')
df['color'] = df['color'].astype('string')
df['body_style'] = df['body_style'].astype('string')
df['fuel_type'] = df['fuel_type'].astype('string')
df['engine_type'] = df['engine_type'].astype('string')
df['transmission_gears'] = df['transmission_gears'].astype('string')
df['transmission_type'] = df['transmission_type'].astype('string')
df['drive_train'] = df['drive_train'].astype('string')

# This is a date column
df['year'] = pd.to_datetime(df['year'])

# These are integer columns
df['mileage'] = df['mileage'].astype('Int64')  # Use 'Int64' (not 'int64') to support missing values (NaN)
df['num_owners'] = df['num_owners'].astype('Int64')
df['seating_capacity'] = df['seating_capacity'].astype('Int64')
df['fuel_capacity'] = df['fuel_capacity'].astype('Int64')
df['cc_displacement'] = df['cc_displacement'].astype('Int64')
df['emission_class'] = df['emission_class'].astype('Int64')
df['price'] = df['price'].astype('Int64')

# These are float columns
df['bhp'] = df['bhp'].astype('float64')
df['torque'] = df['torque'].astype('float64')
df['fuel_economy'] = df['fuel_economy'].astype('string')  # this one should be a float, but there are two enries that contain strings and have yet to be fixed
df['engine_litres'] = df['engine_litres'].astype('float64')


I have to just use strings for most of these fields right now because they are so drastically denormalized and inconsistent. I'm not going to put in a primary key yet, bc I want to delete duplicates first. 

In [208]:

# create sqlalchemy engine
engine = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost/car_db")

# Use 'replace' if you want to replace an existing table with the same name.
df.to_sql('cars', engine, if_exists='replace', index=False)


914

In [209]:
from sqlalchemy import create_engine, text
# Confirm that the data has been loaded successfully
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

('highline at (d)', 'volkswagen', 'ameo', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'silver', 'sedan', 44611, 1, 5, 'd', 45, 'tdi', 1498, '7', 'a', 109.0, 250.0, '21.66', 4, 657000, 1.5, None)
('sx', 'hyundai', 'i20 active', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'red', 'crossover', 20305, 1, 5, 'p', 45, 'kappa', 1197, '5', 'm', 82.0, 115.0, '17.19', 5, 682000, 1.2, None)
('vx', 'honda', 'wr-v', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'white', 'suv', 29540, 2, 5, 'p', 40, 'i-vtec', 1199, '5', 'm', 88.5, 110.0, '16.5', 4, 793000, None, None)
('rxt amt', 'renault', 'kwid', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'bronze', 'hatchback', 35680, 1, 5, 'p', 28, '', 999, '5', 'm', 67.0, 91.0, '21.7', 4, 414000, 1.0, None)
('asta', 'hyundai', 'grand i10', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'orange', 'hatchback', 25126, 1, 5, 'p', 43, 'kappa vtvt', 1197, '5', 'm', 81.86, 113.75, '18.9', 5, 515000, 1.2, None)
('sportz', 'hyundai', 'elite i20', datetime.datetime(1970, 1, 1, 0, 0

Insert UUID primary key:

In [210]:
with engine.connect() as conn:
    conn.execute(text("CREATE EXTENSION IF NOT EXISTS pgcrypto"))
    conn.execute(text("ALTER TABLE cars ADD COLUMN id UUID PRIMARY KEY DEFAULT uuid_generate_v4()"))
    conn.commit()



In [211]:
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

('highline at (d)', 'volkswagen', 'ameo', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'silver', 'sedan', 44611, 1, 5, 'd', 45, 'tdi', 1498, '7', 'a', 109.0, 250.0, '21.66', 4, 657000, 1.5, None, UUID('c4bc8c17-8f8c-45c6-ae0c-4f9ce054e9d1'))
('sx', 'hyundai', 'i20 active', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'red', 'crossover', 20305, 1, 5, 'p', 45, 'kappa', 1197, '5', 'm', 82.0, 115.0, '17.19', 5, 682000, 1.2, None, UUID('524d8f21-ec39-4e09-addc-0dfcee82ab8a'))
('vx', 'honda', 'wr-v', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'white', 'suv', 29540, 2, 5, 'p', 40, 'i-vtec', 1199, '5', 'm', 88.5, 110.0, '16.5', 4, 793000, None, None, UUID('7191ec04-b7c2-4cb1-94cd-e921cc665fa6'))
('rxt amt', 'renault', 'kwid', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'bronze', 'hatchback', 35680, 1, 5, 'p', 28, '', 999, '5', 'm', 67.0, 91.0, '21.7', 4, 414000, 1.0, None, UUID('69fc32cd-c45e-46da-979b-678a591fdfea'))
('asta', 'hyundai', 'grand i10', datetime.datetime(1970, 1, 1, 0, 0, 0, 2), 'oran