## Ingesting, cleaning, standardizing, normalizing, and loading:
First, create and activate a virtual environment (venv):

In [None]:
# python3 -m venv venv


In [None]:
# . venv/bin/activate

In [None]:
# !pip install pandas
# !pip install sqlalchemy
# !pip install python-dotenv

Let's ingest to a df

In [None]:
import pandas as pd

import os

# Let the CSV location be overridden through the CARS_CSV_PATH environment
# variable so the notebook is not tied to one machine; the original absolute
# path stays as the fallback, so existing runs behave exactly as before.
CARS_CSV_PATH = os.getenv(
    "CARS_CSV_PATH",
    "/Users/bfaris96/Desktop/turing-proj/cars_db/data/FINAL_SPINNY_900.csv",
)

df = pd.read_csv(CARS_CSV_PATH)

In [None]:
df.shape

In [None]:
df.head(15)

Checking for any nulls:

In [None]:
print(df[df.isnull().any(axis=1)])

Stripping extra whitespace:

In [None]:
def _collapse_whitespace(cell):
    """Trim a string cell and squeeze inner whitespace runs to one space; non-strings pass through."""
    if not isinstance(cell, str):
        return cell
    return ' '.join(cell.split())


df = df.applymap(_collapse_whitespace)

Forcing lowercase:

In [None]:
# Lower-case every string cell and leave other values untouched. isinstance is
# used (instead of an exact type comparison) to match the check in the
# whitespace-stripping cell and to cover str subclasses as well.
df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)

Eliminating duplicate entries:

In [None]:
df = df.drop_duplicates()

In [None]:
df.shape

In [None]:
df["Engine_Type"].value_counts()

Removing all non-numeric characters from mileage and price columns. 

- Note about my process (which I am willing to change): I have used regex when I need to parse or alter part of a string within a cell. If I can just use the whole cell contents, I use if/elif statements. I do this because I want to avoid regex, because I find it annoying. Regex is also slow, but that doesn't really matter at this scale (small data).

In [None]:
# Keep only the digits in the two numeric-looking text columns.
for numeric_col in ('Mileage_Run', 'Price'):
    df[numeric_col] = df[numeric_col].str.replace(r'\D', '', regex=True)

Removing year range from name field:

In [None]:
import re

def remove_date_name(row):
    """
    Removes every date range pattern [YYYY-YYYY] from "Car_Name" in a DataFrame row.

    The range is removed wherever it occurs, including when it is attached to a
    neighbouring word (e.g. "swift[2014-2017]") and when it occurs more than once.
    Whitespace in the result is collapsed to single spaces and trimmed.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row with a string "Car_Name" field.

    Returns
    -------
    str
        "Car_Name" field without any date range pattern.

    Example
    --------
    >>> remove_date_name(pd.Series({"Car_Name": "Ford Mustang [2000-2018]"}))
    'Ford Mustang'
    >>> remove_date_name(pd.Series({"Car_Name": "Maruti Swift[2014-2017] VXI"}))
    'Maruti Swift VXI'
    """
    # Substitute a space (not an empty string) so words on either side of an
    # attached range are not glued together.
    without_years = re.sub(r"\[\d{4}-\d{4}\]", " ", row["Car_Name"])

    # split/join normalises the whitespace, as the original implementation did.
    return " ".join(without_years.split())
    
# Usage
df["Car_Name"] = df.apply(remove_date_name, axis=1)


Removing redundant words from engine_type field:

In [None]:
import re

def clean_engine(df):
    """
    Strips redundant words from the "Engine_Type" value of a single row.

    A "<number> speed" phrase is deleted first; the remaining text is split on
    whitespace and every token that exactly equals one of the redundant
    fuel/transmission words is dropped.

    Parameters
    ----------
    df : pandas.Series
        Despite the name, this is one DataFrame row (the function is used with
        ``DataFrame.apply(..., axis=1)``); it must contain an "Engine_Type" string.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).

    Example
    --------
    >>> clean_engine(pd.Series({"Engine_Type": "2 speed petrol engine"}))
    ''
    """
    # Tokens dropped from the engine description when they appear as whole words.
    redundant_tokens = {
        "petrol", "(petrol)", "diesel", "(diesel)", "cng", "(cng)", "lpg", "electric",
        "petrol+cng", "petrol+electric", "engine", "automatic", "manual", "transmission",
    }

    # Drop gear-count phrases such as "5 speed" before tokenising.
    without_gears = re.sub(r'\b\d+\s*speed\b', '', df["Engine_Type"])

    return " ".join(
        token for token in without_gears.split() if token not in redundant_tokens
    )

# Usage
df["Engine_Type"] = df.apply(clean_engine, axis=1)


Inserting the engine_litres column:

In [None]:
df['engine_litres'] = None

Moving engine litres data from either name or engine_type fields into new engine_litres field

In [None]:
import re

def move_liters(row):
    """
    Finds an engine size (in litres) in the "Car_Name" or "Engine_Type" of a row.

    Two passes are made over the fields, in the order "Car_Name" then
    "Engine_Type": the first pass looks for a decimal number immediately followed
    by 'l'/'L' (e.g. '2.5L'), the second for a bare decimal number (e.g. '2.5').
    The first hit wins; a trailing 'l'/'L' is not included in the result.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row with string "Car_Name" and "Engine_Type" fields.

    Returns
    -------
    str or None
        The matched number as a string, or None when neither field contains one.

    Example
    --------
    >>> move_liters(pd.Series({"Car_Name": "Ford Mustang 2.5L", "Engine_Type": "Petrol"}))
    '2.5'
    """
    candidates = (row["Car_Name"], row["Engine_Type"])

    # Pass 1: number with an attached litre suffix; strip the suffix character.
    for text_value in candidates:
        suffixed = re.search(r"\b\d+\.\d+[lL]\b", text_value)
        if suffixed:
            return suffixed.group(0)[:-1]

    # Pass 2: plain decimal number without a suffix.
    for text_value in candidates:
        bare = re.search(r"\b\d+\.\d+\b", text_value)
        if bare:
            return bare.group(0)

    return None

# Usage
df['engine_litres'] = df.apply(move_liters, axis=1)


More patterns to be removed left over from litre information:

In [None]:
import re

def clean_column_data(df, columns):
    """
    Deletes leftover litre/fuel fragments from the given string columns.

    Each value of each listed column is passed through a fixed sequence of
    regular-expression removals. The columns are overwritten on the DataFrame
    that was passed in, and that same DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame containing the specified columns (string values).
    columns : list
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with the listed columns cleaned.

    Example
    --------
    >>> df = pd.DataFrame({"Car_Name": ["Honda 1.6l", "Toyota 2.0 litre"],
                           "Engine_Type": ["gasoline", "2.2 l"]})
    >>> clean_column_data(df, ["Car_Name", "Engine_Type"])
    """
    # Removal patterns, applied in this order to every value.
    leftovers = (
        r"\b\d+\.\d+\s[lL]\b",   # decimal number, one whitespace char, then 'l' (e.g. '1.2 l')
        r"\b\d+\.\d+[lL]?\b",    # decimal number with an optional attached 'l' (e.g. '1.6l', '1.9')
        r"( litre)|(-litre)",    # ' litre' or '-litre'
        r"gasoline",             # the word 'gasoline'
    )

    def _scrub(value):
        for leftover in leftovers:
            value = re.sub(leftover, "", value)
        return value

    for column in columns:
        df[column] = df[column].apply(_scrub)

    return df

# Usage
df = clean_column_data(df, ["Car_Name", "Engine_Type"])


Remove redundant words (that appear in other fields) from car name field:

In [None]:
def clean_name(row):
    """
    Cleans "Car_Name" field in a DataFrame row.

    This function removes words from "Car_Name" that also appear as words in the
    "Make", "Model", "Engine_Type", or "Fuel_Type" fields. For "Fuel_Type" the
    whole value and each of its '+'-separated parts count as words
    (e.g. "petrol+cng" contributes "petrol+cng", "petrol" and "cng").

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row which contains string "Car_Name", "Make", "Model",
        "Engine_Type", and "Fuel_Type" fields.

    Returns
    -------
    str
        The cleaned "Car_Name" string (remaining words joined by single spaces).

    Example
    --------
    >>> clean_name(pd.Series({"Car_Name": "Toyota Camry 2.5L Petrol", "Make": "Toyota",
                              "Model": "Camry", "Engine_Type": "2.5L", "Fuel_Type": "Petrol"}))
    ''
    """
    fuel_type = row["Fuel_Type"]

    # Collect every word that is already represented in another column.
    redundant_words = set(row["Make"].split())
    redundant_words.update(row["Model"].split())
    redundant_words.update(row["Engine_Type"].split())

    # Compare against fuel-type *words*, not against the raw string: a plain
    # `word in fuel_type` is a substring test and would also drop unrelated short
    # tokens such as "e" or "s" (both substrings of "diesel").
    redundant_words.add(fuel_type)
    redundant_words.update(fuel_type.replace("+", " ").split())

    return " ".join(
        word for word in row["Car_Name"].split() if word not in redundant_words
    )

# Usage
df["Car_Name"] = df.apply(clean_name, axis=1)


Remove drive train info from engine type, put in new drive train column:

In [None]:
df["drive_train"] = None

In [None]:
def move_drive_train(row):
    """
    Pulls the drive train phrase out of the "Engine_Type" value of a row.

    The phrase looked for is one word followed by "wheel drive"
    (e.g. 'four wheel drive', 'rear wheel drive'). The row is not modified.

    Parameters
    ----------
    row : pandas.Series
        A DataFrame row which contains a string "Engine_Type" field.

    Returns
    -------
    str or None
        The first matching phrase, or None when there is no match.

    Example
    --------
    >>> move_drive_train(pd.Series({"Engine_Type": "V8 four wheel drive"}))
    'four wheel drive'
    """
    found = re.search(r"\b\w+\s+wheel\s+drive\b", row["Engine_Type"])
    return found.group(0) if found else None

# Usage
df["drive_train"] = df.apply(move_drive_train, axis=1)


Remove drive train from engine type: 

In [None]:
def remove_drive_train(df):
    """
    Deletes drive train phrases from the "Engine_Type" column.

    Every "<word> wheel drive" phrase (e.g. 'four wheel drive') is removed from
    each value. The column is overwritten on the DataFrame that was passed in,
    and that same DataFrame is returned. Surrounding whitespace is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame which contains a string "Engine_Type" column.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with the "Engine_Type" column cleaned.

    Example
    --------
    >>> df = pd.DataFrame({"Engine_Type": ["V8 four wheel drive", "V6 rear wheel drive"]})
    >>> remove_drive_train(df)
    """
    wheel_drive = re.compile(r"\b\w+\s+wheel\s+drive\b")

    def _without_drive_train(engine_text):
        return wheel_drive.sub("", engine_text)

    df["Engine_Type"] = df["Engine_Type"].apply(_without_drive_train)
    return df

# Usage
df = remove_drive_train(df)


To further understand how to standardize and constrain columns, I'll inspect all the unique values for relevant columns:

In [None]:
# Print the frequency of every distinct value, one column at a time.
for column_name, column_values in df.items():
    print(f"Column: {column_name}")
    print(column_values.value_counts())

Now I'll convert owner ordinals to an int:

In [None]:
def owner_to_int(df):
    """
    Turns the ordinal strings in 'No_of_Owners' into numbers.

    '1st' through '5th' become 1 through 5. Any value outside that set becomes
    NaN, because the conversion is a dictionary lookup via ``Series.map``. The
    column is overwritten on the DataFrame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame which contains a 'No_of_Owners' column.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with 'No_of_Owners' converted.

    Example
    --------
    >>> df = pd.DataFrame({"No_of_Owners": ["1st", "2nd", "3rd"]})
    >>> owner_to_int(df)
    """
    ordinals = ('1st', '2nd', '3rd', '4th', '5th')
    ordinal_to_count = {label: count for count, label in enumerate(ordinals, start=1)}

    df['No_of_Owners'] = df['No_of_Owners'].map(ordinal_to_count)
    return df

# Usage
df = owner_to_int(df)


Replace "+" with "_" in the fuel type field:

In [None]:
def shorten_fuel_type(df):
    """
    Replaces the '+' in combined fuel types of the 'Fuel_Type' column.

    'petrol+cng' becomes 'petrol_cng' and 'petrol+electric' becomes
    'petrol_electric'. All other values (e.g. 'petrol', 'diesel') are left
    unchanged. The column is overwritten on the DataFrame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame which contains a 'Fuel_Type' column.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with modified 'Fuel_Type' column.

    Example
    --------
    >>> df = pd.DataFrame({"Fuel_Type": ["petrol", "petrol+cng", "petrol+electric"]})
    >>> shorten_fuel_type(df)["Fuel_Type"].tolist()
    ['petrol', 'petrol_cng', 'petrol_electric']
    """
    # Mapping dictionary
    map_dict = {'petrol+cng': 'petrol_cng', 'petrol+electric': 'petrol_electric'}

    # Series.replace only touches values that are keys of the dict. Series.map
    # would turn every value that is NOT a key (petrol, diesel, ...) into NaN.
    df['Fuel_Type'] = df['Fuel_Type'].replace(map_dict)

    return df

# Usage
df = shorten_fuel_type(df)


Now I will shorten the transmission gears field to 1 character:

In [None]:
def shorten_transmission_gears(df):
    """
    Converts the 'Transmission' column from '<n>-speed' labels to gear counts.

    '7-speed', '6-speed', '5-speed' and '4-speed' become 7, 6, 5 and 4. Any other
    value becomes NaN, because the conversion is a dictionary lookup via
    ``Series.map``. The column is overwritten on the DataFrame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame which contains a 'Transmission' column.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with modified 'Transmission' column.

    Example
    --------
    >>> df = pd.DataFrame({"Transmission": ["7-speed", "6-speed"]})
    >>> shorten_transmission_gears(df)
    """
    gear_counts = {f"{gears}-speed": gears for gears in (7, 6, 5, 4)}

    df['Transmission'] = df['Transmission'].map(gear_counts)
    return df

# Usage
df = shorten_transmission_gears(df)

Shorten the drive train field:

In [None]:
def shorten_drive_train(df):
    """
    Abbreviates the values in the 'drive_train' column.

    'front wheel drive' -> 'fwd', 'rear wheel drive' -> 'rwd',
    'all wheel drive' -> 'awd', 'four wheel drive' -> '4wd'. Any other value
    becomes NaN, because the conversion is a dictionary lookup via
    ``Series.map``. The column is overwritten on the DataFrame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame which contains a 'drive_train' column.

    Returns
    -------
    pandas.DataFrame
        The input DataFrame with modified 'drive_train' column.

    Example
    --------
    >>> df = pd.DataFrame({"drive_train": ["front wheel drive", "rear wheel drive", "all wheel drive", "four wheel drive"]})
    >>> shorten_drive_train(df)
    """
    abbreviations = {
        'front wheel drive': 'fwd',
        'rear wheel drive': 'rwd',
        'all wheel drive': 'awd',
        'four wheel drive': '4wd',
    }

    shortened = df['drive_train'].map(abbreviations)
    df['drive_train'] = shortened
    return df

# Usage
df = shorten_drive_train(df)

Strip whitespace and commas:

In [None]:
# Tidy the engine descriptions in one pass. Commas are removed first, then any
# run of whitespace (however long) is collapsed to a single space, and only
# then are the ends trimmed. Trimming last means whitespace exposed by removing
# a leading/trailing comma does not survive.
df["Engine_Type"] = (
    df["Engine_Type"]
    .str.replace(",", "", regex=False)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

Inspecting again:

In [None]:
import pandas as pd

# Set the max number of rows to None (no limit)
pd.set_option('display.max_rows', None)

# Print the frequency of every distinct value, one column at a time.
for column_name, column_values in df.items():
    print(f"\nValue counts for {column_name}:")
    print(column_values.value_counts())

Found errant data in the `Mileage(kmpl)` (fuel economy) field. Adding it to the "to_do" list to address with the data owner/stakeholders.

In [None]:
df[df["Mileage(kmpl)"] == "bs iv"]

Changing df column names to be more appropriate for the db:

In [None]:
# Source column name -> column name of the `cars` table.
DB_COLUMN_NAMES = {
    'Car_Name': 'name',
    'Make': 'make',
    'Model': 'model',
    'Make_Year': 'year',
    'Color': 'color',
    'Body_Type': 'body_style',
    'Mileage_Run': 'mileage',
    'No_of_Owners': 'num_owners',
    'Seating_Capacity': 'seating_capacity',
    'Fuel_Type': 'fuel_type',
    'Fuel_Tank_Capacity(L)': 'fuel_capacity',
    'Engine_Type': 'engine_type',
    'CC_Displacement': 'cc_displacement',
    'Transmission': 'transmission_gears',
    'Transmission_Type': 'transmission_type',
    'Power(BHP)': 'bhp',
    'Torque(Nm)': 'torque',
    'Mileage(kmpl)': 'fuel_economy',
    'Emission': 'emission_class',
    'Price': 'price',
    # The two derived columns already carry their final names.
    'engine_litres': 'engine_litres',
    'drive_train': 'drive_train',
}

df = df.rename(columns=DB_COLUMN_NAMES)

Dropping duplicates again, after cleaning:

In [None]:
df = df.drop_duplicates()

Checking max len of each column in the df:

In [None]:
# Length of the longest stringified value in each column.
max_len_dict = {}
for column_name, column_values in df.items():
    max_len_dict[column_name] = column_values.astype(str).map(len).max()

print(max_len_dict)

Connecting to postgres db:

In [None]:
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
import os
from dotenv import load_dotenv

# Read DB_USER / DB_PASSWORD from a local .env file (if present) into the environment.
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# Build the connection URL from its components instead of formatting a string:
# a password containing characters such as "@", ":" or "/" would otherwise be
# misparsed as part of the URL structure.
DB_URL = URL.create(
    "postgresql",
    username=DB_USER,
    password=DB_PASSWORD,
    host="localhost",
    database="car_db",
)

engine = create_engine(DB_URL)

# (Re)create the target table. Any existing `cars` table is dropped first.
with engine.connect() as conn:
    conn.execute(text("""
        DROP TABLE IF EXISTS cars;
        CREATE TABLE cars (
            name VARCHAR(64),
            make VARCHAR(64),
            model VARCHAR(64),
            year SMALLINT,
            color VARCHAR(12),
            body_style VARCHAR(24),
            mileage INTEGER,
            num_owners SMALLINT,
            seating_capacity SMALLINT,
            fuel_type VARCHAR(24),
            fuel_capacity SMALLINT,
            engine_type VARCHAR(255),
            cc_displacement SMALLINT,
            transmission_gears VARCHAR(3),
            transmission_type VARCHAR(10),
            bhp REAL,
            torque REAL,
            fuel_economy VARCHAR(24),
            emission_class VARCHAR(10),
            price INTEGER,
            engine_litres REAL,
            drive_train VARCHAR(3)
        );
    """))
    conn.commit()


## In the above cell, fuel_economy should be a real, but there are two entries that contain strings and have yet to be fixed


Loading data from df into postgresdb:

In [None]:

from sqlalchemy.engine import URL

# Create sqlalchemy engine. The URL is assembled from components so that
# credentials containing characters such as "@", ":" or "/" are not misparsed.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=DB_USER,
        password=DB_PASSWORD,
        host="localhost",
        database="car_db",
    )
)

# Append the cleaned rows to the existing `cars` table (the DataFrame index is not written).
df.to_sql('cars', engine, if_exists='append', index=False)


Checking to see if loaded:

In [None]:
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

Insert serial int primary key:

In [None]:
# Add an auto-incrementing surrogate key to the loaded table.
ADD_PRIMARY_KEY_SQL = text("ALTER TABLE cars ADD COLUMN id SERIAL PRIMARY KEY;")

with engine.connect() as conn:
    conn.execute(ADD_PRIMARY_KEY_SQL)
    conn.commit()

In [None]:
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM cars LIMIT 10"))
    for row in result_set:
        print(row)

In the cell below, I drop the Toyota Yaris rows with the dual vvt-i engine and a cc displacement of 1596, because the displacement of this engine is actually 1496 and these rows duplicate other rows that already carry the correct value.

In [None]:
# Delete the Toyota Yaris (dual vvt-i) rows recorded with a displacement of 1596.
DELETE_BAD_YARIS_SQL = text("""
        DELETE FROM public.cars
        WHERE model = 'yaris' 
        AND make = 'toyota'
        AND engine_type = 'dual vvt-i'
        AND cc_displacement = 1596
        """)

with engine.connect() as conn:
    conn.execute(DELETE_BAD_YARIS_SQL)
    conn.commit()

## Creating a read-only user:

In [None]:
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
import os
from dotenv import load_dotenv

# Read DB_USER / DB_PASSWORD from a local .env file (if present) into the environment.
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# Build the connection URL from its components so that credentials containing
# characters such as "@", ":" or "/" are not misparsed as URL structure.
engine = create_engine(
    URL.create(
        "postgresql",
        username=DB_USER,
        password=DB_PASSWORD,
        host="localhost",
        database="car_db",
    )
)

# Create the `car_reader` role and give it SELECT-only access to the public
# schema (existing tables and, via default privileges, tables created later).
# NOTE(review): the role's password is hard-coded in the statement below;
# consider supplying it from the environment instead.
# NOTE(review): CREATE USER is not guarded, so re-running this cell once the
# role exists is expected to raise; confirm whether that is acceptable.
with engine.connect() as conn:
    conn.execute(text("""
        CREATE USER car_reader WITH PASSWORD 'read_only';
        GRANT CONNECT ON DATABASE car_db TO car_reader;
        GRANT USAGE ON SCHEMA public TO car_reader;
        GRANT SELECT ON ALL TABLES IN SCHEMA public TO car_reader;
        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO car_reader;
        """))
    conn.commit()

In [None]:
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM pg_roles"))
    for row in result_set:
        print(row)