In [1]:
# Import Dependencies
import pandas as pd
import sqlite3 as sq

In [2]:
# Read amd store all csv files
rides_df = pd.read_csv('rides_raw.csv')
riders_df = pd.read_csv('riders_raw.csv')
drivers_df = pd.read_csv('drivers_raw.csv')
payments_df = pd.read_csv('payments_raw.csv')

# Create a Database Connection
conn = sq.connect('hng_ride.db')

# Write Dataframes to SQL Tables
#rides_df.to_sql('rides', conn, if_exists='replace', index=False)
#riders_df.to_sql('riders', conn, if_exists='replace', index=False)
#drivers_df.to_sql('drivers', conn, if_exists='replace', index=False)
#payments_df.to_sql('payments', conn, if_exists='replace', index=False)

##### Checking Rides Table for inconsistencies

In [4]:
# Get all table information
table_info_query = """
    PRAGMA table_info(rides)
    """
df_result = pd.read_sql_query(table_info_query, conn)
df_result

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ride_id,INTEGER,0,,0
1,1,rider_id,INTEGER,0,,0
2,2,driver_id,INTEGER,0,,0
3,3,request_time,TIMESTAMP,0,,0
4,4,pickup_time,TIMESTAMP,0,,0
5,5,dropoff_time,TIMESTAMP,0,,0
6,6,pickup_city,TEXT,0,,0
7,7,dropoff_city,TEXT,0,,0
8,8,distance_km,REAL,0,,0
9,9,status,TEXT,0,,0


In [7]:

query = "SELECT * FROM rides"

df = pd.read_sql(
    sql=query,
    con=conn,
    parse_dates={
        'request_time': {'format': '%m/%d/%Y %H:%M'},
        'pickup_time': {'format': '%m/%d/%Y %H:%M'},
        'dropoff_time': {'format': '%m/%d/%Y %H:%M'}
    }
)

df.to_sql('rides', conn, if_exists='replace', index=False)

conn.commit()

print("\nData types:")
print(df.dtypes)

DatabaseError: Execution failed on sql 'DROP TABLE "rides"': database is locked

In [8]:
query = "SELECT * FROM payments"

df = pd.read_sql(
    sql=query,
    con=conn,
    parse_dates={'paid_date': {'format': 'mixed', 'errors': 'coerce'}}
)
df.to_sql('payments', conn, if_exists='replace', index=False)
conn.commit()
print(df)
print("\nData types:")
print(df.dtypes)


       payment_id  ride_id  amount   method           paid_date
0               1        1   32.26  pay pal 2022-01-27 10:42:00
1               4        4   26.73     cash 2021-02-26 03:34:00
2               5        5   55.85     card 2024-08-17 03:43:00
3               6        6    7.20  voucher 2024-09-22 13:34:00
4               7        7   26.40     card 2021-05-26 07:58:00
...           ...      ...     ...      ...                 ...
39413       49993    49993   25.14     card 2021-10-20 15:31:00
39414       49994    49994   62.53   paypal 2023-10-24 14:53:00
39415       49995    49995   65.95   paypal 2022-04-16 16:33:00
39416       49996    49996   43.83   paypal 2023-12-04 07:36:00
39417       49997    49997   47.13   paypal 2024-06-16 09:23:00

[39418 rows x 5 columns]

Data types:
payment_id             int64
ride_id                int64
amount               float64
method                object
paid_date     datetime64[ns]
dtype: object


Checking status feature for inconsistencies

In [5]:
# Checking status feature for inconsistencies
query = """
    SELECT DISTINCT status FROM rides
    """
df_result = pd.read_sql_query(query, conn)
df_result

Unnamed: 0,status
0,completed
1,cancelled


In [52]:
# Correct spelling "complted" to "completed"
update_query = """
    UPDATE rides
    SET status = "completed"
    WHERE status = "complted"
    """
try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(update_query )

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

Successfully updated 955 rows.


In [53]:
# Checking status feature for spelling correction
query = """
    SELECT DISTINCT status FROM rides
    """
df_result = pd.read_sql_query(query, conn)
df_result

Unnamed: 0,status
0,completed
1,cancelled


In [54]:
# Rehecking status feature for inconsistencies
query = """
    SELECT DISTINCT status FROM rides
    """
df_result = pd.read_sql_query(query, conn)
df_result

Unnamed: 0,status
0,completed
1,cancelled


Checking fare for inconsistencies

In [55]:
#  Checking fare feature for payment inconsistencies
find_query = """
    SELECT fare
    FROM rides
    WHERE fare <= 0
    """

df_result = pd.read_sql_query(find_query, conn)
df_result

Unnamed: 0,fare
0,-18.00
1,-0.85
2,-6.54
3,-21.70
4,-28.87
...,...
648,-32.33
649,-0.50
650,-40.02
651,-19.04


In [56]:
# Delete from fare feature where payments are below or equal to zero

delete_query = """
    DELETE FROM rides
    WHERE fare <= 0
    """

try:
    # Create a cursor Object
    cursor = conn.cursor()

    # Execute the cursor
    cursor.execute(delete_query)

    # Commit chnages
    conn.commit()

    row_count = cursor.rowcount
    print(f"Successfully deleted {row_count} rows.")
except Exception as e:
    # If error, indicate and print the error name
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

Successfully deleted 653 rows.


In [57]:
# Re-run the count query
invalid_count = pd.read_sql_query(find_query, conn)

print("--- Verification ---")
print(invalid_count)


--- Verification ---
Empty DataFrame
Columns: [fare]
Index: []


Checking pickup_city for inconsistencies

In [58]:
query = """
    SELECT DISTINCT pickup_city
    FROM rides
    """

df_result = pd.read_sql_query(query, conn)
df_result

Unnamed: 0,pickup_city
0,San Francisco
1,Calgary
2,New York
3,Vancouver
4,Los Angeles
5,Ottawa
6,Toronto
7,Montreal
8,Chicago
9,Boston


Checking dropoff_city for inconsistencies

In [59]:
query = """
    SELECT DISTINCT dropoff_city
    FROM rides
    """

df_result = pd.read_sql_query(query, conn)
df_result

Unnamed: 0,dropoff_city
0,Vancouver
1,Toronto
2,Chicago
3,San Francisco
4,Boston
5,Montreal
6,Ottawa
7,Los Angeles
8,Calgary
9,New York


In [60]:
query = """
    SELECT DISTINCT max(distance_km)
    FROM rides
    """

df_result = pd.read_sql_query(query, conn)
df_result

Unnamed: 0,max(distance_km)
0,30.0


In [61]:
alter_query = """
    ALTER TABLE rides 
    ADD COLUMN request_time_new DATETIME;
    """

try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(alter_query)

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

Successfully updated -1 rows.


In [62]:
alter_query = """
    ALTER TABLE rides 
    ADD COLUMN pickup_time_new DATETIME;
    """

try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(alter_query)

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

Successfully updated -1 rows.


In [63]:
alter_query = """
    ALTER TABLE rides 
    ADD COLUMN dropoff_time_new DATETIME;
    """

try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(alter_query)

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

Successfully updated -1 rows.


Converting date stamps to date

In [66]:
cast_query = """
    UPDATE rides
    SET
    request_time_new = CAST(request_time AS DATETIME),
    pickup_time_new = CAST(pickup_time AS DATETIME),
    dropoff_time_new = CAST(dropoff_time AS DATETIME)
    """
try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(cast_query)

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()


Successfully updated 49347 rows.


In [65]:
# Get all table information
table_info_query = """
    PRAGMA table_info(rides);
    """
df_result = pd.read_sql_query(table_info_query, conn)
df_result

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ride_id,INTEGER,0,,0
1,1,rider_id,INTEGER,0,,0
2,2,driver_id,INTEGER,0,,0
3,3,request_time,TEXT,0,,0
4,4,pickup_time,TEXT,0,,0
5,5,dropoff_time,TEXT,0,,0
6,6,pickup_city,TEXT,0,,0
7,7,dropoff_city,TEXT,0,,0
8,8,distance_km,REAL,0,,0
9,9,status,TEXT,0,,0


In [67]:
# Get all table information
table_info_query = """
    PRAGMA table_info(drivers);
    """
df_result = pd.read_sql_query(table_info_query, conn)
df_result

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,driver_id,REAL,0,,0
1,1,name,TEXT,0,,0
2,2,city,TEXT,0,,0
3,3,signup_date,TEXT,0,,0
4,4,rating,REAL,0,,0


In [71]:
alter_query = """
    ALTER TABLE drivers
    ADD COLUMN signup_date_new DATETIME
    """

try:
    
    cursor = conn.cursor()

    cursor.execute(alter_query)

    conn.commit()

except Exception as e:
    print(f"An error occured: {e}.")
    conn.rollback
    
finally:
    cursor.close()

    

In [76]:
cast_query = """
    UPDATE drivers
    SET
    signup_date_new = CAST(signup_date AS DATETIME)
    """
try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(cast_query)

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()


Successfully updated 2002 rows.


In [77]:
alter_query = """
    ALTER TABLE riders
    ADD COLUMN signup_date_new DATETIME
    """

try:
    
    cursor = conn.cursor()

    cursor.execute(alter_query)

    conn.commit()

except Exception as e:
    print(f"An error occured: {e}.")
    conn.rollback
    
finally:
    cursor.close()


In [7]:
cast_query = """
    UPDATE riders
    SET
    signup_date_new =  CONVERT(DATETIME, signup_date, 101);
    """
try:
    # Create a cursor object
    cursor = conn.cursor()

    # Execute the update
    cursor.execute(cast_query)

    # Commit the changes to the database
    conn.commit()

    rows_changed = cursor.rowcount
    print(f"Successfully updated {rows_changed} rows.")

except Exception as e:
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

An error occured: name 'conn' is not defined


NameError: name 'cursor' is not defined

In [80]:
#  Checking payments feature for payment inconsistencies
find_query = """
    SELECT amount
    FROM payments
    WHERE amount <= 0
    """

df_result = pd.read_sql_query(find_query, conn)
df_result

Unnamed: 0,amount
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
10577,0.0
10578,0.0
10579,0.0
10580,0.0


In [82]:
# Delete from paymets feature where payments are below or equal to zero

delete_query = """
    DELETE FROM payments
    WHERE amount <= 0
    """

try:
    # Create a cursor Object
    cursor = conn.cursor()

    # Execute the cursor
    cursor.execute(delete_query)

    # Commit chnages
    conn.commit()

    row_count = cursor.rowcount
    print(f"Successfully deleted {row_count} rows.")
except Exception as e:
    # If error, indicate and print the error name
    print(f"An error occured: {e}")
    conn.rollback()

finally:
    # Close the Cursor
    cursor.close()

Successfully deleted 10582 rows.


In [83]:
df_result = pd.read_sql_query(find_query, conn)
df_result

Unnamed: 0,amount


### Find the top 10 longest rides (by distance), including driver name, rider name, pickup/dropoff cities, and payment method.

In [1]:
%pip install jupysql prettytable --upgrade

Note: you may need to restart the kernel to use updated packages.


In [2]:
%load_ext sql

In [3]:
# This creates the connection for the %%sql magic
%sql sqlite:///hng_ride.db

In [12]:
%%sql
UPDATE drivers
SET signup_date_new =  CONVERT(DATETIME, signup_date, 101);

RuntimeError: (sqlite3.OperationalError) no such column: DATETIME
[SQL: UPDATE drivers
SET signup_date_new =  CONVERT(DATETIME, signup_date, 101);]
(Background on this error at: https://sqlalche.me/e/20/e3q8)
