In [1]:
import mysql.connector
import pandas as pd 
import numpy as np 
from config import HOST, DATABASE, USERNAME, PASSWORD

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [37]:
# connecting to grocery db

try:
    connection = mysql.connector.connect(
        host=HOST,
        database=DATABASE,
        user=USERNAME,
        password=PASSWORD
    )

    if connection.is_connected():
        print("Connected to MySQL database")

        # Your database operations go here

except mysql.connector.Error as err:
    print(f"Error: {err}")

# finally:
    # Close the connection in the finally block to ensure it's always closed
    #if 'connection' in locals() and connection.is_connected():
    #    connection.close()
    #    print("Connection closed")

cursor = connection.cursor() 

Connected to MySQL database


<mysql.connector.cursor_cext.CMySQLCursor at 0x1e6a4e3a720>

In [16]:
stores = ("SELECT * FROM STORE")
cursor.execute(stores)

In [17]:
store_data =[]
for i in cursor: 
    store_data.append(i)

col_names = [i[0] for i in cursor.description]

df = pd.DataFrame(store_data, columns = col_names)

df


Unnamed: 0,id,name,address
0,1,Fred Meyer,3805 SE Hawthorne Blvd
1,2,Winco,7979 SE Powell Blvd
2,3,US FOODS CHEFSTORE,731 SE Stephens St
3,4,Trader Joes,4121 NE Halsey St
4,5,Whole Foods,2825 E Burnside St
5,6,Target,3031 SE Powell Blvd
6,7,New Seasons Market,1954 SE Division St
7,8,Costco,"13130 SE 84th Ave, Clackamas, OR 97015"
8,9,Fred Meyer,6615 NE Glisan St
9,10,New Seasons Market,3210 NE Broadway


In [57]:
# okay I have successfully queried data from the mysql db and stored it in a pandas dataframe
#let's formalize this process by creating a function

In [79]:
#making sure I know how to parameterize a query...while avoiding an sql injection
def table_grabber(table):
    
    query = "SELECT * from {}"
    query_ready = query.format(table)

    cursor.execute(query_ready)

    data =[] # empty list to dump queried data into
    for x in cursor: 
        data.append(x)
    
    col_names = [i[0] for i in cursor.description]

    df = pd.DataFrame(data, columns = col_names)

    return df


In [80]:
# my lovely tables 
stores = table_grabber("store")
trips = table_grabber("trips")
items = table_grabber("items")

In [81]:
#what is average cost per grocery trip ? 

trips['total_cost'].mean() 

#$ 40.20 -- this aligns with my intuition. As a child I would accompany my mom to buy groceries. Her budget was usually $40. 

40.189756097560974

In [61]:
#how about average monthly grocery expenses? 

# I could do this easily in R with %>% and group_by(), but I need to refresh my pandas skills....

#first I need to make sure the date column in trips is a datetime object: 

trips.dtypes

id             int64
store_id       int64
date          object
total_cost    object
dtype: object

In [62]:
trips['date'] = pd.to_datetime(trips['date'])

In [63]:
trips.groupby(trips['date'].dt.strftime('%B'))['total_cost'].sum()

#dt.strftime is the accessor in pandas to grab a part of the datetime object.'%B' is the full month name
# here are some other options 

'''
    %Y: Year with century as a decimal number (e.g., 2022).
    %m: Month as a zero-padded decimal number (01, 02, ..., 12).
    %d: Day of the month as a zero-padded decimal number (01, 02, ..., 31).
    %H: Hour (00, 01, ..., 23).
    %M: Minute (00, 01, ..., 59).
    %S: Second (00, 01, ..., 59).
'''

# for example, the average per year: 

trips.groupby(trips['date'].dt.strftime('%Y'))['total_cost'].mean()

date
2023    39.411429
2024    41.866154
Name: total_cost, dtype: object

In [64]:
# group by week: 

trips.groupby(trips['date'].dt.strftime('%U'))['total_cost'].sum()

date
00    119.82
02     37.10
03    144.12
04     74.42
05    127.83
06     40.97
36     18.65
37     87.11
38    157.14
39     79.44
40    115.12
41     15.47
42     35.53
43     46.03
44      9.97
45    105.22
46     20.60
47    118.14
48     85.66
49     14.38
50     96.07
51     98.99
Name: total_cost, dtype: object

In [65]:
# How many trips per week on average do I go to the grocery store? 

trips.groupby(trips['date'].dt.strftime('%B'))['id'].count()

date
December     6
February     6
January      7
November     8
October      7
September    7
Name: id, dtype: int64

In [66]:
trips.count()

id            41
store_id      41
date          41
total_cost    41
dtype: int64

In [67]:
#summary statistics of trips table: 
trips.describe()

# september 7 to February 2016 





Unnamed: 0,id,store_id,date
count,41.0,41.0,41
mean,21.926829,4.560976,2023-11-27 15:48:17.560975616
min,1.0,1.0,2023-09-07 00:00:00
25%,11.0,2.0,2023-10-15 00:00:00
50%,21.0,4.0,2023-11-27 00:00:00
75%,33.0,8.0,2024-01-18 00:00:00
max,43.0,10.0,2024-02-16 00:00:00
std,12.861552,3.162663,


In [68]:
#how many days/months does this dataset cover ? 
time_span = trips['date'].max() - trips['date'].min()

# duration in months - a bit over 5 months of shopping data 
time_span.days / 30



5.4

In [38]:
# let's do something fun. I want to know the frequency of trips per grocery store. 
# to do this, I will need to join trips with store.

trip_freq_per_store = "SELECT s.name, count(t.date) FROM store as s JOIN trips as t ON s.id = t.store_id GROUP BY name"

cursor.execute(trip_freq_per_store)


In [50]:
#generalizing table grabber to accept any query, also using cursor.fetchall() command

def table_grabber2(query):

    cursor.execute(query)

    query_data = cursor.fetchall()
    
    col_names = [i[0] for i in cursor.description]

    df = pd.DataFrame(data, columns = col_names)

    return df

In [51]:
table_grabber2(trip_freq_per_store)

Unnamed: 0,name,count(t.date)
0,Fred Meyer,9
1,Winco,11
2,US FOODS CHEFSTORE,1
3,Trader Joes,3
4,Whole Foods,1
5,Target,1
6,New Seasons Market,4
7,Costco,11
