In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

import plotly.express as px
import folium

from datetime import datetime, date, timedelta
import sqlalchemy
from sqlalchemy import create_engine
import creds_panoply

from uszipcode import SearchEngine

## Data Import: Panopply - Household First Orders View

In [2]:
# Creating connection - using credentials from other file
POSTGRES_ADDRESS = creds_panoply.address
POSTGRES_PORT = creds_panoply.port
POSTGRES_USERNAME = creds_panoply.username
POSTGRES_PASSWORD = creds_panoply.pw
POSTGRES_DBNAME = creds_panoply.dbname

postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DBNAME))

cnx = create_engine(postgres_str)

In [3]:

tbl_name = 'household_report_intermediate_views.household_first_orders_view'
sel_fields = 'household_hash, first_order_date'
sql_query = "SELECT " + sel_fields + " FROM " + tbl_name
df_hh = pd.read_sql_query(sql_query, cnx)

df_hh.head(3)

Unnamed: 0,household_hash,first_order_date
0,00004be666a66763b494d48fcf3148df,2019-10-25
1,000074da6127f8f197f780701d7585e6,2018-08-09
2,00007c52e6801912b89bb5d790ae3729,2019-04-08


## Data Manipulation

In [4]:
# Adding another column for month
df_hh['monthly_cohort'] = pd.to_datetime(df_hh['first_order_date']).dt.strftime('%Y-%m')
df_hh.head()

Unnamed: 0,household_hash,first_order_date,monthly_cohort
0,00004be666a66763b494d48fcf3148df,2019-10-25,2019-10
1,000074da6127f8f197f780701d7585e6,2018-08-09,2018-08
2,00007c52e6801912b89bb5d790ae3729,2019-04-08,2019-04
3,00007f1381425fe513b0d951341a4ef0,2018-09-10,2018-09
4,00008643eef23cddc6cec00cabcfe457,2019-05-21,2019-05


### Plot - Monthly new cohorts

In [5]:
# Table for new households
df_hh_monthly = df_hh.groupby('monthly_cohort').count()['household_hash'].reset_index()
df_hh_monthly.rename(columns={"household_hash":"NumOfHouseholds"}, inplace=True)
df_hh_monthly = df_hh_monthly[df_hh_monthly['monthly_cohort'] != "NaT"]
df_hh_monthly.head(3)

Unnamed: 0,monthly_cohort,NumOfHouseholds
0,2017-10,48
1,2017-11,371
2,2017-12,296


In [6]:
fig1 = px.bar(df_hh_monthly,
                y="NumOfHouseholds",
                x="monthly_cohort",
                title="Number of new Households by Month",
                
                template="ggplot2")
fig1.show()
#filename_prefix = datetime.now().strftime("%y%m%d")
filename_prefix="200526"
filename=filename_prefix+"_New_households_by_month.png"
output_location="./output/"+filename
#fig1.to_image(format='png')


## Plot - households by day

In [7]:
df_hh_daily = df_hh.groupby('first_order_date').count()['household_hash'].reset_index()
df_hh_daily.rename(columns={"household_hash":"# of Households"}, inplace=True)
df_hh_daily.head(3)

Unnamed: 0,first_order_date,# of Households
0,2017-10-24,4
1,2017-10-25,3
2,2017-10-26,10


In [8]:
fig2 = px.line(data_frame=df_hh_daily,
                x="first_order_date",
                y="# of Households",
                title="Number of new households per day")
fig2.show()

## Plot - trailing 30 days

In [9]:
n = 30
date_cutoff = date.today() - timedelta(n)
date_filter = df_hh['first_order_date']>=date_cutoff

df_trailing30 = df_hh[date_filter]
df_trailing30

TypeError: Invalid comparison between dtype=datetime64[ns] and date

In [10]:
df_trailing30 = df_trailing30.groupby('first_order_date').count()['household_hash'].reset_index()
df_trailing30.rename(columns={"household_hash":"# of Households"}, inplace=True)
df_trailing30.head()

NameError: name 'df_trailing30' is not defined

In [11]:
# Plot
fig3 = px.line(data_frame=df_trailing30,
                x="first_order_date",
                y="# of Households",
                title="Number of new households per day")
fig3.show()

NameError: name 'df_trailing30' is not defined