In [None]:
###--- Standard Imports ---###
import os
import pandas as pd
import numpy as np
from datetime import datetime as dt

#Bokeh is used for plotting histograms
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

#Locate the input file relative to the current working directory
FILE = 'Customers_Enhanced.csv' #Name of the input file
PATH = os.getcwd() #Current working directory
FULL_PATH = os.path.join(PATH, FILE) #Directory and file name joined into a single path

#A function for plotting histograms
def quick_hist(an_array, title='Default Title',y_lbl='Y Axis', x_lbl='X Axis',bins=5):
    """Show a Bokeh histogram of ``an_array`` and return the binned data.

    Parameters:
        an_array: values to bin (anything ``np.histogram`` accepts).
        title: figure title.
        y_lbl: label for the vertical axis.
        x_lbl: label for the horizontal axis.
        bins: forwarded to ``np.histogram`` as its ``bins`` argument.

    Returns:
        (counts, bin_edges) exactly as produced by ``np.histogram``.
    """
    counts, bin_edges = np.histogram(an_array, bins=bins)

    #Each bar spans from one edge to the next, so there is one more edge than bars
    left_edges = bin_edges[:-1]
    right_edges = bin_edges[1:]

    plot = figure(
        width=700,
        height=400,
        title=title,
        y_axis_label=y_lbl,
        x_axis_label=x_lbl,
    )
    plot.quad(
        top=counts,
        bottom=0,
        left=left_edges,
        right=right_edges,
        fill_color="#606c75",
        line_color="white",
    )
    show(plot)
    return counts, bin_edges

#Print basic descriptive statistics from a 1D array
def basic_stats(array, title='Default'):
    """Print and return basic descriptive statistics for one-dimensional data.

    Parameters:
        array: a pandas Series, or any 1D sequence (list, NumPy array, ...).
            Inputs that have no ``.median()`` method are wrapped in a
            ``pd.Series`` first so they can be summarised the same way.
        title: heading printed above the statistics.

    Returns:
        (min, median, mean, max), with the mean rounded to 2 decimals.
        The standard deviation is printed but is not part of the return value.
    """
    #Lists and NumPy arrays have no .median(); pandas objects are used as-is
    values = array if hasattr(array, 'median') else pd.Series(array)

    #Compute every statistic once and reuse it for both printing and returning
    minimum = values.min()
    median = values.median()
    mean = round(values.mean(), 2)
    std_dev = round(values.std(), 2) #pandas default: sample std (ddof=1)
    maximum = values.max()

    print(title)
    print('Min:', minimum)
    print('Median:', median)
    print('Avg:', mean)
    print('Std Dev:', std_dev)
    print('Max:', maximum)
    return minimum, median, mean, maximum

#A function for specifically grabbing a single row from a DataFrame
def get_one_row(dataframe, row_name='Plan'):
    """Return the values of the row labelled ``row_name`` as a plain list.

    Parameters:
        dataframe: DataFrame to read from.
        row_name: index label of the row to extract.
    """
    selected_row = dataframe.loc[row_name, :]
    return list(selected_row)

#Find year-over-year percentage change from a given list of values
def yoy_pct_change(list_of_values):
    """Return the percentage change between each pair of consecutive values.

    Entry ``k`` of the result is the change from ``list_of_values[k]`` to
    ``list_of_values[k+1]``, expressed as a percentage of the earlier value
    and rounded to 2 decimals. The result therefore has one element fewer
    than the input (and is empty for inputs of length 0 or 1).
    """
    changes = []
    for position in range(1, len(list_of_values)):
        previous = list_of_values[position - 1]
        current = list_of_values[position]
        changes.append(round(((current - previous) / previous) * 100, 2))
    return changes

#Plot a line graph of a given set of x-values (and y-values if you give them)
def lineplot(x_values, title='Title',x_lbl='X-Axis', y_lbl='Y-Axis',**kwargs):
    """Draw and show a matplotlib line chart of ``x_values``.

    Note that, despite its name, ``x_values`` is the series drawn along the
    vertical axis; the horizontal positions come from ``y_values``.

    Parameters:
        x_values: the data series to plot.
        title: chart title.
        x_lbl: label for the horizontal axis.
        y_lbl: label for the vertical axis.

    Optional keyword arguments:
        y_values: horizontal positions of the points
            (default: 0, 1, ..., len(x_values)-1).
        size: figure size handed to ``plt.figure`` (default: (10, 7)).
        xticks: tick labels placed at the horizontal positions
            (default: the positions themselves).
    """
    import matplotlib.pyplot as plt

    #Fall back to defaults only for the options the caller did not supply
    positions = kwargs['y_values'] if 'y_values' in kwargs else range(len(x_values))
    figure_size = kwargs['size'] if 'size' in kwargs else (10,7)
    tick_labels = kwargs['xticks'] if 'xticks' in kwargs else positions

    plt.figure(figsize=figure_size)
    plt.plot(positions, x_values)
    plt.title(title, fontsize='x-large', fontweight='demibold')
    plt.xlabel(x_lbl, fontweight='demibold')
    plt.ylabel(y_lbl, fontweight='demibold')
    plt.xticks(positions, labels=tick_labels, rotation='vertical')
    plt.grid(True, which='major', axis='y', alpha=0.6)
    plt.show()

#A function for concatenating column names together to form year-over-year column labels
def yoy_label_maker(dataframe):
    """Build 'A-B' labels from every pair of adjacent column names.

    Parameters:
        dataframe: DataFrame whose column labels are joined pairwise.

    Returns:
        A list of strings, one per adjacent column pair, e.g. columns
        [2003, 2004, 2005] give ['2003-2004', '2004-2005']. The list is empty
        when the frame has fewer than two columns.
    """
    #Materialise the column names once instead of on every loop iteration
    column_names = dataframe.columns.tolist()
    return [str(first)+'-'+str(second) for first, second in zip(column_names, column_names[1:])]

#Read both input files (tab-separated, UTF-8 encoded)
data = pd.read_csv(
    FULL_PATH,
    sep='\t',
    encoding='utf-8',
)
sales_data = pd.read_csv(
    os.path.join(PATH,'Months_on_Plan_Historical.csv'),
    sep='\t',
    encoding='utf-8',
)
len(data) #Row count of the first file, shown as the cell output

In [None]:
#Consolidate historical sales data
BASIC_PLAN_RATE = 10 #Basic plan rate in USD
historical_sales = sales_data*BASIC_PLAN_RATE #Sales per month per customer per year TIMES the Basic plan rate
df_hist_sales = historical_sales.sum().to_frame().transpose() #Column totals laid out as a single row
df_hist_sales.index = ['Plan'] #Name index
#Column labels as ints (presumably so they line up with the Customer_Since_Year labels in the division below)
df_hist_sales.columns = [int(col) for col in df_hist_sales.columns]

#Add cumulative customers for YOY stats
cust_since_yr = data.loc[:,['Customer_Since_Year', 'Plan']].groupby(['Customer_Since_Year']).count()
cust_since_yr = cust_since_yr.transpose() #Transpose from row to column
#Running total across the columns: each entry is that column's count plus all earlier ones.
#cumsum keeps the 'Plan' index and the column labels, and avoids positional [0] lookups on a label-indexed Series.
csy_2 = cust_since_yr.cumsum(axis=1)

avg_income_per_cust_per_yr = (df_hist_sales/csy_2).round(2) #Avg income per customer per year
avg_ipcpy = get_one_row(avg_income_per_cust_per_yr, row_name='Plan') #Pull one row for plotting
avg_cust_growth_per_year = get_one_row(cust_since_yr, row_name='Plan') #New (non-cumulative) customer count per Customer_Since_Year

In [None]:
#Plot everything (every series is trimmed with [0:-1], i.e. its last element is left out)

#Cumulative customers
lineplot(
    list(csy_2.loc['Plan',:])[0:-1],
    title='Customer Growth 2003-2022',
    size=(10,4),
    xticks=csy_2.columns.tolist()[0:-1],
    y_lbl='Customers (Cumulative)',
    x_lbl='',
)

#YOY change of cumulative customers
lineplot(
    yoy_pct_change(list(csy_2.loc['Plan',:]))[0:-1],
    title='Customer Growth (%) 2003-2022',
    size=(10,4),
    xticks=yoy_label_maker(csy_2)[0:-1],
    y_lbl='% of Change (YOY)',
    x_lbl='',
)

#Average revenue per customer
lineplot(
    avg_income_per_cust_per_yr.transpose()[0:-1],
    size=(10,4),
    title='Average Revenue per Customer per Year (USD)',
    y_lbl='USD',
    x_lbl='',
    y_values=cust_since_yr.columns.tolist()[0:-1],
)

#YOY change of new (non-cumulative) customers.
#The title names the series that is actually plotted here (avg_cust_growth_per_year), not revenue.
lineplot(
    yoy_pct_change(avg_cust_growth_per_year)[0:-1],
    title='Customer Growth (Non-Cumulative, %) 2003-2022',
    size=(10,4),
    xticks=yoy_label_maker(csy_2)[0:-1],
    y_lbl='% of Growth (YOY)',
    x_lbl='',
)

#New customers per year
lineplot(
    avg_cust_growth_per_year[0:-1],
    size=(10,4),
    y_values=cust_since_yr.columns.tolist()[0:-1],
    title='Customer Growth (Non-Cumulative) 2003-2022',
    y_lbl='Customers',
    x_lbl='',
)


In [None]:
#Basic Demographics
is_male = data['Sex'] == 'M'
is_female = data['Sex'] == 'F'

#Count the matching rows directly, rather than the non-null values of whichever column happens to be first
sex_m = is_male.sum()
sex_f = is_female.sum()

age_m = data.loc[is_male, 'Age']
age_f = data.loc[is_female, 'Age']

print('Male:', sex_m)
print('Female:', sex_f)
print('Male-to-Female Ratio:', round(sex_m/sex_f,2))
#Scale the shares by 100 so the printed numbers match their "%" labels
print('% of Total, Male:', round(sex_m/len(data)*100,2))
print('% of Total, Female:', round(sex_f/len(data)*100,2))
print('\n')
basic_stats(age_f, title='Female Customer Age Demographics')
print('\n')
basic_stats(age_m, title='Male Customer Age Demographics')

In [None]:
#Share of all rows whose Age falls in the "target demographic"
target_demo = data.query('Age > 19 and Age < 41') #Age strictly above 19 and strictly below 41
len(target_demo)/len(data.Age)

In [None]:
#Age distribution histograms (20 bins each): all customers, then split by sex

#Overall
quick_hist(
    data.Age,
    bins=20,
    title='Customer Age Distribution',
    x_lbl='Age',
    y_lbl='Count',
)

#Female Only
quick_hist(
    age_f,
    bins=20,
    title='Customer Age Distribution: Female',
    x_lbl='Age',
    y_lbl='Count',
)

#Male Only
quick_hist(
    age_m,
    bins=20,
    title='Customer Age Distribution: Male',
    x_lbl='Age',
    y_lbl='Count',
)