In [None]:
#IMPORT LIBRARY
import pandas as pd
import numpy as np
import json

from bokeh.plotting import figure
from bokeh.palettes import Category20, Set3, Spectral
from bokeh.core.properties import value
from bokeh.io import show, output_file,output_notebook
from bokeh.models import ColumnDataSource, FactorRange, HoverTool, ColumnDataSource, Panel, Tabs,NumeralTickFormatter, Legend 
from bokeh.layouts import gridplot, column, row, WidgetBox, layout
from bokeh.models.widgets import RadioButtonGroup, Dropdown


In [None]:
# Load the two receipt exports and flatten them into one row per purchased item.
colorSet = ['#e6194B','#f58231','#ffe119','#bfef45','#aaffc3','#42d4f4','#4363d8', '#000075','#4363d8','#e6beff','#911eb4','#f032e6']

with open('./dat/grocerylist.json') as data_file:
    grocerydata = json.load(data_file)

with open('./dat/moregrocerylist.json') as data_file2:
    grocerydata2 = json.load(data_file2)

groceryTable = []
names = []
prices = []
itemNames = []
categories = []
dates = []
nameSet = set()
categorySet = set()


def _append_receipts(receipts_by_person, skip_dates=frozenset()):
    """Append every item of every receipt to the module-level column lists.

    receipts_by_person maps a person name to a mapping of date -> list of
    items; each item is indexed as [0] item name, [1] price, [2] category.
    Receipts whose date is in skip_dates are ignored as a whole.
    """
    for name, receiptList in receipts_by_person.items():
        nameSet.add(name)
        for date, receipt in receiptList.items():
            if date in skip_dates:
                continue
            for item in receipt:
                category = item[2]
                if category is not None:
                    # Normalise case so 'PRODUCE' and 'produce' collapse to 'Produce'.
                    category = category.lower().capitalize()
                names.append(name)
                # Stored price is divided by 100 (presumably cents -> dollars; confirm with the data).
                prices.append(item[1] / 100)
                itemNames.append(item[0])
                categories.append(category)
                categorySet.add(category)
                dates.append(date)


_append_receipts(grocerydata)

# The second file overlaps the first, so receipts dated the same as one already
# loaded are skipped. The dates are snapshotted BEFORE loading: testing against
# the growing `dates` list per item would drop every item after the first one of
# each new receipt, because that first append makes the date look "already seen".
datesAlreadyLoaded = set(dates)
_append_receipts(grocerydata2, skip_dates=datesAlreadyLoaded)

groceryDict = {'Name' : names, 'Item Name' : itemNames, 'Price': prices, 'Category' :categories, 'Date': dates}

# Create the dataframe from the flattened columns.
grocery_df = pd.DataFrame(groceryDict)
num_rows = grocery_df.shape[0]
print(num_rows)
grocery_df['Date'] = pd.to_datetime(grocery_df['Date'])
# Fold spelling variants of the same category into a single label.
grocery_df['Category'] = grocery_df['Category'].replace({
    '': 'None',
    'Gen merchandise': 'Miscellaneous',
    'Groc nonedible': 'Miscellaneous',
    'Refrig/frozen': 'Refrig/Frozen',
    '-refrig/frozen': 'Refrig/Frozen',
})
grocery_df['Month'] = grocery_df['Date'].dt.to_period('M')
grocery_df = grocery_df.fillna(value='None')
# Placeholders; the fuzzy-matching cell overwrites both columns.
grocery_df['Match'] = 0
grocery_df['Match Val'] = 0
pd.set_option('display.max_rows', None)

#grocery_df





In [None]:
from fuzzywuzzy import fuzz

# For every row, find the OTHER row whose item name has the highest
# fuzz.partial_ratio against it. Ties keep the earliest candidate because the
# comparison is strict. A row with no other candidate keeps ratio -1 / index -1.
ratio_list = []
index_list = []

labelled_names = list(zip(grocery_df.index, grocery_df['Item Name']))

for index, item_name in labelled_names:
    print(index, end=', ')
    best_ratio, best_idx = -1, -1
    for sub_index, sub_item in labelled_names:
        if sub_index != index:
            # try to find item_name in each sub_item
            ratio = fuzz.partial_ratio(item_name, sub_item)
            if ratio > best_ratio:
                best_ratio, best_idx = ratio, sub_index
    ratio_list.append(best_ratio)
    index_list.append(best_idx)

print(num_rows)
grocery_df['Match Val'] = ratio_list
grocery_df['Match'] = index_list
grocery_df

    

In [None]:
# Replace each item name with a shared spelling chosen from the row itself or
# from its best fuzzy match (columns 'Match' / 'Match Val').
name_list = []
row_fields = zip(grocery_df.index, grocery_df['Item Name'], grocery_df['Match'], grocery_df['Match Val'])

for index, item_name, match_idx, match_val in row_fields:
    # Partner row = the row this one matched best against.
    other_name = grocery_df.loc[match_idx, 'Item Name']
    other_ratio = grocery_df.loc[match_idx, 'Match Val']
    other_idx = grocery_df.loc[match_idx, 'Match']

    if match_val < 75:
        # ::: threshold value ::: below it the match is ignored.
        chosen = item_name
    elif other_idx == index:
        # Mutual best match: both rows adopt the name of the lower-indexed one.
        chosen = item_name if index < match_idx else other_name
    elif match_val == 100:
        chosen = item_name
    elif other_ratio >= match_val:
        # The partner's own best match is at least as strong: take its name.
        chosen = other_name
    else:
        # Placeholder written when none of the rules above applies.
        chosen = '%%%%%%%'
    name_list.append(chosen)

grocery_df['Item Name'] = name_list
grocery_df

In [None]:
# Run to get an idea of the FuzzyWuzzy ratios: one bar per row, highest first.

# sorted() builds a new list. Sorting through an alias (`sortedList = ratio_list`
# followed by `.sort()`) would reorder ratio_list itself, so it would no longer
# line up with the DataFrame rows if anything reads it afterwards.
sortedList = sorted(ratio_list, reverse=True)
print(sortedList)

# A categorical x-range needs string factors; one label per bar keeps x and top
# the same length.
r = [str(position) for position in range(len(sortedList))]

p = figure(x_range=r, plot_width=1000, y_axis_label="Ratio")
p.vbar(x=r, top=sortedList, width=0.9)
show(p)

In [None]:
# Group the rows by person. Each entry is
# [item name, price, category, date, month label formatted as '%b-%Y'].
perPersonDict = dict()
for index, row in grocery_df.iterrows():
    rowList = [row['Item Name'], row['Price'], row['Category'], row['Date'], row['Month'].strftime('%b-%Y')]
    perPersonDict.setdefault(row['Name'], []).append(rowList)

# Every known person gets an entry, including people without any rows.
perPersonCategorySpending = {name: dict() for name in nameSet}
perPersonTopTenDict = {name: dict() for name in nameSet}

# Frequency and total spending per item: person -> item name -> [count, total].
for person, receiptItems in perPersonDict.items():
    for item in receiptItems:
        tally = perPersonTopTenDict[person].setdefault(item[0], [0, 0])
        tally[0] += 1
        tally[1] += item[1]

# Spending per category per month: person -> month label -> category -> total.
cleanCategorySet = set()
for person, receiptItems in perPersonDict.items():
    for item in receiptItems:
        price, category, monthLabel = item[1], item[2], item[4]
        # Record the category unconditionally. Registering it only when the
        # month already exists misses categories that are only ever the first
        # item of a new month, and those are then dropped from the charts.
        cleanCategorySet.add(category)
        monthSpending = perPersonCategorySpending[person].setdefault(monthLabel, dict())
        monthSpending[category] = monthSpending.get(category, 0) + price

# Aggregate across months: person -> category -> total.
perPersonCategoryTotalSpending = dict()
for person, perMonth in perPersonCategorySpending.items():
    personTotals = dict()
    for monthLabel, monthSpending in perMonth.items():
        for cate, spent in monthSpending.items():
            personTotals[cate] = personTotals.get(cate, 0) + spent
    perPersonCategoryTotalSpending[person] = personTotals

In [None]:
#output_file("grocery_receipt_dashboard.html")

# Reshape the per-person totals into column-oriented dicts for Bokeh:
# category -> one value per person, in nameList order.
nameList = list(nameSet)
cleanCategoryList = list(cleanCategorySet)

# Look the column position up by name instead of counting dict iterations, so
# a value can never land in another person's column if the two orders differ.
namePosition = {name: position for position, name in enumerate(nameList)}

reformatDict = {cate: [0]*len(nameList) for cate in cleanCategoryList}
totals = [0]*len(nameList)
totalDict = dict()
for name, cateDict in perPersonCategoryTotalSpending.items():
    i = namePosition[name]
    for cate, spent in cateDict.items():
        if cate in reformatDict:
            reformatDict[cate][i] = spent
            totals[i] += spent
        # Household-wide total per category, kept as a one-element list.
        if cate in totalDict:
            totalDict[cate][0] += spent
        else:
            totalDict[cate] = [spent]

# Share of each person's own spending that went to each category (0..1).
percentDict = {cate: [0]*len(nameList) for cate in cleanCategoryList}
percentTotal = [0]*len(nameList)
for name, cateDict in perPersonCategoryTotalSpending.items():
    i = namePosition[name]
    if not totals[i]:
        # Nothing spent: leave the zeros in place instead of dividing by zero.
        continue
    for cate, spent in cateDict.items():
        if cate in percentDict:
            share = spent/totals[i]
            percentDict[cate][i] = share
            percentTotal[i] += share

# Same figures as totalDict, held in independent lists.
percentTotalDict = {cate: list(amounts) for cate, amounts in totalDict.items()}

reformatDict['Name'] = nameList
percentDict['Name'] = nameList
reformatedData = list(reformatDict.values())
percentreformatedData = list(percentDict.values())

#print(reformatedData)
#print(percentreformatedData)
# print(len(reformatedData))
# print(len(percentreformatedData))

In [None]:
# Three tabs: household total, total per person, and per-person percentage
# split, each stacked by category.
output_file("total_group_barchart.html")

# vbar_stack needs exactly one colour per stacked category. The 12-colour base
# palette is repeated or truncated to fit, so the chart still renders when the
# data does not contain exactly 12 categories.
basePalette = ['#2E0854'] + list(Spectral[11])
colors = [basePalette[k % len(basePalette)] for k in range(len(cleanCategoryList))]
print(totalDict)

# x value for the single "Household" bar.
totalDict['Household'] = ['Household']


def _category_tooltips(prefix, number_format):
    """Return one (category, '@{category}{format}') tooltip per category, last category first."""
    return [(cate, prefix + "@{" + cate + "}{" + number_format + "}") for cate in reversed(cleanCategoryList)]


def _add_muting_legend(fig, renderers):
    """Attach a legend to the left of fig, one entry per stacked renderer; clicking an entry mutes it."""
    legend_items = [(cate, [glyph]) for cate, glyph in zip(cleanCategoryList, renderers)]
    # Reversed so the legend lists the categories in the opposite order to cleanCategoryList.
    legend_items.reverse()
    new_legend = Legend(items=legend_items, location=(0, 100))
    new_legend.click_policy = "mute"
    fig.add_layout(new_legend, 'left')
    return new_legend


tooltips = _category_tooltips("$", "0.2f")

p1 = figure(x_range=["Household"], plot_width=600, plot_height=500, title = "Total Group Spending", tooltips=tooltips)
v = p1.vbar_stack(cleanCategoryList, x='Household', width=0.9, source=totalDict, color=colors)
legend = _add_muting_legend(p1, v)
tab1 = Panel(child= p1, title="Spending Per Category")

p2 = figure(x_range=nameList, plot_width=600, plot_height=500, title = "Total Spending Per Person", tooltips=tooltips)
v2 = p2.vbar_stack(cleanCategoryList, x='Name', width=0.9, source=reformatDict, color=colors)
legend2 = _add_muting_legend(p2, v2)
# Distinct label: this tab previously shared tab1's title, so the two could not be told apart.
tab2 = Panel(child= p2, title="Spending Per Person")

# By percent graph
tooltips = _category_tooltips("", "0.1f%")
p3 = figure(x_range=nameList, plot_width=600, plot_height=500, title = "Percent Allocated Per Category",tooltips=tooltips)
v3 = p3.vbar_stack(cleanCategoryList, x='Name', width=0.9, source=percentDict, color=colors)
p3.yaxis.formatter = NumeralTickFormatter(format='0%')
legend3 = _add_muting_legend(p3, v3)
tab3 = Panel(child=p3, title="Category by Percent")

tabs = Tabs(tabs=[ tab1, tab2, tab3 ])

show(tabs)

In [None]:
# One tab per person: monthly spending, stacked by category.
output_file("grocery_monthly_per_person.html")

# vbar_stack needs exactly one colour per stacked category. The 12-colour base
# palette is repeated or truncated to fit the number of categories in the data.
basePalette = ['#2E0854'] + list(Spectral[11])
colors = [basePalette[k % len(basePalette)] for k in range(len(cleanCategoryList))]
print(colors)


def _month_sort_key(monthLabel):
    """Parse a '%b-%Y' month label so months sort chronologically rather than by first appearance."""
    return pd.to_datetime(monthLabel, format='%b-%Y')


factors = []             # every (person, month label) pair
perPersonFactors = dict()  # person -> that person's (person, month label) pairs
perPersonLists = dict()    # person -> category -> one amount per month
for name, perMonth in perPersonCategorySpending.items():
    months = sorted(perMonth, key=_month_sort_key)
    personFactors = [(name, month) for month in months]
    factors.extend(personFactors)
    if personFactors:
        perPersonFactors[name] = personFactors

    # Start every category at 0 for every month, then fill in what was spent.
    amounts = {c: [0] * len(months) for c in cleanCategoryList}
    for position, month in enumerate(months):
        for cate, spent in perMonth[month].items():
            amounts[cate][position] = spent
    perPersonLists[name] = amounts

figureListMonths = []
# Left empty here: the figures once built into it were never displayed, and the
# top-ten cell creates its own list under this name.
figureListTop = []
tablist = []

print(perPersonLists)

# The hover text is the same for every person, so build it once.
tooltips = [(cate, "$@{"+cate+"}{0.2f}") for cate in reversed(cleanCategoryList)]

for name, perPersonData in perPersonLists.items():
    wow = [month for _, month in perPersonFactors.get(name, [])]
    if not wow:
        # Person with no purchases at all: nothing to plot, so no tab.
        continue
    perPersonData['x'] = wow

    sources = ColumnDataSource(data = perPersonData)

    monthFigure = figure(x_range=wow,  plot_width=600, plot_height=500, title = "By Category Spending Habits", y_axis_label = 'Money Spent($)', tooltips=tooltips)
    v = monthFigure.vbar_stack(cleanCategoryList, x='x', width=0.9, source=sources, color=colors)
    monthFigure.xaxis.major_label_orientation = 20

    # Legend entries are listed in the opposite order to cleanCategoryList.
    legend_it = [(catename, [vitem]) for catename, vitem in zip(cleanCategoryList, v)]
    legend_it.reverse()

    legend = Legend(items=legend_it, location=(0, 100))
    legend.click_policy="mute"
    monthFigure.add_layout(legend, 'left')

    figureListMonths.append(monthFigure)
    tablist.append(Panel(child= monthFigure, title= name))

tabs = Tabs(tabs= tablist)

show(tabs)

In [None]:
# One tab per person: the ten most frequently purchased items and what was
# spent on them.
output_file("grocery_top_ten_per_person.html")

newDict = dict()        # person -> [(item name, [count, total spent]), ...]
newDictSorted = dict()  # same entries, most frequently bought first
for name, itemList in perPersonTopTenDict.items():
    merged = dict()
    for itemName, infoList in itemList.items():
        itemName = itemName.strip()
        # Skip lines that are not products.
        if (itemName == "TAX" or itemName == "CRV" or "DEMP" in itemName):
            continue
        if itemName in merged:
            # Names that differ only by surrounding whitespace are the same
            # item; kept separate they become duplicate x-axis factors, which a
            # categorical range rejects.
            merged[itemName][0] += infoList[0]
            merged[itemName][1] += infoList[1]
        else:
            # Copy so the merge never alters perPersonTopTenDict.
            merged[itemName] = list(infoList)

    newDict[name] = list(merged.items())
    newDictSorted[name] = sorted(newDict[name], key = lambda entry: entry[1][0], reverse = True)

topTenDictSorted = {name: items[:10] for name, items in newDictSorted.items()}

print(topTenDictSorted)


figureListTop = []
tablistTop = []

tooltips = [('Money Spent', '@s{$0.2f}'), ('# of Times Purchased', '@f')]

for name, items in topTenDictSorted.items():
    itemNameList = [entry[0] for entry in items]
    freqList = [entry[1][0] for entry in items]
    costList = [entry[1][1] for entry in items]

    source = ColumnDataSource(data=dict(x=itemNameList, s=costList, f = freqList))

    topFigure = figure(x_range=itemNameList,  plot_width=600, plot_height=500, title = "Top Ten Purchased Items", y_axis_label = 'Money Spent($)', tooltips=tooltips)
    topFigure.vbar(x='x', top ='s', width=0.9, source=source)
    topFigure.xaxis.major_label_orientation = 20

    figureListTop.append(topFigure)
    tablistTop.append(Panel(child= topFigure, title= name))

tabs = Tabs(tabs= tablistTop)

show(tabs)

