In [1]:
# ! pip install pandas
# ! pip install 'thefuzz[speedup]'

In [2]:
import pandas as pd
from thefuzz import fuzz
from thefuzz import process

In [3]:
df_price = pd.read_csv('seeds/fruit_prices_fact.csv')
df_price

Unnamed: 0,fruit_name,cost
0,banana,1
1,apple,3
2,kiwi,5
3,pear,4
4,kiwi,5
5,pawpaw,7
6,watermelon,12


In [4]:
df_input = pd.read_csv('seeds/fruit_user_input.csv')
df_input

Unnamed: 0,fruit_user_input,quantity,user_name
0,apple,2,doug
1,anana,1,doug
2,kiwi,1,amy
3,a pull,1,amy
4,kiwee,2,anders
5,papaw,2,anders
6,pumpkin,1,anders
7,watrmln,1,ripu
8,apple,5,ripu
9,pere,1,azzam


In [5]:
def custom_scorer(string):
    '''
    for a given string
    return the best match out of the `fruit_name` column in the df_to table
    '''
    
    x = process.extractOne(string,df_price["fruit_name"], score_cutoff=60)
    
    if x is not None:
        return x[0]
    else:
        return None

In [6]:
df_final = (df_input
           # make new col, `fruit_name`, with best match against actual table
           .assign(fruit_name = lambda df: df['fruit_user_input'].apply(custom_scorer))
           # join the actual fruit price table
           .merge(df_price, on="fruit_name")
           # # calculate subtotal
           .assign(total= lambda df: df.quantity * df.cost)
           # # find total for each user and sort descending by total price
           .groupby("user_name")['total'].sum()
           .reset_index()
           .sort_values("total", ascending=False)
          )
df_final

Unnamed: 0,user_name,total
1,anders,34
4,ripu,27
0,amy,10
3,doug,7
2,azzam,4
