This notebook is used if we already have a set of labelled data and don't need to generate a train/test set. Instead, we'll divide our labelled data into train/test pieces and save a new file of remaining iems

Expected input: 
- Labelled data

Expected output: 
- train set/ test set csv
- remaining items to sample from

In [13]:
import pandas as pd

In [14]:
# This is the data we have
df = pd.read_csv('redshift_query_results_labelled_entrees.csv')
df.loc[df['Protein']=='Ao', 'Protein']='AO'
df

Unnamed: 0,Orders Pandemic Period,Orders Item Name,Orders Cleansed Tier 1,Orders Cleansed Tier 2,Orders Cleansed Tier 3,Orders Cleansed Tier 4,Orders Avg Cheque,Orders Item Net Sales,Orders Item Quantity,Orders Transactions,Protein
0,2_During Pandemic,Open Food,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,3.2248,1912987.72,733208,593197,AO
1,2_During Pandemic,Salad Bar,Food,Entree,Salad,Salad,4.0620,1765036.64,435703,434516,AO
2,2_During Pandemic,Beef Cheeseburger,Food,Entree,Burger,Burger,3.2802,1405287.22,427905,428414,Beef
3,2_During Pandemic,Chk Tenders,Food,Entree,Chicken Tenders,Chicken Tenders,2.8701,1075462.59,393789,374703,Chicken
4,2_During Pandemic,Salad Bar By Weight,Food,Entree,Salad,Salad,4.3011,1069387.43,249570,248627,AO
...,...,...,...,...,...,...,...,...,...,...,...
63630,3_Recovery Pandemic,Turkey,Food,Entree,Other Poultry Entree,Other Poultry Entree,-0.1018,-3170.49,31813,31123,Turkey
63631,3_Recovery Pandemic,Italian B.M.T,Food,Entree,Sandwich/Wrap,Sandwich/Wrap,-0.4302,-4834.34,11236,11236,AO
63632,3_Recovery Pandemic,BYO HandKrafted Burger,Food,Entree,Burger,Burger,-1.3697,-7596.46,5556,5546,Beef
63633,3_Recovery Pandemic,Voucher $8,Food,Entree,Other Entree,Other Entree,-8.8302,-8017.90,898,908,AO


In [15]:
# This is the format we need our data in
df_ref = pd.read_csv('classifier/protein_attribution/data/protein_attribution_label_set_round1_n100.csv')
df_ref

Unnamed: 0,tier_1,tier_2,tier_3,tier_4,lineitem_name,sales_amt_gross,label,which_set
0,Food,Entree,Pizza,Pizza,TRA - Pepperoni Pizza,278335.94,,train
1,Food,Entree,Salad,Salad,MEX - Meat Taco Salad,139089.16,,train
2,Food,Entree,Seafood Entree,Seafood Entree,Baked Fish,33076.53,,train
3,Food,Entree,Poultry Entree,Poultry Entree,TERP-GKE-Teriyaki_Plate Chx Spicey,117863.03,,train
4,Food,Entree,Salad,Salad,Chicken Salad Kit,791.70,,train
...,...,...,...,...,...,...,...,...
113,Food,Entree,Tacos,Tacos,pork chile verde tacos (2ea),130.86,,test
114,Food,Entree,Tacos,Tacos,FW  Gringos  3 Tacos,50.00,,test
115,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,Egg and Cheddar Biscuit,36.00,,test
116,Food,Entree,Poultry Entree,Poultry Entree,herb roasted halal chicken cobb,132.81,,test


In [16]:
list(df)

[' Orders Pandemic Period',
 ' Orders Item Name',
 ' Orders Cleansed Tier 1',
 ' Orders Cleansed Tier 2',
 ' Orders Cleansed Tier 3',
 ' Orders Cleansed Tier 4',
 ' Orders Avg Cheque',
 ' Orders Item Net Sales',
 ' Orders Item Quantity',
 ' Orders Transactions',
 'Protein']

In [17]:
cols = list(df_ref)

# Map any equivalent columns we can seee
col_maps= {
    ' Orders Item Name' : 'lineitem_name',
    ' Orders Cleansed Tier 1' : 'tier_1',
    ' Orders Cleansed Tier 2' : 'tier_2',
    ' Orders Cleansed Tier 3' : 'tier_3',
    ' Orders Cleansed Tier 4' : 'tier_4',
    ' Orders Item Net Sales': 'sales_amt_gross',
    'Protein': 'label'
}
df.rename(columns=col_maps, inplace=True)

for c in cols:
    if c not in list(df):
        df[c]=''

df = df[cols]
df


Unnamed: 0,tier_1,tier_2,tier_3,tier_4,lineitem_name,sales_amt_gross,label,which_set
0,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,Open Food,1912987.72,AO,
1,Food,Entree,Salad,Salad,Salad Bar,1765036.64,AO,
2,Food,Entree,Burger,Burger,Beef Cheeseburger,1405287.22,Beef,
3,Food,Entree,Chicken Tenders,Chicken Tenders,Chk Tenders,1075462.59,Chicken,
4,Food,Entree,Salad,Salad,Salad Bar By Weight,1069387.43,AO,
...,...,...,...,...,...,...,...,...
63630,Food,Entree,Other Poultry Entree,Other Poultry Entree,Turkey,-3170.49,Turkey,
63631,Food,Entree,Sandwich/Wrap,Sandwich/Wrap,Italian B.M.T,-4834.34,AO,
63632,Food,Entree,Burger,Burger,BYO HandKrafted Burger,-7596.46,Beef,
63633,Food,Entree,Other Entree,Other Entree,Voucher $8,-8017.90,AO,


In [18]:
# Lets split out test and train datasets

df_ss = df.copy(deep=True)
df_ss = df_ss[['lineitem_name', 'sales_amt_gross']]
df_ss = df_ss[df_ss['sales_amt_gross'] > df_ss['sales_amt_gross'].quantile(.1)] # Generate the test set on the top 90% of data
df_samples = df_ss.sample(frac=0.75, random_state=7)

df_out = df.merge(df_samples['lineitem_name'], on='lineitem_name', how='left', indicator=True)
df_out.loc[df_out['_merge']=='left_only', 'which_set']='test'
df_out.loc[df_out['_merge']=='both', 'which_set']='train'
df_out = df_out[list(df)]

df_out

Unnamed: 0,tier_1,tier_2,tier_3,tier_4,lineitem_name,sales_amt_gross,label,which_set
0,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,Open Food,1912987.72,AO,train
1,Food,Entree,Salad,Salad,Salad Bar,1765036.64,AO,train
2,Food,Entree,Salad,Salad,Salad Bar,1765036.64,AO,train
3,Food,Entree,Salad,Salad,Salad Bar,1765036.64,AO,train
4,Food,Entree,Burger,Burger,Beef Cheeseburger,1405287.22,Beef,train
...,...,...,...,...,...,...,...,...
109322,Food,Entree,Other Poultry Entree,Other Poultry Entree,Turkey,-3170.49,Turkey,train
109323,Food,Entree,Sandwich/Wrap,Sandwich/Wrap,Italian B.M.T,-4834.34,AO,test
109324,Food,Entree,Burger,Burger,BYO HandKrafted Burger,-7596.46,Beef,test
109325,Food,Entree,Other Entree,Other Entree,Voucher $8,-8017.90,AO,train


In [19]:
df_out.to_csv('classifier/protein_attribution/data/protein_attribution_amy_set_labelled.csv')
# Update remaining items as well