In [40]:
import pandas as pd
import numpy as np
import os

In [11]:
df = pd.read_csv('../data/restaurant_1_week_002.csv')
print(df.shape)
df.head()

(21, 6)


Unnamed: 0,Order Number,Order Date,Item Name,Quantity,Product Price,Total products
0,1388,2015-01-10 20:12:00,Onion Chutney,1,0.5,6
1,1388,2015-01-10 20:12:00,Mint Sauce,1,0.5,6
2,1388,2015-01-10 20:12:00,Lime Pickle,1,0.5,6
3,1388,2015-01-10 20:12:00,Paratha,1,2.95,6
4,1387,2015-01-10 16:55:00,Onion Bhaji,1,3.95,1


In [55]:
def extract(
    data_dir: str, 
    prefix: str, 
    start_week: int, 
    end_week: int) -> pd.DataFrame:
    """ Function to load and concat batch data.
    
    Parameters:
    -----------
    data_dir: str
        directory where data is located
    prefix: str
        restaurant name (ex. restaurant_1)
    start_week: int
        start week to load
    end_week: int
        end week to load
    
    Return: 
    df: pd.Dataframe
        Dataframe load
    """
    df = pd.DataFrame()
    for i in range(start_week, end_week+1):
        file_path = os.path.join(data_dir, "data", f'{prefix}_week_{i}.csv')
        if os.path.isfile(file_path):
            batch = pd.read_csv(file_path)
            df = pd.concat([df, batch], sort=True)
    return df

In [56]:
df_rest1 = extract(data_dir = '../', prefix = "restaurant_1", start_week = 108, end_week = 110)
df_rest2= extract(data_dir = '../', prefix = "restaurant_2", start_week = 108, end_week = 110)

In [57]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    df['order_date'] = pd.to_datetime(df['order_date'])
    df = df.rename(columns={'order_number': 'order_id'})
    df = df.sort_values('order_date')
    df['total_product_price'] = df['quantity'] * df['product_price']
    df['cash_in'] = df.groupby('order_id')['total_product_price'].transform(np.sum)
    df = df.drop(columns=['item_name', 'quantity', 'product_price'
                      'total_products', 'total_product_price'],
             errors='ignore'
             )
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df

In [58]:
df_rest1_clean = clean(df_rest1)
df_rest2_clean = clean(df_rest2) 

In [60]:
df_rest2_clean

Unnamed: 0,order_date,order_id,product_price,total_products,cash_in
0,2017-01-23 16:01:00,7814,2.95,3,35.75
1,2017-01-23 16:01:00,7814,9.95,3,35.75
2,2017-01-23 17:33:00,7815,4.95,6,37.70
3,2017-01-23 17:33:00,7815,6.95,6,37.70
4,2017-01-23 17:33:00,7815,2.95,6,37.70
...,...,...,...,...,...
1492,2017-02-12 21:40:00,13199,2.50,6,43.25
1493,2017-02-12 21:40:00,13199,5.95,6,43.25
1494,2017-02-12 21:40:00,13199,7.95,6,43.25
1495,2017-02-12 21:40:00,13199,3.95,6,43.25
