In [40]:
import pandas as pd
import numpy as np
import os

In [11]:
df = pd.read_csv('../data/restaurant_1_week_002.csv')
print(df.shape)
df.head()

(21, 6)


Unnamed: 0,Order Number,Order Date,Item Name,Quantity,Product Price,Total products
0,1388,2015-01-10 20:12:00,Onion Chutney,1,0.5,6
1,1388,2015-01-10 20:12:00,Mint Sauce,1,0.5,6
2,1388,2015-01-10 20:12:00,Lime Pickle,1,0.5,6
3,1388,2015-01-10 20:12:00,Paratha,1,2.95,6
4,1387,2015-01-10 16:55:00,Onion Bhaji,1,3.95,1


In [17]:
def extract(
    data_dir: str, 
    prefix: str, 
    start_week: int, 
    end_week: int) -> pd.DataFrame:
    """ Function to load and concat batch data.
    
    Parameters:
    -----------
    data_dir: str
        directory where data is located
    prefix: str
        restaurant name (ex. restaurant_1)
    start_week: int
        start week to load
    end_week: int
        end week to load
    
    Return: 
    df: pd.Dataframe
        Dataframe load
    """
    df = pd.DataFrame()
    for i in range(start_week, end_week+1):
        file_path = os.path.join(data_dir, "data", f'{prefix}_week_{i}.csv')
        if os.path.isfile(file_path):
            batch = pd.read_csv(file_path)
            df = pd.concat([df, batch], sort=True)
    return df

In [46]:
df_rest1 = extract(data_dir = '../', prefix = "restaurant_1", start_week = 108, end_week = 110)
df_rest2= extract(data_dir = '../', prefix = "restaurant_2", start_week = 108, end_week = 110)

In [47]:
df_rest1.columns = df_rest1.columns.str.lower().str.replace(" ", "_")
df_rest1['order_date'] = pd.to_datetime(df_rest1['order_date'])
df_rest1 = df_rest1.rename(columns={'order_number': 'order_id'})
df_rest1 = df_rest1.sort_values('order_date')

In [48]:
df_rest1['total_product_price'] = df_rest1['quantity'] * df_rest1['product_price']
df_rest1['cash_in'] = df_rest1.groupby('order_id')['total_product_price'].transform(np.sum)

In [49]:
df_rest1 = df_rest1.drop(columns=['item_name', 'quantity', 'product_price'
                      'total_products', 'total_product_price'],
             errors='ignore'
             )
df_rest1 = df_rest1.drop_duplicates()
df_rest1 = df_rest1.reset_index(drop=True)
df_rest1

Unnamed: 0,order_date,order_id,product_price,total_products,cash_in
0,2017-01-23 16:54:00,4347,2.95,7,44.65
1,2017-01-23 16:54:00,4347,8.95,7,44.65
2,2017-01-23 16:54:00,4347,3.95,7,44.65
3,2017-01-23 16:54:00,4347,11.95,7,44.65
4,2017-01-23 17:52:00,4348,9.95,6,29.00
...,...,...,...,...,...
879,2017-02-12 21:06:00,7616,8.95,3,17.75
880,2017-02-12 21:06:00,7616,1.95,3,17.75
881,2017-02-12 21:25:00,7617,4.95,4,19.75
882,2017-02-12 21:25:00,7617,1.95,4,19.75
