In [31]:
import pandas as pd
import numpy as np
import os

In [7]:
def extract(data_dir, prefix, start_week, end_week):
    """Extract a temporal slice of data for a given data source.

    Reads every ``<data_dir>/data/<prefix>_week_<i>.csv`` file for ``i`` in
    ``[start_week, end_week]`` and stacks them into a single dataframe.
    Weeks whose file does not exist are silently skipped.

    Parameters
    ----------
    data_dir: str
        Data directory path.
    prefix: str
        Data source identification (e.g. restaurant_1)
    start_week: int
        First week number (included)
    end_week: int
        Last week number (included)

    Returns
    -------
    pandas.DataFrame
        Rows of all the files found, in week order, each row keeping the
        index it had in its own file, with columns sorted by name.
        An empty dataframe when no file exists in the requested range.
    """
    # Collect the weekly batches first and concatenate once: growing a
    # dataframe with concat inside the loop copies all previous rows at every
    # iteration and relies on concatenating with an empty frame.
    batches = []

    for i in range(start_week, end_week + 1):
        file_path = os.path.join(data_dir, 'data', f'{prefix}_week_{i}.csv')

        if os.path.isfile(file_path):
            batches.append(pd.read_csv(file_path))

    # pd.concat raises on an empty list, so handle "nothing found" explicitly.
    if not batches:
        return pd.DataFrame()

    df = pd.concat(batches, sort=True)

    # concat only sorts columns when the batches are not already aligned;
    # sort explicitly so the column order never depends on the input files.
    return df.sort_index(axis=1)

def clean(df):
    """Clean raw order lines and aggregate the revenue of each order.

    The input dataframe is left untouched: all the work is done on a copy.

    Parameters
    ----------
    df: pandas.DataFrame
        Raw order lines. Once column names are normalised (lower case,
        spaces replaced by underscores) it must provide ``order_date``,
        ``order_number``, ``quantity`` and ``product_price``.

    Returns
    -------
    pandas.DataFrame
        Dataframe sorted by ``order_date`` with a fresh 0..n-1 index, where
        ``order_number`` is renamed ``order_id``, ``cash_in`` holds the sum
        of ``quantity * product_price`` over each order, the item-level
        columns are removed and duplicated rows are dropped.
    """
    # Copy first: renaming the columns and converting the dates below would
    # otherwise modify the caller's dataframe as a side effect.
    df = df.copy()

    df.columns = df.columns.str.lower().str.replace(' ', '_')
    df['order_date'] = pd.to_datetime(df['order_date'])
    df = df.rename(columns={'order_number': 'order_id'})
    df = df.sort_values('order_date')

    # Revenue of each line, then broadcast the per-order total on every line
    # of that order. The 'sum' alias is used instead of passing np.sum, which
    # recent pandas versions deprecate for groupby transforms.
    df['total_product_price'] = df['quantity'] * df['product_price']
    df['cash_in'] = df.groupby('order_id')['total_product_price'].transform('sum')

    # Drop item-level columns (some may be absent, hence errors="ignore") so
    # that the lines of an order become identical and can be deduplicated.
    df = df.drop(columns=['item_name', 'quantity', 'product_price',
                          'total_products', 'total_product_price'],
                 errors="ignore")
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df

In [38]:
# Project root containing the `data/` folder. Override it with the DATA_DIR
# environment variable so the notebook is not tied to one machine's home
# directory; the default keeps the original location.
DATA_DIR = os.environ.get("DATA_DIR", "/Users/CORENTIN/data-corentinv/tp-intro-mlops")

# Load weeks 108 to 110 (both included) of the restaurant_1 data source.
df = extract(data_dir=DATA_DIR, prefix="restaurant_1",
             start_week=108, end_week=110)

In [39]:
# Preview the first rows of the extracted dataframe.
df.head()

Unnamed: 0,Item Name,Order Date,Order Number,Product Price,Quantity,Total products
0,Mango Chutney,2017-01-28 19:14:00,4416,0.5,3,4
1,Mango Chutney,2017-01-28 18:41:00,4414,0.5,1,13
2,Mint Sauce,2017-01-28 18:41:00,4414,0.5,1,13
3,Mango Chutney,2017-01-28 18:28:00,4413,0.5,1,6
4,Mango Chutney,2017-01-28 12:32:00,4402,0.5,1,10


In [41]:
# The output recorded for this cell has the cleaned schema (order_date,
# order_id, cash_in), but no remaining cell calls clean(). Apply it here so a
# fresh Restart & Run All reproduces that output. The result gets its own
# name so the cell can be re-run without cleaning an already-cleaned frame.
df_clean = clean(df)
df_clean

Unnamed: 0,order_date,order_id,cash_in
0,2017-01-23 16:54:00,4347,44.65
1,2017-01-23 17:52:00,4348,29.00
2,2017-01-23 18:15:00,4349,26.30
3,2017-01-23 19:11:00,4350,22.70
4,2017-01-23 19:12:00,4351,26.70
...,...,...,...
204,2017-02-12 20:04:00,7613,39.65
205,2017-02-12 20:37:00,7614,34.85
206,2017-02-12 20:51:00,7615,16.35
207,2017-02-12 21:06:00,7616,17.75
