In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
bakery_data = pd.read_csv('../data/raw_data/french_bakery_sales.csv')
display(bakery_data)

Unnamed: 0.1,Unnamed: 0,date,time,ticket_number,article,Quantity,unit_price
0,0,2021-01-02,08:38,150040.0,BAGUETTE,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,PAIN AU CHOCOLAT,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,PAIN AU CHOCOLAT,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,PAIN,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,TRADITIONAL BAGUETTE,5.0,"1,20 €"
...,...,...,...,...,...,...,...
234000,511387,2022-09-30,18:52,288911.0,COUPE,1.0,"0,15 €"
234001,511388,2022-09-30,18:52,288911.0,BOULE 200G,1.0,"1,20 €"
234002,511389,2022-09-30,18:52,288911.0,COUPE,2.0,"0,15 €"
234003,511392,2022-09-30,18:55,288912.0,TRADITIONAL BAGUETTE,1.0,"1,30 €"


In [3]:
def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates a Copy of the Original DataFrame. Formats the DataFrame column names to lowercase and formats the 'Item' column values to lowercase
    with underscores instead of spaces.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The modified DataFrame with lowercase column names and formatted 'Item' values.
    """
    # Create a copy of the original DataFrame
    formatted_df = df.copy()

    # Rename columns to lowercase
    formatted_df.columns = formatted_df.columns.str.lower()

    # Format 'Item' column values to lowercase with underscores
    formatted_df['article'] = formatted_df['article'].str.lower().str.replace(' ', '_')

    return formatted_df

In [4]:
format_dataframe(bakery_data)

Unnamed: 0,unnamed: 0,date,time,ticket_number,article,quantity,unit_price
0,0,2021-01-02,08:38,150040.0,baguette,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,pain_au_chocolat,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,pain_au_chocolat,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,pain,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,traditional_baguette,5.0,"1,20 €"
...,...,...,...,...,...,...,...
234000,511387,2022-09-30,18:52,288911.0,coupe,1.0,"0,15 €"
234001,511388,2022-09-30,18:52,288911.0,boule_200g,1.0,"1,20 €"
234002,511389,2022-09-30,18:52,288911.0,coupe,2.0,"0,15 €"
234003,511392,2022-09-30,18:55,288912.0,traditional_baguette,1.0,"1,30 €"


In [5]:
bakery_data = format_dataframe(bakery_data)
bakery_data

Unnamed: 0,unnamed: 0,date,time,ticket_number,article,quantity,unit_price
0,0,2021-01-02,08:38,150040.0,baguette,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,pain_au_chocolat,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,pain_au_chocolat,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,pain,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,traditional_baguette,5.0,"1,20 €"
...,...,...,...,...,...,...,...
234000,511387,2022-09-30,18:52,288911.0,coupe,1.0,"0,15 €"
234001,511388,2022-09-30,18:52,288911.0,boule_200g,1.0,"1,20 €"
234002,511389,2022-09-30,18:52,288911.0,coupe,2.0,"0,15 €"
234003,511392,2022-09-30,18:55,288912.0,traditional_baguette,1.0,"1,30 €"


In [6]:
bakery_data['article'].unique()

array(['baguette', 'pain_au_chocolat', 'pain', 'traditional_baguette',
       'croissant', 'banette', 'banettine', 'special_bread', 'coupe',
       'sand_jb_emmental', 'kouign_amann', 'boule_200g', 'boule_400g',
       'gal_frangipane_6p', 'campagne', 'moisson', 'cafe_ou_eau',
       'brioche', 'cereal_baguette', 'seigle', 'complet',
       'divers_patisserie', 'gal_frangipane_4p', 'cookie', 'ficelle',
       'pain_aux_raisins', 'gal_pomme_6p', 'gal_pomme_4p', 'financier_x5',
       'vik_bread', 'divers_viennoiserie', 'gache', 'sandwich_complet',
       'pain_banette', 'grand_far_breton', 'quim_bread',
       'special_bread_kg', 'gd_kouign_amann', 'boule_polka',
       'demi_baguette', 'chausson_aux_pommes', 'baguette_graine',
       'divers_confiserie', 'sucette', 'divers_boulangerie',
       'boisson_33cl', 'pates', 'formule_sandwich', 'divers_sandwichs',
       'croissant_amandes', 'pain_choco_amandes', 'sachet_viennoiserie',
       'nantais', 'chocolat', 'pain_s/sel', 'fondant_choc

In [7]:
def drop_values_from_column(df: pd.DataFrame, column: str, values: list) -> pd.DataFrame:
    """
    Drops specified values from a given column in the DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column from which values need to be dropped.
        values (list): A list of values to be dropped from the column.

    Returns:
        pd.DataFrame: The modified DataFrame with dropped values from the specified column.
    """
    # Create a copy of the original DataFrame
    modified_df = df.copy()

    # Drop rows containing specified values from the column
    modified_df = modified_df[modified_df[column].isin(values) == False].reset_index(drop=True)

    return modified_df

In [8]:
values_to_drop = ['coupe', 'traiteur', 'the']

In [9]:
modified_bakery_data = drop_values_from_column(bakery_data, 'article', values_to_drop)
display(modified_bakery_data)

Unnamed: 0,unnamed: 0,date,time,ticket_number,article,quantity,unit_price
0,0,2021-01-02,08:38,150040.0,baguette,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,pain_au_chocolat,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,pain_au_chocolat,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,pain,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,traditional_baguette,5.0,"1,20 €"
...,...,...,...,...,...,...,...
212215,511385,2022-09-30,18:52,288911.0,campagne,2.0,"2,00 €"
212216,511386,2022-09-30,18:52,288911.0,traditional_baguette,5.0,"1,30 €"
212217,511388,2022-09-30,18:52,288911.0,boule_200g,1.0,"1,20 €"
212218,511392,2022-09-30,18:55,288912.0,traditional_baguette,1.0,"1,30 €"


In [10]:
bakery_data = modified_bakery_data
display(bakery_data)

Unnamed: 0,unnamed: 0,date,time,ticket_number,article,quantity,unit_price
0,0,2021-01-02,08:38,150040.0,baguette,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,pain_au_chocolat,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,pain_au_chocolat,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,pain,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,traditional_baguette,5.0,"1,20 €"
...,...,...,...,...,...,...,...
212215,511385,2022-09-30,18:52,288911.0,campagne,2.0,"2,00 €"
212216,511386,2022-09-30,18:52,288911.0,traditional_baguette,5.0,"1,30 €"
212217,511388,2022-09-30,18:52,288911.0,boule_200g,1.0,"1,20 €"
212218,511392,2022-09-30,18:55,288912.0,traditional_baguette,1.0,"1,30 €"


In [11]:
bakery_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212220 entries, 0 to 212219
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   unnamed: 0     212220 non-null  int64  
 1   date           212220 non-null  object 
 2   time           212220 non-null  object 
 3   ticket_number  212220 non-null  float64
 4   article        212220 non-null  object 
 5   quantity       212220 non-null  float64
 6   unit_price     212220 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 11.3+ MB


In [12]:
bakery_data.drop('unnamed: 0', axis=1, inplace=True)

In [13]:
bakery_data = bakery_data[~bakery_data.article.str.contains("divers")]

In [14]:
bakery_data['article'].unique()

array(['baguette', 'pain_au_chocolat', 'pain', 'traditional_baguette',
       'croissant', 'banette', 'banettine', 'special_bread',
       'sand_jb_emmental', 'kouign_amann', 'boule_200g', 'boule_400g',
       'gal_frangipane_6p', 'campagne', 'moisson', 'cafe_ou_eau',
       'brioche', 'cereal_baguette', 'seigle', 'complet',
       'gal_frangipane_4p', 'cookie', 'ficelle', 'pain_aux_raisins',
       'gal_pomme_6p', 'gal_pomme_4p', 'financier_x5', 'vik_bread',
       'gache', 'sandwich_complet', 'pain_banette', 'grand_far_breton',
       'quim_bread', 'special_bread_kg', 'gd_kouign_amann', 'boule_polka',
       'demi_baguette', 'chausson_aux_pommes', 'baguette_graine',
       'sucette', 'boisson_33cl', 'pates', 'formule_sandwich',
       'croissant_amandes', 'pain_choco_amandes', 'sachet_viennoiserie',
       'nantais', 'chocolat', 'pain_s/sel', 'fondant_chocolat',
       'gal_poire_choco_6p', 'gal_poire_choco_4p', 'galette_8_pers',
       'sand_jb', 'sachet_de_crouton', 'grande_sucette

In [15]:
bakery_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 209771 entries, 0 to 212219
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           209771 non-null  object 
 1   time           209771 non-null  object 
 2   ticket_number  209771 non-null  float64
 3   article        209771 non-null  object 
 4   quantity       209771 non-null  float64
 5   unit_price     209771 non-null  object 
dtypes: float64(2), object(4)
memory usage: 11.2+ MB


In [16]:
#putting ticket_number as first column
column_to_move = bakery_data['ticket_number']

bakery_data = bakery_data.drop('ticket_number', axis=1)

bakery_data.insert(0, 'ticket_number', column_to_move)

display(bakery_data)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
0,150040.0,2021-01-02,08:38,baguette,1.0,"0,90 €"
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,"1,20 €"
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,"1,20 €"
3,150041.0,2021-01-02,09:14,pain,1.0,"1,15 €"
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,"1,20 €"
...,...,...,...,...,...,...
212215,288911.0,2022-09-30,18:52,campagne,2.0,"2,00 €"
212216,288911.0,2022-09-30,18:52,traditional_baguette,5.0,"1,30 €"
212217,288911.0,2022-09-30,18:52,boule_200g,1.0,"1,20 €"
212218,288912.0,2022-09-30,18:55,traditional_baguette,1.0,"1,30 €"


In [17]:
bakery_data.info()
bakery_data.describe(include=object).T

<class 'pandas.core.frame.DataFrame'>
Index: 209771 entries, 0 to 212219
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  209771 non-null  float64
 1   date           209771 non-null  object 
 2   time           209771 non-null  object 
 3   article        209771 non-null  object 
 4   quantity       209771 non-null  float64
 5   unit_price     209771 non-null  object 
dtypes: float64(2), object(4)
memory usage: 11.2+ MB


Unnamed: 0,count,unique,top,freq
date,209771,600,2022-08-14,921
time,209771,683,12:11,774
article,209771,140,traditional_baguette,67689
unit_price,209771,60,"1,20 €",49056


In [18]:
def convert_column_to_float(df: pd.DataFrame, column_name: str, symbol_to_remove: str) -> pd.DataFrame:
    """
    Convert a column in a DataFrame from object to float64 type,
    remove a specified symbol, and substitute ',' with '.' in its values.
    
    Args:
        df (pd.DataFrame): The DataFrame containing the column.
        column_name (str): The name of the column to convert.
        symbol_to_remove (str): The symbol to remove from the column values.
    
    Returns:
        pd.DataFrame: The modified DataFrame with the specified column converted to float64 type,
                      the symbol removed from its values, and ',' substituted with '.'.
    """
    # Create a copy of the original DataFrame
    modified_df = df.copy()
    
    # Remove specified symbol from the column values in the copied DataFrame
    modified_df[column_name] = modified_df[column_name].str.replace(symbol_to_remove, '')
    
    # Substitute ',' with '.' in the column values in the copied DataFrame
    modified_df[column_name] = modified_df[column_name].str.replace(',', '.')
    
    # Convert the copied column to float64 type in the copied DataFrame
    modified_df[column_name] = modified_df[column_name].astype('float64')
    
    return modified_df


In [19]:
converted_bakery_data = convert_column_to_float(bakery_data, 'unit_price', '€')

In [20]:
display(converted_bakery_data)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
0,150040.0,2021-01-02,08:38,baguette,1.0,0.90
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.20
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.20
3,150041.0,2021-01-02,09:14,pain,1.0,1.15
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.20
...,...,...,...,...,...,...
212215,288911.0,2022-09-30,18:52,campagne,2.0,2.00
212216,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.30
212217,288911.0,2022-09-30,18:52,boule_200g,1.0,1.20
212218,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.30


In [21]:
bakery_data = converted_bakery_data

In [22]:
bakery_data.info()
round(bakery_data.describe(),2)

<class 'pandas.core.frame.DataFrame'>
Index: 209771 entries, 0 to 212219
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  209771 non-null  float64
 1   date           209771 non-null  object 
 2   time           209771 non-null  object 
 3   article        209771 non-null  object 
 4   quantity       209771 non-null  float64
 5   unit_price     209771 non-null  float64
dtypes: float64(3), object(3)
memory usage: 11.2+ MB


Unnamed: 0,ticket_number,quantity,unit_price
count,209771.0,209771.0,209771.0
mean,219400.66,1.59,1.74
std,40110.13,1.34,1.58
min,150040.0,-200.0,0.0
25%,184810.5,1.0,1.15
50%,218886.0,1.0,1.25
75%,254371.5,2.0,1.5
max,288913.0,200.0,35.0


In [23]:
bakery_data.describe(include=object)

Unnamed: 0,date,time,article
count,209771,209771,209771
unique,600,683,140
top,2022-08-14,12:11,traditional_baguette
freq,921,774,67689


In [24]:
bakery_data = bakery_data.drop(bakery_data.index[bakery_data['quantity'] < 0])

In [25]:
round(bakery_data.describe(),2)

Unnamed: 0,ticket_number,quantity,unit_price
count,208589.0,208589.0,208589.0
mean,219367.46,1.6,1.73
std,40126.06,1.24,1.58
min,150040.0,1.0,0.0
25%,184773.0,1.0,1.15
50%,218738.0,1.0,1.25
75%,254418.0,2.0,1.5
max,288913.0,200.0,35.0


In [26]:
bakery_data.describe(include=object)

Unnamed: 0,date,time,article
count,208589,208589,208589
unique,600,683,140
top,2022-08-14,12:11,traditional_baguette
freq,921,771,67349


In [27]:
bakery_data['unit_price'].unique()

array([ 0.9 ,  1.2 ,  1.15,  1.1 ,  1.05,  0.6 ,  2.4 ,  3.5 ,  2.1 ,
        1.5 , 12.  ,  1.8 ,  2.  ,  1.  ,  5.  ,  1.25,  8.  ,  1.4 ,
        3.  ,  2.5 ,  4.5 ,  7.  ,  4.8 ,  7.5 ,  1.6 ,  0.45,  1.3 ,
        0.3 ,  6.5 ,  4.  , 16.  ,  9.  ,  0.8 , 18.  ,  0.  ,  0.7 ,
        5.5 ,  6.  , 11.  , 14.  , 21.  , 28.  , 10.  , 15.  ,  0.95,
        0.5 ,  0.65,  2.2 ,  1.9 ,  1.7 ,  2.6 ,  1.35,  7.6 ,  4.9 ,
        8.3 ,  2.3 ,  5.2 ,  2.7 ,  9.1 , 35.  ])

In [28]:
ordered_by_price_data = bakery_data.copy()

In [29]:
display(ordered_by_price_data.sort_values('unit_price'))

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
56272,186662.0,2021-07-10,13:25,.,2.0,0.0
141623,242598.0,2022-04-11,12:37,gd_far_breton,1.0,0.0
22643,164878.0,2021-04-04,09:53,.,1.0,0.0
14415,159538.0,2021-03-05,18:18,gd_far_breton,1.0,0.0
18027,161853.0,2021-03-18,12:59,.,1.0,0.0
...,...,...,...,...,...,...
193612,276727.0,2022-08-13,12:58,tarte_fraise_6p,1.0,21.0
200517,281067.0,2022-08-24,10:03,royal_6p,1.0,21.0
172618,263323.0,2022-07-10,09:36,tarte_fraise_6p,1.0,21.0
116387,225766.0,2021-12-24,11:10,buche_8pers,2.0,28.0


In [30]:
df_filtered = ordered_by_price_data[ordered_by_price_data['unit_price'] >= 0.08]

In [31]:
display(df_filtered.sort_values('unit_price').head(50))

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
93747,210581.0,2021-09-22,16:37,sucette,1.0,0.3
52759,184390.0,2021-07-01,17:21,sucette,7.0,0.3
52757,184389.0,2021-07-01,17:20,sucette,7.0,0.3
52686,184337.0,2021-07-01,12:05,sucette,2.0,0.3
107654,219858.0,2021-11-09,17:58,sucette,2.0,0.3
52502,184210.0,2021-07-01,08:07,sucette,1.0,0.3
197801,279369.0,2022-08-20,07:38,sucette,1.0,0.3
197779,279351.0,2022-08-19,18:15,sucette,2.0,0.3
52393,184134.0,2021-06-30,12:41,sucette,1.0,0.3
162682,256497.0,2022-06-11,11:29,sucette,1.0,0.3


In [32]:
bakery_data = df_filtered

In [33]:
bakery_data.info()
round(bakery_data.describe(),2).T

<class 'pandas.core.frame.DataFrame'>
Index: 208578 entries, 0 to 212219
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  208578 non-null  float64
 1   date           208578 non-null  object 
 2   time           208578 non-null  object 
 3   article        208578 non-null  object 
 4   quantity       208578 non-null  float64
 5   unit_price     208578 non-null  float64
dtypes: float64(3), object(3)
memory usage: 11.1+ MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ticket_number,208578.0,219368.68,40125.86,150040.0,184774.0,218739.5,254419.75,288913.0
quantity,208578.0,1.6,1.24,1.0,1.0,1.0,2.0,200.0
unit_price,208578.0,1.73,1.58,0.3,1.15,1.25,1.5,35.0


In [34]:
bakery_data.describe(include=object)

Unnamed: 0,date,time,article
count,208578,208578,208578
unique,600,683,138
top,2022-08-14,12:11,traditional_baguette
freq,921,771,67349


In [35]:
bakery_data_table = bakery_data.copy()

In [36]:
bakery_price_table = bakery_data_table[['article','unit_price']]
bakery_price_table = bakery_price_table.drop_duplicates()

In [37]:
bakery_price_table = bakery_price_table.reset_index(drop=True)

In [38]:
bakery_price_table = bakery_price_table.sort_values(by=['article'], ignore_index=True)
bakery_price_table = bakery_price_table.drop_duplicates(subset='article', keep="last").reset_index(drop=True)

In [39]:
display(bakery_price_table)

Unnamed: 0,article,unit_price
0,12_macaron,10.0
1,armoricain,2.5
2,baguette,1.0
3,baguette_apero,4.5
4,baguette_graine,1.3
...,...,...
133,tropezienne,2.0
134,tropezienne_framboise,2.2
135,tulipe,3.5
136,viennoise,1.1


In [40]:
bakery_price_table.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_price,138.0,4.944565,5.494927,0.3,1.525,2.5,6.5,35.0


In [41]:
merged_df = bakery_data.merge(bakery_price_table, on='article', how='left')
merged_df['unit_price_x'].fillna(merged_df['unit_price_y'], inplace=True)
merged_df.drop(['unit_price_x'], axis=1, inplace=True)
merged_df.rename(columns={'unit_price_y': 'unit_price'}, inplace=True)

display(merged_df)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
0,150040.0,2021-01-02,08:38,baguette,1.0,1.00
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.20
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.20
3,150041.0,2021-01-02,09:14,pain,1.0,1.15
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.25
...,...,...,...,...,...,...
208573,288911.0,2022-09-30,18:52,campagne,2.0,1.90
208574,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.25
208575,288911.0,2022-09-30,18:52,boule_200g,1.0,1.20
208576,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.25


In [42]:
bakery_data = merged_df
bakery_data['article_total'] = bakery_data['unit_price'] * bakery_data['quantity'] # create a new column for the product of unit_price and quantity

In [43]:
display(bakery_data)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price,article_total
0,150040.0,2021-01-02,08:38,baguette,1.0,1.00,1.00
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.20,3.60
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.20,2.40
3,150041.0,2021-01-02,09:14,pain,1.0,1.15,1.15
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.25,6.25
...,...,...,...,...,...,...,...
208573,288911.0,2022-09-30,18:52,campagne,2.0,1.90,3.80
208574,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.25,6.25
208575,288911.0,2022-09-30,18:52,boule_200g,1.0,1.20,1.20
208576,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.25,1.25


In [44]:
tickets_total = bakery_data.groupby(by='ticket_number')['article_total'].sum()
tickets_total.columns = ['ticket_number', 'total_ticket']

In [45]:
display(tickets_total)

ticket_number
150040.0     4.60
150041.0     3.55
150042.0     6.25
150043.0     5.30
150044.0     1.10
            ...  
288908.0     2.70
288910.0     1.25
288911.0    11.25
288912.0     1.25
288913.0     1.25
Name: article_total, Length: 134973, dtype: float64

In [46]:
tickets_total.info()

<class 'pandas.core.series.Series'>
Index: 134973 entries, 150040.0 to 288913.0
Series name: article_total
Non-Null Count   Dtype  
--------------   -----  
134973 non-null  float64
dtypes: float64(1)
memory usage: 2.1 MB


In [47]:
tickets_total = tickets_total.to_frame(name="total_ticket")

In [48]:
display(tickets_total)

Unnamed: 0_level_0,total_ticket
ticket_number,Unnamed: 1_level_1
150040.0,4.60
150041.0,3.55
150042.0,6.25
150043.0,5.30
150044.0,1.10
...,...
288908.0,2.70
288910.0,1.25
288911.0,11.25
288912.0,1.25


In [49]:
tickets_total.info()
round(tickets_total.describe().T,2)

<class 'pandas.core.frame.DataFrame'>
Index: 134973 entries, 150040.0 to 288913.0
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   total_ticket  134973 non-null  float64
dtypes: float64(1)
memory usage: 2.1 MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_ticket,134973.0,4.06,4.34,0.3,1.25,2.5,5.0,247.1


In [50]:
bakery_data.to_csv('../data/clean_data/bakery_data.csv')
tickets_total.to_csv('../data/clean_data/tickets_total.csv')
bakery_price_table.to_csv('../data/clean_data/bakery_price_table.csv')