# Importing Libraries

In [1]:
import shutil

import pandas as pd
import yaml

In [2]:
# Load the project configuration; later cells read paths and settings from `config`.
try:
    with open('../config.yaml', "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Fail fast: carrying on without `config` would only surface later as a
    # confusing NameError in the first cell that reads it. Any other error
    # (e.g. malformed YAML) propagates unchanged instead of being mislabelled.
    print("Config file not found!!!")
    raise

# Loading the data file and displaying it together with its info

In [3]:
bakery_data = pd.read_csv(config['data']['input'])
display(bakery_data)
bakery_data.info()

Unnamed: 0.1,Unnamed: 0,date,time,ticket_number,article,Quantity,unit_price
0,0,2021-01-02,08:38,150040.0,BAGUETTE,1.0,"0,90 €"
1,1,2021-01-02,08:38,150040.0,PAIN AU CHOCOLAT,3.0,"1,20 €"
2,4,2021-01-02,09:14,150041.0,PAIN AU CHOCOLAT,2.0,"1,20 €"
3,5,2021-01-02,09:14,150041.0,PAIN,1.0,"1,15 €"
4,8,2021-01-02,09:25,150042.0,TRADITIONAL BAGUETTE,5.0,"1,20 €"
...,...,...,...,...,...,...,...
234000,511387,2022-09-30,18:52,288911.0,COUPE,1.0,"0,15 €"
234001,511388,2022-09-30,18:52,288911.0,BOULE 200G,1.0,"1,20 €"
234002,511389,2022-09-30,18:52,288911.0,COUPE,2.0,"0,15 €"
234003,511392,2022-09-30,18:55,288912.0,TRADITIONAL BAGUETTE,1.0,"1,30 €"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234005 entries, 0 to 234004
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     234005 non-null  int64  
 1   date           234005 non-null  object 
 2   time           234005 non-null  object 
 3   ticket_number  234005 non-null  float64
 4   article        234005 non-null  object 
 5   Quantity       234005 non-null  float64
 6   unit_price     234005 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 12.5+ MB


# Dropping column 'Unnamed: 0'

In [4]:
# Drop the 'Unnamed: 0' column. Reassigning (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
bakery_data = bakery_data.drop(columns='Unnamed: 0', errors='ignore')

# Formatting column names to lowercase and replacing white spaces with underscores in the 'article' values

In [5]:
def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a formatted copy of the DataFrame.

    Column names are converted to lowercase, and the values of the 'article'
    column are converted to lowercase with spaces replaced by underscores.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The input DataFrame. Once its column names are
            lowercased it must contain an 'article' column holding strings.

    Returns:
        pd.DataFrame: A new DataFrame with lowercase column names and
            formatted 'article' values.

    Raises:
        KeyError: If there is no 'article' column after lowercasing the names.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    formatted = df.copy()

    # Rename columns to lowercase
    formatted.columns = formatted.columns.str.lower()

    # Format 'article' values to lowercase with underscores; regex=False makes
    # the replacement a plain literal substitution on every pandas version
    formatted['article'] = (
        formatted['article'].str.lower().str.replace(' ', '_', regex=False)
    )

    return formatted

# Applying the function and calling the DataFrame to check the changes

In [6]:
bakery_data = format_dataframe(bakery_data)
bakery_data.head()

Unnamed: 0,date,time,ticket_number,article,quantity,unit_price
0,2021-01-02,08:38,150040.0,baguette,1.0,"0,90 €"
1,2021-01-02,08:38,150040.0,pain_au_chocolat,3.0,"1,20 €"
2,2021-01-02,09:14,150041.0,pain_au_chocolat,2.0,"1,20 €"
3,2021-01-02,09:14,150041.0,pain,1.0,"1,15 €"
4,2021-01-02,09:25,150042.0,traditional_baguette,5.0,"1,20 €"


# Moving Column ticket_number to the beginning as it is the column that identifies all others

In [7]:
def move_column_to_beginning(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Build a new DataFrame in which the given column comes first.

    Args:
        - df (pd.DataFrame): The DataFrame whose columns are reordered.
        - column_name (str): The name of the column to place first.

    Returns:
        df (pd.DataFrame): A new DataFrame with `column_name` as its first
        column, followed by the remaining columns in their existing order.
    """
    # Every column except the requested one, in the order they already have
    remaining = [name for name in df.columns if name != column_name]

    # Selecting with an explicit column list returns a reordered new frame
    return df[[column_name] + remaining]

# Calling the function and checking change

In [8]:
bakery_data = move_column_to_beginning(bakery_data, 'ticket_number')
bakery_data.head()

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
0,150040.0,2021-01-02,08:38,baguette,1.0,"0,90 €"
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,"1,20 €"
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,"1,20 €"
3,150041.0,2021-01-02,09:14,pain,1.0,"1,15 €"
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,"1,20 €"


# Calling column 'article' unique values to check for uninteresting values for the intended analysis

In [9]:
bakery_data['article'].unique()

array(['baguette', 'pain_au_chocolat', 'pain', 'traditional_baguette',
       'croissant', 'banette', 'banettine', 'special_bread', 'coupe',
       'sand_jb_emmental', 'kouign_amann', 'boule_200g', 'boule_400g',
       'gal_frangipane_6p', 'campagne', 'moisson', 'cafe_ou_eau',
       'brioche', 'cereal_baguette', 'seigle', 'complet',
       'divers_patisserie', 'gal_frangipane_4p', 'cookie', 'ficelle',
       'pain_aux_raisins', 'gal_pomme_6p', 'gal_pomme_4p', 'financier_x5',
       'vik_bread', 'divers_viennoiserie', 'gache', 'sandwich_complet',
       'pain_banette', 'grand_far_breton', 'quim_bread',
       'special_bread_kg', 'gd_kouign_amann', 'boule_polka',
       'demi_baguette', 'chausson_aux_pommes', 'baguette_graine',
       'divers_confiserie', 'sucette', 'divers_boulangerie',
       'boisson_33cl', 'pates', 'formule_sandwich', 'divers_sandwichs',
       'croissant_amandes', 'pain_choco_amandes', 'sachet_viennoiserie',
       'nantais', 'chocolat', 'pain_s/sel', 'fondant_choc

In [10]:
bakery_data['article'].nunique()

149

# Creating and applying function to drop uninteresting values from column 'article'

In [11]:
def drop_values_from_column(df: pd.DataFrame, column: str, values: list) -> pd.DataFrame:
    """
    Remove every row whose value in `column` appears in `values`.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column to inspect.
        values (list): The values whose rows should be removed.

    Returns:
        df (pd.DataFrame): A new DataFrame without the matching rows and with
        a fresh 0..n-1 index.
    """
    # Flag the rows to discard, keep the rest, then renumber the index
    unwanted = df[column].isin(values)
    kept = df[~unwanted]

    return kept.reset_index(drop=True)

In [12]:
# 'article' values considered uninteresting for the analysis.
# NOTE(review): this list is not referenced by the drop cell that follows, which
# reads config['analysis']['values_to_drop'] instead — confirm the two agree, or
# remove one of them so there is a single source of truth.
values_to_drop = ['coupe', 'traiteur', 'the', 'plat_6.50e', 'plat_7.60e', 'plat_7.00',
       'plat', 'plat_8.30e', 'formule_pate', 'pt_plateau_sale', 'reduction_sucrees_12', '.', 'platprepare6,50', 'platprepare5,50', 'platprepare7,00',
       'formule_plat_prepare', 'platprepare6,00', 'article_295', 'reduction_sucrees_24', 'gd_plateau_sale']

In [13]:
bakery_data = drop_values_from_column(bakery_data, 'article', config['analysis']['values_to_drop'])

### Drop rows whose 'article' value contains the string 'divers'

In [14]:
bakery_data = bakery_data[~bakery_data.article.str.contains("divers")]

### checking if all the desired rows were dropped

In [15]:
bakery_data['article'].unique()

array(['baguette', 'pain_au_chocolat', 'pain', 'traditional_baguette',
       'croissant', 'banette', 'banettine', 'special_bread',
       'sand_jb_emmental', 'kouign_amann', 'boule_200g', 'boule_400g',
       'gal_frangipane_6p', 'campagne', 'moisson', 'cafe_ou_eau',
       'brioche', 'cereal_baguette', 'seigle', 'complet',
       'gal_frangipane_4p', 'cookie', 'ficelle', 'pain_aux_raisins',
       'gal_pomme_6p', 'gal_pomme_4p', 'financier_x5', 'vik_bread',
       'gache', 'sandwich_complet', 'pain_banette', 'grand_far_breton',
       'quim_bread', 'special_bread_kg', 'gd_kouign_amann', 'boule_polka',
       'demi_baguette', 'chausson_aux_pommes', 'baguette_graine',
       'sucette', 'boisson_33cl', 'pates', 'formule_sandwich',
       'croissant_amandes', 'pain_choco_amandes', 'sachet_viennoiserie',
       'nantais', 'chocolat', 'pain_s/sel', 'fondant_chocolat',
       'gal_poire_choco_6p', 'gal_poire_choco_4p', 'galette_8_pers',
       'sand_jb', 'sachet_de_crouton', 'grande_sucette

In [16]:
bakery_data['article'].nunique()

123

In [17]:
bakery_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 209254 entries, 0 to 211702
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  209254 non-null  float64
 1   date           209254 non-null  object 
 2   time           209254 non-null  object 
 3   article        209254 non-null  object 
 4   quantity       209254 non-null  float64
 5   unit_price     209254 non-null  object 
dtypes: float64(2), object(4)
memory usage: 11.2+ MB


# Creating and applying function to convert column to float as well as replacing and removing symbols

In [18]:
def convert_column_to_float(df: pd.DataFrame, column_name: str, symbol_to_remove: str) -> pd.DataFrame:
    """
    Return a copy of the DataFrame with a text column converted to float64.

    The specified symbol is removed from the values, ',' is substituted with
    '.', surrounding whitespace is stripped, and the result is cast to float64.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The DataFrame containing the column.
        column_name (str): The name of the (string) column to convert.
        symbol_to_remove (str): The symbol to remove from the column values.
            It is matched literally, never as a regular expression.

    Returns:
        pd.DataFrame: A new DataFrame in which `column_name` is float64.

    Raises:
        ValueError: If a cleaned value still cannot be parsed as a float.
    """
    # Work on a copy: the caller's frame may itself be a filtered slice, and
    # assigning into it would mutate it (and trigger SettingWithCopyWarning).
    converted = df.copy()

    cleaned = (
        converted[column_name]
        # regex=False: symbols such as '$' or '.' must not be read as a pattern
        .str.replace(symbol_to_remove, '', regex=False)
        # Turn the decimal comma into a decimal point
        .str.replace(',', '.', regex=False)
        # Drop whitespace left between the number and the removed symbol
        .str.strip()
    )

    converted[column_name] = cleaned.astype('float64')

    return converted

In [19]:
bakery_data = convert_column_to_float(bakery_data, 'unit_price', '€')

### Checking changes to Dataframe

In [20]:
bakery_data.info()
round(bakery_data.describe().T, 2)

<class 'pandas.core.frame.DataFrame'>
Index: 209254 entries, 0 to 211702
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  209254 non-null  float64
 1   date           209254 non-null  object 
 2   time           209254 non-null  object 
 3   article        209254 non-null  object 
 4   quantity       209254 non-null  float64
 5   unit_price     209254 non-null  float64
dtypes: float64(3), object(3)
memory usage: 11.2+ MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ticket_number,209254.0,219300.71,40091.99,150040.0,184723.25,218723.5,254214.75,288913.0
quantity,209254.0,1.59,1.34,-200.0,1.0,1.0,2.0,200.0
unit_price,209254.0,1.72,1.55,0.0,1.15,1.25,1.5,35.0


# Dropping negative values from 'quantity' column as they are uninteresting and only indicate removals from transactions at checkout

In [21]:
bakery_data = bakery_data.drop(bakery_data.index[bakery_data['quantity'] < 0])

In [22]:
round(bakery_data.describe().T, 2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ticket_number,208081.0,219268.5,40107.87,150040.0,184688.0,218573.0,254255.0,288913.0
quantity,208081.0,1.6,1.24,1.0,1.0,1.0,2.0,200.0
unit_price,208081.0,1.72,1.55,0.0,1.15,1.25,1.5,35.0


In [23]:
bakery_data.describe(include=object).T

Unnamed: 0,count,unique,top,freq
date,208081,600,2022-08-14,919
time,208081,683,12:11,770
article,208081,123,traditional_baguette,67349


# Checking 'unit_price' column for uninteresting values, like 0.00

In [24]:
bakery_data['unit_price'].unique()

array([ 0.9 ,  1.2 ,  1.15,  1.1 ,  1.05,  0.6 ,  2.4 ,  3.5 ,  2.1 ,
        1.5 , 12.  ,  1.8 ,  2.  ,  1.  ,  5.  ,  1.25,  8.  ,  1.4 ,
        3.  ,  2.5 ,  4.5 ,  7.  ,  4.8 ,  7.5 ,  1.6 ,  0.45,  1.3 ,
        0.3 ,  6.5 ,  4.  , 16.  ,  9.  ,  0.8 , 18.  ,  0.  ,  0.7 ,
        6.  , 11.  , 14.  , 21.  , 28.  ,  0.95,  0.5 ,  0.65,  2.2 ,
        1.9 ,  1.7 ,  2.6 ,  1.35, 10.  ,  2.3 ,  5.2 ,  2.7 , 35.  ])

In [25]:
display(bakery_data['unit_price'].sort_values())

14414      0.0
145804     0.0
169019     0.0
141508     0.0
36176      0.3
          ... 
172347    21.0
157691    21.0
188955    21.0
116349    28.0
179962    35.0
Name: unit_price, Length: 208081, dtype: float64

# Dropping 0.00 values from 'unit_price' as they are uninteresting for the current analysis

In [26]:
bakery_data = bakery_data[bakery_data["unit_price"] != 0.00]

In [27]:
display(bakery_data["unit_price"].sort_values())

179100     0.3
24917      0.3
36083      0.3
147288     0.3
25031      0.3
          ... 
148227    21.0
172347    21.0
177088    21.0
116349    28.0
179962    35.0
Name: unit_price, Length: 208077, dtype: float64

In [28]:
bakery_data['unit_price'].unique()

array([ 0.9 ,  1.2 ,  1.15,  1.1 ,  1.05,  0.6 ,  2.4 ,  3.5 ,  2.1 ,
        1.5 , 12.  ,  1.8 ,  2.  ,  1.  ,  5.  ,  1.25,  8.  ,  1.4 ,
        3.  ,  2.5 ,  4.5 ,  7.  ,  4.8 ,  7.5 ,  1.6 ,  0.45,  1.3 ,
        0.3 ,  6.5 ,  4.  , 16.  ,  9.  ,  0.8 , 18.  ,  0.7 ,  6.  ,
       11.  , 14.  , 21.  , 28.  ,  0.95,  0.5 ,  0.65,  2.2 ,  1.9 ,
        1.7 ,  2.6 ,  1.35, 10.  ,  2.3 ,  5.2 ,  2.7 , 35.  ])

In [29]:
bakery_data.info()
round(bakery_data.describe().T, 2)

<class 'pandas.core.frame.DataFrame'>
Index: 208077 entries, 0 to 211702
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  208077 non-null  float64
 1   date           208077 non-null  object 
 2   time           208077 non-null  object 
 3   article        208077 non-null  object 
 4   quantity       208077 non-null  float64
 5   unit_price     208077 non-null  float64
dtypes: float64(3), object(3)
memory usage: 11.1+ MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ticket_number,208077.0,219268.35,40107.86,150040.0,184688.0,218572.0,254255.0,288913.0
quantity,208077.0,1.6,1.24,1.0,1.0,1.0,2.0,200.0
unit_price,208077.0,1.72,1.55,0.3,1.15,1.25,1.5,35.0


In [30]:
bakery_data.describe(include=object).T

Unnamed: 0,count,unique,top,freq
date,208077,600,2022-08-14,919
time,208077,683,12:11,770
article,208077,123,traditional_baguette,67349


# Creating DataFrame called bakery_price_table with only 'article' and 'unit_price', later sorted by 'unit_price' in ascending order

In [31]:
bakery_data_table = bakery_data.copy()

In [32]:
bakery_price_table = bakery_data_table[['article','unit_price']]
bakery_price_table

Unnamed: 0,article,unit_price
0,baguette,0.90
1,pain_au_chocolat,1.20
2,pain_au_chocolat,1.20
3,pain,1.15
4,traditional_baguette,1.20
...,...,...
211698,campagne,2.00
211699,traditional_baguette,1.30
211700,boule_200g,1.20
211701,traditional_baguette,1.30


# Dropping Duplicates to standardize prices

In [33]:
bakery_price_table = bakery_price_table.drop_duplicates()

In [34]:
bakery_price_table = bakery_price_table.reset_index(drop=True)

In [35]:
# Sort ascending by price so that, for each article, its most expensive entry comes last
bakery_price_table = bakery_price_table.sort_values(by=['unit_price'], ignore_index=True)
# keep="last" then leaves exactly one row per article: the one with its highest 'unit_price'
bakery_price_table = bakery_price_table.drop_duplicates(subset='article', keep="last").reset_index(drop=True)

In [36]:
display(bakery_price_table)
bakery_price_table.info()

Unnamed: 0,article,unit_price
0,sucette,0.3
1,demi_baguette,0.5
2,bottereau,0.5
3,grande_sucette,0.6
4,pain_noir,0.6
...,...,...
118,tarte_fraise_6p,21.0
119,buche_6pers,21.0
120,royal_6p,21.0
121,buche_8pers,28.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   article     123 non-null    object 
 1   unit_price  123 non-null    float64
dtypes: float64(1), object(1)
memory usage: 2.0+ KB


In [37]:
round(bakery_price_table.describe().T, 2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_price,123.0,4.6,5.6,0.3,1.6,2.5,5.0,35.0


# Applying the standardized 'unit_price' values from bakery_price_table to bakery_data so that prices are consistent between the two

In [38]:
# Replace each row's recorded price with the article's price from
# bakery_price_table. The recorded 'unit_price' is dropped before merging, so
# the merge brings in exactly one 'unit_price' column (appended last) and no
# '_x'/'_y' suffix clean-up or chained in-place fillna is needed.
merged_df = (
    bakery_data
    .drop(columns='unit_price')
    .merge(bakery_price_table, on='article', how='left')
)

# The price table is built from these same rows, so every article must find a price
assert merged_df['unit_price'].notna().all(), "some articles have no standardized price"

display(merged_df)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price
0,150040.0,2021-01-02,08:38,baguette,1.0,1.0
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.3
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.3
3,150041.0,2021-01-02,09:14,pain,1.0,1.3
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.3
...,...,...,...,...,...,...
208072,288911.0,2022-09-30,18:52,campagne,2.0,2.0
208073,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.3
208074,288911.0,2022-09-30,18:52,boule_200g,1.0,1.2
208075,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.3


In [39]:
bakery_data = merged_df

# Creating a new column for the final value ('article_total') of 'unit_price' multiplied by 'quantity'

In [40]:
bakery_data['article_total'] = bakery_data['unit_price'] * bakery_data['quantity'] 

### Checking new column 'article_total' and the changes to bakery_data

In [41]:
display(bakery_data)
bakery_data.info()
round(bakery_data.describe().T, 2)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price,article_total
0,150040.0,2021-01-02,08:38,baguette,1.0,1.0,1.0
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.3,3.9
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.3,2.6
3,150041.0,2021-01-02,09:14,pain,1.0,1.3,1.3
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.3,6.5
...,...,...,...,...,...,...,...
208072,288911.0,2022-09-30,18:52,campagne,2.0,2.0,4.0
208073,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.3,6.5
208074,288911.0,2022-09-30,18:52,boule_200g,1.0,1.2,1.2
208075,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.3,1.3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208077 entries, 0 to 208076
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_number  208077 non-null  float64
 1   date           208077 non-null  object 
 2   time           208077 non-null  object 
 3   article        208077 non-null  object 
 4   quantity       208077 non-null  float64
 5   unit_price     208077 non-null  float64
 6   article_total  208077 non-null  float64
dtypes: float64(4), object(3)
memory usage: 11.1+ MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ticket_number,208077.0,219268.35,40107.86,150040.0,184688.0,218572.0,254255.0,288913.0
quantity,208077.0,1.6,1.24,1.0,1.0,1.0,2.0,200.0
unit_price,208077.0,1.84,1.66,0.3,1.2,1.3,1.7,35.0
article_total,208077.0,2.73,2.63,0.3,1.3,2.0,2.7,200.0


In [42]:
bakery_data.describe(include=object).T

Unnamed: 0,count,unique,top,freq
date,208077,600,2022-08-14,919
time,208077,683,12:11,770
article,208077,123,traditional_baguette,67349


# Creating new DataFrame with ticket_number and total_ticket, which will be the values of 'article_total' aggregated (with sum) by 'ticket_number'

In [43]:
# Sum the line totals per ticket. The result is a Series indexed by
# 'ticket_number'; a Series has no `.columns`, so nothing is assigned to it
# here (doing so only sets a stray attribute and raises a pandas UserWarning).
tickets_total = bakery_data.groupby(by='ticket_number')['article_total'].sum()

In [44]:
display(tickets_total)

ticket_number
150040.0     4.90
150041.0     3.90
150042.0     6.50
150043.0     5.60
150044.0     1.15
            ...  
288908.0     2.70
288910.0     1.30
288911.0    11.70
288912.0     1.30
288913.0     1.30
Name: article_total, Length: 134886, dtype: float64

In [45]:
tickets_total.info()

<class 'pandas.core.series.Series'>
Index: 134886 entries, 150040.0 to 288913.0
Series name: article_total
Non-Null Count   Dtype  
--------------   -----  
134886 non-null  float64
dtypes: float64(1)
memory usage: 2.1 MB


In [46]:
tickets_total = tickets_total.to_frame(name="total_ticket")

In [47]:
display(tickets_total)

Unnamed: 0_level_0,total_ticket
ticket_number,Unnamed: 1_level_1
150040.0,4.90
150041.0,3.90
150042.0,6.50
150043.0,5.60
150044.0,1.15
...,...
288908.0,2.70
288910.0,1.30
288911.0,11.70
288912.0,1.30


In [48]:
tickets_total.info()
round(tickets_total.describe().T, 2)

<class 'pandas.core.frame.DataFrame'>
Index: 134886 entries, 150040.0 to 288913.0
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   total_ticket  134886 non-null  float64
dtypes: float64(1)
memory usage: 2.1 MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_ticket,134886.0,4.2,4.45,0.3,1.3,2.6,5.2,247.1


# Saving clean and formatted DataFrames in .csv files to work with later

In [49]:
# Persist the cleaned tables to the paths configured under data.output.
# NOTE(review): to_csv writes the index by default. For bakery_data and
# bakery_price_table that is a plain positional index, which comes back as an
# 'Unnamed: 0' column on a bare pd.read_csv — consider index=False once it is
# confirmed that no downstream reader relies on that column.
bakery_data.to_csv(config['data']['output']['bakery_data'])
# Here the index is 'ticket_number', so writing it keeps the ticket identifier in the file
tickets_total.to_csv(config['data']['output']['tickets_total'])
bakery_price_table.to_csv(config['data']['output']['bakery_price'])

# Writing the functions used to clean and format the original DataFrame to the clean_format_functions.py file and moving that file to the src folder

In [50]:
%%writefile clean_format_functions.py

import pandas as pd


def format_dataframe(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Return a formatted copy of the DataFrame.

    Column names are converted to lowercase, and the values of the specified
    column are converted to lowercase with spaces replaced by underscores.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): The column whose values are formatted. It is looked
            up after the column names have been lowercased, so pass the
            lowercase name.

    Returns:
        pd.DataFrame: A new DataFrame with lowercase column names and
            formatted `column_name` values.

    Raises:
        KeyError: If `column_name` is not present after lowercasing the names.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    formatted = df.copy()

    # Rename columns to lowercase
    formatted.columns = formatted.columns.str.lower()

    # Format column values to lowercase with underscores; regex=False makes
    # the replacement a plain literal substitution on every pandas version
    formatted[column_name] = (
        formatted[column_name].str.lower().str.replace(' ', '_', regex=False)
    )

    return formatted

def move_column_to_beginning(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Build a new DataFrame in which the given column comes first.

    Args:
        - df (pd.DataFrame): The DataFrame whose columns are reordered.
        - column_name (str): The name of the column to place first.

    Returns:
        df (pd.DataFrame): A new DataFrame with `column_name` as its first
        column, followed by the remaining columns in their existing order.
    """
    # Every column except the requested one, in the order they already have
    remaining = [name for name in df.columns if name != column_name]

    # Selecting with an explicit column list returns a reordered new frame
    return df[[column_name] + remaining]

def drop_values_from_column(df: pd.DataFrame, column: str, values: list) -> pd.DataFrame:
    """
    Remove every row whose value in `column` appears in `values`.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column to inspect.
        values (list): The values whose rows should be removed.

    Returns:
        df (pd.DataFrame): A new DataFrame without the matching rows and with
        a fresh 0..n-1 index.
    """
    # Flag the rows to discard, keep the rest, then renumber the index
    unwanted = df[column].isin(values)
    kept = df[~unwanted]

    return kept.reset_index(drop=True)

def convert_column_to_float(df: pd.DataFrame, column_name: str, symbol_to_remove: str) -> pd.DataFrame:
    """
    Return a copy of the DataFrame with a text column converted to float64.

    The specified symbol is removed from the values, ',' is substituted with
    '.', surrounding whitespace is stripped, and the result is cast to float64.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The DataFrame containing the column.
        column_name (str): The name of the (string) column to convert.
        symbol_to_remove (str): The symbol to remove from the column values.
            It is matched literally, never as a regular expression.

    Returns:
        pd.DataFrame: A new DataFrame in which `column_name` is float64.

    Raises:
        ValueError: If a cleaned value still cannot be parsed as a float.
    """
    # Work on a copy: the caller's frame may itself be a filtered slice, and
    # assigning into it would mutate it (and trigger SettingWithCopyWarning).
    converted = df.copy()

    cleaned = (
        converted[column_name]
        # regex=False: symbols such as '$' or '.' must not be read as a pattern
        .str.replace(symbol_to_remove, '', regex=False)
        # Turn the decimal comma into a decimal point
        .str.replace(',', '.', regex=False)
        # Drop whitespace left between the number and the removed symbol
        .str.strip()
    )

    converted[column_name] = cleaned.astype('float64')

    return converted


Writing clean_format_functions.py


In [51]:
# Move the generated module with shutil rather than the Windows-only `move`
# shell command: this runs on any platform and passes the configured paths as
# plain arguments instead of interpolating them into a shell line.
# The trailing ';' suppresses the returned destination path in the cell output.
shutil.move(config['functions']['source_path'], config['functions']['destination_path']);

        1 file(s) moved.
