In [105]:
# Import required packages/ libraries/ frameworks from requirements.txt
import pandas as pd
import os
import re

In [106]:
# Load Data
def load_file(file_path):

    # converts filepath extension to lowercase
    ext = os.path.splitext(file_path)[-1].lower()

    # Handle .csv files
    """pd.read_csv expects a file path as its argument - above line of code will not work
    because it is being passed a DataFrame/ Series object created by load file method and saved
    to variable named 'file'
    """
    if ext == '.csv':
        return pd.read_csv(file_path)

    # Handle excel spreadsheet files
    elif ext in ['.xls', '.xlsx']:
        return pd.read_excel(file_path)
    
    # All other files
    else:
        raise ValueError("Unsupported file type")

In [107]:
# Create pandas DataFrame from ORIGINAL file on filePath
df = pd.read_csv(r'C:\Users\Cessn\OneDrive\Desktop\data_cleanser\concert_tours_by_women_TEST.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0.3                    20 non-null     int64  
 1   Unnamed: 0.2                    20 non-null     int64  
 2   Unnamed: 0.1                    20 non-null     int64  
 3   Unnamed: 0                      20 non-null     int64  
 4   rank                            20 non-null     int64  
 5   peak                            9 non-null      object 
 6   all_time_peak                   6 non-null      float64
 7   actual_gross                    20 non-null     object 
 8   adjusted_gross_in_2022_dollars  20 non-null     int64  
 9   artist                          20 non-null     object 
 10  tour_title                      20 non-null     object 
 11  years                           20 non-null     int64  
 12  shows                           20 non

In [108]:
# file = load_file(r"example_import.csv")
# THIS WORKS BUT MIGHT BE REDUNDANT
file_toBe_cleansed = load_file(r'C:\Users\Cessn\OneDrive\Desktop\data_cleanser\concert_tours_by_women_TEST.csv')
# Will this work > cleansed_file = df?
# print(file)

# df = pd.read_csv(file) > WILL NOT WORK - Yields TypeError: argument of type 'method' is not iterable (DO NOT DELETE!) df.columns reference = https://www.geeksforgeeks.org/python/python-pandas-dataframe-columns/

In [109]:
# DATA CLEANSING FUNCTIONS

# Clean column names
''' Captures all column names in DataFrame + Does following actions:
1. Strip whitespace (both left/right side of string)
2. Convert column names to all lowercase
3. Locates regex pattern specified (replaces any char that is NOT a letter or number with an underscore)
4. str.strip('_') removes any underscore characters at start/ end of string 
(https://www.w3schools.com/python/ref_string_strip.asp)
'''
def clean_column_names(df):
    df.columns = (
        df.columns.str.strip()
                  .str.lower()
                  .str.replace(r'[^a-z0-9]+', '', regex=True)
                  .str.strip('_')
    )
    return df

#CALL FUNCTION 
cleansed_file = clean_column_names(df)

In [110]:
# Remove blank rows & duplicate rows
def remove_blanks_and_duplicates(df):

    # https://www.w3schools.com/python/pandas/ref_df_dropna.asp
    # https://www.w3schools.com/python/pandas/pandas_ref_dataframe.asp
    df.dropna(how='all', inplace=True) 
    df.drop_duplicates(inplace=True)
    return df

# CALL FUNCTION
cleansed_file = remove_blanks_and_duplicates(df)

In [111]:
# # Strip leading/trailing whitespace and fix casing
def normalize_strings(df):

    '''Note: df.select_dtypes returns a subset of columns
    in DataFrame based on column data type (dtype)
    SOURCE: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html'''

    for col in df.select_dtypes(include='object'):
        df[col] = df[col].astype(str).str.strip()
        #df[col] = df[col].str.replace(r'\W', '', regex=True) # remove all non-digit chars

        # regex pattern removes any characters that are not letters and/or numbers
        # works to remove replacement character (black diamond w/ white question mark in excel)
        df[col] = df[col].str.replace(r'[^a-zA-Z0-9]', '', regex=True)
    return df
# CALL FUNCTION
cleansed_file = normalize_strings(df)

In [112]:
# # Handle missing values (customize as needed) - MORE TESTING REQUIRED
def fill_missing(df):
    for col in df.columns:
        if df[col].dtype == 'float64' or df[col].dtype == 'int64':
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna("MISSING", inplace=True)
    return df
# CALL FUNCTION
cleansed_file = fill_missing(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Column names:  Index(['unnamed03', 'unnamed02', 'unnamed01', 'unnamed0', 'rank', 'peak',
       'alltimepeak', 'actualgross', 'adjustedgrossin2022dollars', 'artist',
       'tourtitle', 'years', 'shows', 'averagegross', 'ref'],
      dtype='object')
Data types:  unnamed03                       int64
unnamed02                       int64
unnamed01                       int64
unnamed0                        int64
rank                            int64
peak                           object
alltimepeak                   float64
actualgross                    object
adjustedgrossin2022dollars      int64
artist                         object
tourtitle                      object
years                           int64
shows                           int64
averagegross                    int64
ref                            object
dtype: object
match1 is:  20232024 2023 2024


In [115]:
   """
   Working regex patterns:
   (\d{4})|(\d{2}.\d{2}.\d{4}) - #20002001 OR 08/14/2025 08-14-2025 etc
   (\d{4})(\d{4})|(\d{2}).(\d{2}).(\d{4}) - Individual matching groups - research how match groups work
   """

  """


'\nWorking regex patterns:\n(\\d{4})|(\\d{2}.\\d{2}.\\d{4}) - #20002001 OR 08/14/2025 08-14-2025 etc\n(\\d{4})(\\d{4})|(\\d{2}).(\\d{2}).(\\d{4}) - Individual matching groups - research how match groups work\n'

In [116]:
# WRITE CLEANSED FILE AS OUTPUT
cleansed_file.to_csv("concert_tours_by_women_TEST_NEW.csv")
print(f"Cleaned data saved!")

Cleaned data saved!


# Run full cleaning pipeline
def clean_data(file_path, output_path):
    df = load_file(file_path)
    print(f"Loaded {len(df)} rows and {len(df.columns)} columns.")

    df = clean_column_names(df)
    df = remove_blanks_and_duplicates(df)
    df = normalize_strings(df)
    df = fill_missing(df)

    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to: {output_path}")

# Example usage
if __name__ == "__main__":
    input_file = "example_input.csv"
    output_file = "cleaned_output.csv"
    clean_data(input_file, output_file)