In [1]:
import pandas as pd
import re

def preprocess_csv(file_path):
    """Load a music CSV and return a cleaned DataFrame.

    Steps, in order:

    1. Strip whitespace from column names.
    2. Drop exact duplicate rows.
    3. Strip whitespace from string cells in object-dtype columns.
    4. Normalise the ``Lyrics`` column, if present: lowercase, then remove
       every character that is not an ASCII letter, digit, whitespace or
       apostrophe. Missing lyrics stay missing.
    5. Coerce ``key``, ``energy`` and ``tempo`` to numeric where present;
       unparseable values become NaN.
    6. Drop rows that are missing any essential field that exists in the
       file.

    Args:
        file_path: Passed straight to ``pandas.read_csv``.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable input, e.g.
        ``FileNotFoundError`` for a path that does not exist.
    """
    print("📊 Loading CSV file...")
    df = pd.read_csv(file_path)
    print("✅ CSV file loaded successfully")

    # Strip and rename columns
    print("🔧 Stripping whitespace from column names...")
    df.columns = df.columns.str.strip()
    print("✅ Column names cleaned")

    # Drop duplicates
    print("🗑️  Removing duplicate rows...")
    initial_rows = len(df)
    df.drop_duplicates(inplace=True)
    duplicates_removed = initial_rows - len(df)
    print(f"✅ Duplicates removed: {duplicates_removed} rows")

    # Strip whitespace from string fields. Only real strings are touched:
    # `.str.strip()` would turn non-string cells of a mixed column into NaN.
    print("✂️  Stripping whitespace from string fields...")
    str_cols = df.select_dtypes(include='object').columns
    df[str_cols] = df[str_cols].apply(
        lambda column: column.map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    )
    print(f"✅ Whitespace stripped from {len(str_cols)} string columns")

    # Clean lyrics: lowercase, then keep only letters, digits, whitespace
    # and apostrophes.
    if 'Lyrics' in df.columns:
        print("🎵 Cleaning lyrics data...")
        # Text is lowercased first, so the class only needs a-z.
        disallowed = re.compile(r"[^a-z0-9\s']")
        lyrics = df['Lyrics']
        missing = lyrics.isna()
        cleaned = (
            lyrics.astype(str)
            .str.lower()
            .str.replace(disallowed, '', regex=True)
        )
        # astype(str) can turn NaN into the text "nan"; restore the missing
        # marker so the essential-field check below still sees those rows.
        df['Lyrics'] = cleaned.mask(missing)
        print("✅ Lyrics cleaned (lowercase, special characters removed)")
    else:
        print("⚠️  'Lyrics' column not found - skipping lyrics cleaning")

    # Convert numeric columns
    print("🔢 Converting numeric columns...")
    numeric_conversions = 0
    for col in ['key', 'energy', 'tempo']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            numeric_conversions += 1
    print(f"✅ {numeric_conversions} columns converted to numeric")

    # Drop rows with missing essential fields
    print("🧹 Removing rows with missing essential fields...")
    rows_before = len(df)
    essential = ['singer', 'song', 'key', 'energy', 'tempo', 'Lyrics']
    # Restrict to columns that exist: dropna raises KeyError on unknown
    # labels, which would contradict the "skip if absent" handling above.
    present = [col for col in essential if col in df.columns]
    absent = [col for col in essential if col not in df.columns]
    if absent:
        print(f"⚠️  Essential columns not found - not checked: {absent}")
    if present:
        df.dropna(subset=present, inplace=True)
    rows_after = len(df)
    missing_removed = rows_before - rows_after
    print(f"✅ Rows with missing essential fields removed: {missing_removed} rows")

    return df

def main():
    """Preprocess 'test data.csv' and write 'preprocessed_test_data.csv'.

    Reads the raw file once to report its shape and columns, runs
    ``preprocess_csv`` on it, prints the resulting shape, dtypes, first
    rows and numeric summary statistics, and saves the processed data.
    Errors are reported on stdout rather than raised.
    """
    # File path
    input_file = 'test data.csv'
    output_file = 'preprocessed_test_data.csv'

    try:
        print("Starting preprocessing...")

        # Load and check original data
        original_df = pd.read_csv(input_file)
        print(f"Original dataset shape: {original_df.shape}")
        print(f"Original columns: {list(original_df.columns)}")

        # Preprocess the data
        processed_df = preprocess_csv(input_file)

        # Display results
        print(f"\nProcessed dataset shape: {processed_df.shape}")
        print(f"Rows removed: {original_df.shape[0] - processed_df.shape[0]}")

        # Show data types after preprocessing
        print("\nData types after preprocessing:")
        print(processed_df.dtypes)

        # Show first few rows
        print("\nFirst 5 rows of processed data:")
        print(processed_df.head())

        # Save processed data (written exactly once)
        processed_df.to_csv(output_file, index=False)
        print(f"\nProcessed data saved to: {output_file}")

        # Show summary statistics for numeric columns; 'number' covers
        # every numeric dtype, not only float64/int64.
        numeric_cols = processed_df.select_dtypes(include='number').columns
        if len(numeric_cols) > 0:
            print("\nSummary statistics for numeric columns:")
            print(processed_df[numeric_cols].describe())

    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found. Please make sure the file exists in the current directory.")
    except Exception as e:
        # Top-level boundary of the script: report any other failure
        # instead of showing a traceback.
        print(f"Error during preprocessing: {e}")

# Run the pipeline only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()

Starting preprocessing...
Original dataset shape: (100, 6)
Original columns: ['singer', 'song', 'energy', 'key', 'tempo', 'Lyrics']
📊 Loading CSV file...
✅ CSV file loaded successfully
🔧 Stripping whitespace from column names...
✅ Column names cleaned
🗑️  Removing duplicate rows...
✅ Duplicates removed: 0 rows
✂️  Stripping whitespace from string fields...
✅ Whitespace stripped from 3 string columns
🎵 Cleaning lyrics data...
✅ Lyrics cleaned (lowercase, special characters removed)
🔢 Converting numeric columns...
✅ 3 columns converted to numeric
🧹 Removing rows with missing essential fields...
✅ Rows with missing essential fields removed: 0 rows

Processed dataset shape: (100, 6)
Rows removed: 0

Data types after preprocessing:
singer     object
song       object
energy    float64
key         int64
tempo       int64
Lyrics     object
dtype: object

First 5 rows of processed data:
         singer                  song  energy  key  tempo  \
0  Taylor Swift          Shake It Off    0.85  