# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# 1. LOAD THE DATA AND INITIAL CHECK
print("=== STEP 1: LOADING AND INITIAL CHECK ===")
# Read the raw dataset; later steps use its 'age', 'gender' and 'genre' columns.
df = pd.read_csv('music.csv')

# Display initial information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() would emit a stray "None" line.
print("\nInitial Data Info:")
df.info()

print("\nFirst few rows of data:")
print(df.head())

# 2. CHECK FOR MISSING VALUES
print("\n=== STEP 2: CHECKING FOR MISSING VALUES ===")
# Count missing values in each column
print("\nMissing values in each column:")
print(df.isnull().sum())

# 3. CHECK FOR DUPLICATES
print("\n=== STEP 3: CHECKING FOR DUPLICATES ===")
# Count fully identical rows in the raw data (before any normalization)
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# 4. CHECK DATA TYPES AND CONSISTENCY
print("\n=== STEP 4: CHECKING DATA TYPES AND VALUES ===")
print("\nData types of columns:")
print(df.dtypes)

# List the distinct values per column to spot typos or unexpected codes
print("\nUnique values in each column:")
for column in df.columns:
    print(f"\n{column}:", df[column].unique())

# 5. CLEAN THE DATA
print("\n=== STEP 5: CLEANING THE DATA ===")

# Work on a copy so the raw dataframe stays available for comparison
df_clean = df.copy()

# 5.1 Handle missing values (if any) by dropping incomplete rows
df_clean = df_clean.dropna()

# 5.2 Fix data types and values
# Ensure age is integer
df_clean['age'] = df_clean['age'].astype(int)

# Ensure gender is properly encoded
df_clean['gender'] = df_clean['gender'].astype(int)

# Standardize genre names: trim surrounding whitespace, then title-case, so
# variants such as " jazz" and "Jazz" collapse into a single category.
df_clean['genre'] = df_clean['genre'].str.strip().str.title()

# 5.3 Remove duplicates (if any).
# Done after normalization: rows that differed only by case, whitespace or
# float/int formatting are identical now and would otherwise be kept.
df_clean = df_clean.drop_duplicates()

# 5.4 Check for outliers in age
print("\nChecking for age outliers:")
print("\nAge statistics:")
print(df_clean['age'].describe())

# 6. VALIDATE CLEANED DATA
print("\n=== STEP 6: VALIDATING CLEANED DATA ===")
print("\nCleaned Data Info:")
# info() prints on its own (see step 1); no print() wrapper needed
df_clean.info()

print("\nSummary of cleaned data:")
print(df_clean.describe())

# 7. SAVE CLEANED DATA
# Saving is disabled; uncomment the next line to write the cleaned data to disk.
# df_clean.to_csv('music_cleaned.csv', index=False)
print("\n=== STEP 7: CLEANING COMPLETE ===")
print("Data cleaning process completed!")

# Example of data quality checks
def check_data_quality(df, *, min_age=0, max_age=120, valid_genders=(0, 1)):
    """Run basic data quality checks on a dataframe.

    The dataframe must provide the columns 'age', 'gender' and 'genre'.

    Args:
        df: DataFrame to inspect; it is not modified.
        min_age: Smallest age accepted as reasonable (inclusive).
        max_age: Largest age accepted as reasonable (inclusive).
        valid_genders: Iterable of the gender codes considered valid.

    Returns:
        A list of human-readable issue descriptions, in the order
        age, gender, genre. The list is empty when no check fails.
    """
    quality_issues = []

    # Check age range. between() is False for NaN, so a missing age is
    # reported as well; min()/max() would skip NaN and let it pass silently.
    if not df['age'].between(min_age, max_age).all():
        quality_issues.append("Age values outside reasonable range")

    # Check gender values (isin() is False for NaN, so missing is invalid too)
    if not df['gender'].isin(list(valid_genders)).all():
        quality_issues.append("Invalid gender values")

    # Check for empty genre values: either missing or whitespace-only.
    # Non-null values are cast to str first so the .str accessor cannot
    # raise on a column that is not string-typed.
    genre = df['genre']
    has_missing = genre.isna().any()
    has_blank = genre.dropna().astype(str).str.strip().eq('').any()
    if has_missing or has_blank:
        quality_issues.append("Empty genre values")

    return quality_issues

# Run the quality checks against the cleaned frame and report the outcome.
print("\nFinal Data Quality Check:")
issues = check_data_quality(df_clean)
if not issues:
    # An empty list means every check passed
    print("No major quality issues found!")
else:
    print("Issues found:", issues)