In [1]:
!pip install pandas




In [20]:
import glob
import os

import pandas as pd

def clean_csv(file_path):
    """Extract the unemployment-rate rows of one CSV file into a new CSV file.

    The file is read with pandas and its column names are normalised
    (lower-cased, surrounding whitespace stripped). Rows whose
    'labour force characteristics' or 'statistics' value contains
    "unemployment rate" (case-insensitive substring match) are kept, and the
    columns 'ref_date', 'geo', 'age group', 'gender' and 'value' are written to
    ``<name>_cleaned<ext>`` next to the input file.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the CSV file to clean.

    Returns
    -------
    str or None
        Path of the cleaned file that was written, or None when the file was
        skipped because one of the columns used here is missing.
    """
    file_path = os.fspath(file_path)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Print column names to check for discrepancies
    print(f"Processing file: {file_path}")
    print(f"Columns: {df.columns.tolist()}")

    # Normalize column names to lowercase and strip leading/trailing whitespace.
    # str() guards against non-string headers (e.g. a purely numeric header).
    df.columns = [str(col).lower().strip() for col in df.columns]

    filter_columns = ['labour force characteristics', 'statistics']
    columns_needed = ['ref_date', 'geo', 'age group', 'gender', 'value']

    # Validate every column used below up front, so a file with a different
    # layout is skipped instead of raising KeyError at the column selection.
    missing = [col for col in filter_columns + columns_needed if col not in df.columns]
    if missing:
        print(f"Skipping file due to missing columns: {file_path}")
        print(f"Missing columns: {missing}")
        return None

    # Print unique values in the relevant columns to debug
    print(f"Unique values in 'labour force characteristics' column: {df['labour force characteristics'].unique()}")
    print(f"Unique values in 'statistics' column: {df['statistics'].unique()}")

    # Keep rows where either column CONTAINS 'unemployment rate'
    # (case-insensitive). astype(str) keeps the .str accessor usable when a
    # column was parsed as all-NaN floats; na=False treats missing as no match.
    def mentions_unemployment_rate(column):
        return df[column].astype(str).str.contains('unemployment rate', case=False, na=False)

    keep = mentions_unemployment_rate(filter_columns[0]) | mentions_unemployment_rate(filter_columns[1])
    df_filtered = df[keep]

    # Print a preview of the filtered data to debug (head only, to avoid
    # flooding the notebook output with the full frame).
    print(f"Filtered data for {file_path}:")
    print(df_filtered.head())

    # Select the required columns
    df_cleaned = df_filtered[columns_needed]

    # Build the output name from the split extension: str.replace('.csv', ...)
    # would also rewrite '.csv' inside directory names and would leave a '.CSV'
    # path unchanged, overwriting the source file.
    root, ext = os.path.splitext(file_path)
    cleaned_file_path = f"{root}_cleaned{ext}"
    df_cleaned.to_csv(cleaned_file_path, index=False)

    return cleaned_file_path

# Path to the directory containing the CSV files.
# A raw string already keeps backslashes literal, so they must not be doubled
# (doubling them puts two backslashes between every path component).
csv_directory = r"C:\Users\bibis\Desktop\Dataset for capstone unemployment"

# Get the raw CSV files in a stable (sorted) order. Outputs written by an
# earlier run ('*_cleaned.csv' and the merged file) are excluded so that
# re-running this cell only processes the original inputs.
generated_suffixes = ('_cleaned.csv', 'merged_cleaned_data.csv')
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(csv_directory, '*.csv'))
    if not path.lower().endswith(generated_suffixes)
)

# Process each CSV file; clean_csv returns None for files it skips.
cleaned_files = []
for csv_file in csv_files:
    cleaned_file = clean_csv(csv_file)
    if cleaned_file:
        cleaned_files.append(cleaned_file)

print(f'Cleaned files: {cleaned_files}')


Processing file: C:\\Users\\bibis\\Desktop\\Dataset for capstone unemployment\April 2021.csv
Columns: ['REF_DATE', 'GEO', 'DGUID', 'Labour force characteristics', 'Gender', 'Age group', 'Statistics', 'Data type', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']
Unique values in 'labour force characteristics' column: ['Population' 'Labour force' 'Employment' 'Full-time employment'
 'Part-time employment' 'Unemployment' 'Unemployment rate'
 'Participation rate' 'Employment rate']
Unique values in 'statistics' column: ['Estimate']
Filtered data for C:\\Users\\bibis\\Desktop\\Dataset for capstone unemployment\April 2021.csv:
   ref_date                        geo           dguid  \
6   2021-04                     Canada  2021A000011124   
15  2021-04  Newfoundland and Labrador     2021A000210   
24  2021-04       Prince Edward Island     2021A000211   
33  2021-04                Nova Scotia     2021A000212   
42  

In [21]:
import glob
import os

import pandas as pd

# Path to the directory containing the cleaned CSV files.
# A raw string already keeps backslashes literal, so they must not be doubled.
csv_directory = r"C:\Users\bibis\Desktop\Dataset for capstone unemployment"

# Get all cleaned CSV files in a stable (sorted) order so the row order of the
# merged file is the same on every run.
cleaned_csv_files = sorted(glob.glob(os.path.join(csv_directory, '*_cleaned.csv')))

# Fail with a clear message instead of pandas' "No objects to concatenate"
# when the cleaning step has not produced any files yet.
if not cleaned_csv_files:
    raise FileNotFoundError(f'No *_cleaned.csv files found in: {csv_directory}')

# Read each cleaned CSV file into its own DataFrame
data_frames = [pd.read_csv(cleaned_csv) for cleaned_csv in cleaned_csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
merged_df = pd.concat(data_frames, ignore_index=True)

# Save the merged DataFrame to a new CSV file. The name does not end in
# '_cleaned.csv', so the glob above will not pick it up on a re-run.
merged_file_path = os.path.join(csv_directory, 'merged_cleaned_data.csv')
merged_df.to_csv(merged_file_path, index=False)

print(f'Merged file saved as: {merged_file_path}')


Merged file saved as: C:\\Users\\bibis\\Desktop\\Dataset for capstone unemployment\merged_cleaned_data.csv
