In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Directory that holds the CSV inputs; a relative path, so it is resolved
# against the kernel's current working directory.
data_dir = Path('../data')

# Path.glob yields entries in whatever order the filesystem returns them,
# which is not stable across machines. Sort so the listing, and any later
# positional access such as csv_files[0], is reproducible.
csv_files = sorted(data_dir.glob('*.csv'))

print(f"Found {len(csv_files)} CSV files:")
for file in csv_files:
    print(f"  - {file.name}")

In [None]:

# Preview every discovered CSV without loading it fully: only the first
# `preview_rows` rows of each file are parsed.
preview_rows = 5

for file in csv_files:
    print(f"\n{'='*60}")
    print(f"File: {file.name}")
    print(f"{'='*60}")

    try:
        df_sample = pd.read_csv(file, nrows=preview_rows)

        # The row count here is capped by nrows, so label it as a preview
        # rather than as the shape of the whole file.
        print(f"Preview shape (at most {preview_rows} rows read): {df_sample.shape}")
        print("\nColumns:")
        for position, col in enumerate(df_sample.columns, start=1):
            print(f"  {position:2d}. {col}")

        print("\nData types:")
        print(df_sample.dtypes)

        print("\nFirst few rows:")
        print(df_sample.head())

    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Error reading {file.name}: {e}")

In [None]:
# Check for missing values in one file as an example
if not csv_files:
    # Fail with an actionable message instead of a bare IndexError from csv_files[0].
    raise FileNotFoundError("No CSV files were found, so there is no sample file to inspect.")

sample_file = csv_files[0]
print(f"Checking missing values in {sample_file.name}")

# Only the first 1000 rows are loaded, so the counts below describe that
# sample and not necessarily the whole file.
df_sample = pd.read_csv(sample_file, nrows=1000)

print("\nMissing values per column:")
missing_counts = df_sample.isnull().sum()
columns_with_missing = missing_counts[missing_counts > 0]

if columns_with_missing.empty:
    # Skip printing the empty Series; its repr is just noise next to the message.
    print("\nNo missing values found in the sample.")
else:
    print(columns_with_missing)

In [None]:
# List the distinct values of each object-dtype column in the sample,
# truncating the listing to the first 10 when there are more than that.
print("\nUnique values in categorical columns:")
object_columns = df_sample.select_dtypes(include=['object']).columns
for col in object_columns:
    unique_vals = df_sample[col].unique()
    n_unique = len(unique_vals)
    print(f"\n{col}: {n_unique} unique values")
    if n_unique > 10:
        print(f"  First 10 values: {unique_vals[:10]}")
    else:
        print(f"  Values: {unique_vals}")

In [None]:
# Basic statistics for numerical columns
print("\nBasic statistics for numerical columns:")
numerical_cols = df_sample.select_dtypes(include=[np.number]).columns

if numerical_cols.empty:
    # describe() raises ValueError on a frame with no columns, so report the
    # empty selection explicitly instead of crashing the cell.
    print("No numerical columns found in the sample.")
else:
    print(df_sample[numerical_cols].describe())