# Step 5: Data Wrangling
This notebook covers data cleaning, handling of missing values, outlier detection, optional subsampling, and optional exploratory data analysis for the 'Agentic Coding Mentor' capstone project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load and Inspect the Dataset

In [None]:
# Read the raw CSV into `df`. The path below is a placeholder; point it at
# your actual file or source before running.
raw_csv_path = "data/raw/synthetic_python_errors.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell's output.
df.head()

In [None]:
# Structural overview of the frame; info() prints its report rather than
# returning a value.
df.info()

# Summary statistics, shown as the cell's output because it is the last
# expression in the cell.
summary_stats = df.describe()
summary_stats

## Data Cleaning

In [None]:
# Clean the loaded frame in three ordered steps: drop rows missing a critical
# field, trim whitespace on the text columns, then remove duplicate rows.
# `df` is rebound (no inplace=True) so each step's result is explicit.

# 1. Drop rows with missing critical fields.
df = df.dropna(subset=['raw_code', 'fixed_code'])

# 2. Trim surrounding whitespace on the text columns that are present.
#    'explanation' is optional: when it is absent it is skipped entirely
#    instead of being created as a column filled with None.
text_columns = [
    col for col in ('raw_code', 'fixed_code', 'explanation') if col in df.columns
]
df = df.assign(**{col: df[col].str.strip() for col in text_columns})

# 3. Remove duplicates only after trimming, so rows that differ solely by
#    leading/trailing whitespace are recognised as the same row.
df = df.drop_duplicates()

## Outlier Detection and Handling

In [None]:
# Upper bound (exclusive) on snippet size, measured in whitespace-separated
# tokens. Arbitrary threshold for this project; tune here.
MAX_CODE_LENGTH = 1000

# Add a code length column: the number of whitespace-separated tokens in
# raw_code. Each value is passed through str() first, so a non-string entry
# is measured by its text form rather than raising.
df = df.assign(
    code_length=df['raw_code'].apply(lambda snippet: len(str(snippet).split()))
)

# View statistics before filtering. display() is used explicitly because a
# bare expression in the middle of a cell produces no output.
display(df['code_length'].describe())

# Remove overly long code snippets. The explicit copy makes the result an
# independent frame, so later column assignments do not act on a slice.
df = df[df['code_length'] < MAX_CODE_LENGTH].copy()

## Subsample the Dataset (if needed)

In [None]:
# Cap the working set at `max_rows` rows. A seeded random sample is drawn only
# when the frame is larger than the cap; otherwise df_sample is simply another
# name for df (the same object, not a copy).
max_rows = 10000
if len(df) > max_rows:
    df_sample = df.sample(n=max_rows, random_state=42)
else:
    df_sample = df

## Exploratory Data Analysis (Optional)

In [None]:
# Bar chart of row counts per error type. The whole plot is skipped when the
# dataset has no 'error_type' column.
if 'error_type' in df.columns:
    ax = sns.countplot(data=df, x='error_type')
    ax.set_title("Error Type Distribution")
    # Tilt the category labels so longer names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.figure.tight_layout()
    plt.show()