# Example Data Science Analysis

This notebook demonstrates a basic data science workflow using the YGfews package.

In [None]:
# Import required libraries
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ygfews import load_data, save_data, clean_data
from ygfews import plot_distribution, plot_correlation_matrix

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme and a
# 10x6 inch default figure size for every matplotlib figure created below.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Create Sample Data

Let's create a sample dataset for demonstration purposes.

In [None]:
# Seed NumPy's global random state so every run draws the same sample
np.random.seed(42)

# Number of synthetic rows to generate
n_samples = 1000

# Fill the columns one at a time. All draws share the global random state,
# so the order of the calls below determines the generated values.
data = {}
data['age'] = np.random.randint(18, 70, n_samples)  # integers in [18, 70)
data['income'] = np.clip(
    np.random.normal(50000, 20000, n_samples),  # mean 50000, std 20000
    20000,   # lower bound after clipping
    200000,  # upper bound after clipping
)
data['education_years'] = np.random.randint(8, 20, n_samples)    # integers in [8, 20)
data['experience_years'] = np.random.randint(0, 40, n_samples)   # integers in [0, 40)
data['satisfaction_score'] = np.random.uniform(1, 10, n_samples)  # floats in [1, 10)

# Assemble the frame and show its first rows as the cell output
df = pd.DataFrame(data)
df.head()

## 2. Data Exploration

Let's explore the basic statistics of our dataset.

In [None]:
# Text summary: frame dimensions first, then the dtype of each column
print("Dataset Shape:", df.shape)
print("\nColumn Types:", df.dtypes, "\nBasic Statistics:", sep="\n")

# Keep describe() as the final expression so the notebook renders it as a table
df.describe()

## 3. Data Cleaning

Use the `clean_data` function to clean a copy of the dataset into which we deliberately inject missing values and duplicate rows.

In [None]:
# Build a deliberately "dirty" copy of the sample frame to exercise clean_data.
#
# 'age' is generated as an integer column, and NaN cannot be stored in an
# integer dtype. Cast it to float64 explicitly before injecting NaN: relying on
# the implicit upcast done by `.loc[...] = np.nan` emits a FutureWarning on
# pandas >= 2.1 and is scheduled to raise in a later release.
# astype() returns a new frame, so the original `df` is left untouched.
df_dirty = df.astype({'age': 'float64'})

# .loc slicing is label-based and inclusive, so this blanks rows 0 through 5 (6 values)
df_dirty.loc[0:5, 'age'] = np.nan

# Append the first 10 rows again to create exact duplicate rows
df_dirty = pd.concat([df_dirty, df_dirty.iloc[0:10]], ignore_index=True)

print("Before cleaning:")
print(f"Shape: {df_dirty.shape}")
print(f"Missing values: {df_dirty.isnull().sum().sum()}")

# Clean the data (clean_data comes from the ygfews package; its exact rules
# are not shown in this notebook, so compare the before/after numbers below)
df_clean = clean_data(df_dirty)

print("\nAfter cleaning:")
print(f"Shape: {df_clean.shape}")
print(f"Missing values: {df_clean.isnull().sum().sum()}")

## 4. Visualization

Create visualizations to understand the data distribution.

In [None]:
# Distribution of the income column from the original (uncleaned) sample frame,
# drawn with 40 bins; the object returned by plot_distribution is kept in `fig`
fig = plot_distribution(
    df['income'],
    title='Income Distribution',
    bins=40,
)
plt.show()

In [None]:
# Correlation matrix of the original sample frame, using the helper's defaults;
# the object returned by plot_correlation_matrix is kept in `fig`
fig = plot_correlation_matrix(
    df,
)
plt.show()

## 5. Save Processed Data

Save the cleaned data for future use.

In [None]:
# Write the cleaned frame to a CSV file under the processed-data folder.
# index=False is passed through to save_data (presumably to omit the row
# index from the file; confirm against the ygfews implementation).
save_data(
    df_clean,
    '../data/processed/cleaned_data.csv',
    index=False,
)
print("Data saved successfully!")

## 6. Basic Machine Learning Example

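Let's fit a simple linear regression model that predicts income from age, education years, and experience years. Because the sample columns were generated independently at random, expect an R² close to zero (it may even be slightly negative on the test split).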

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Features and target are taken from the original sample frame `df`:
# three numeric predictors, with income as the value to predict
X = df[['age', 'education_years', 'experience_years']]
y = df['income']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression on the training split only
model = LinearRegression()
model.fit(X_train, y_train)

# Predict income for the held-out rows
y_pred = model.predict(X_test)

# Report goodness of fit (R²) and root-mean-squared error on the held-out rows
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")

## Conclusion

This notebook demonstrated:
1. Creating and exploring sample data
2. Data cleaning using utility functions
3. Creating visualizations
4. Saving processed data
5. Building a simple machine learning model