# %%
import numpy as np

# Build a small 1-D integer array and show that arithmetic is applied per element
arr = np.array([1, 2, 3, 4, 5])
arr_squared = arr ** 2
arr_log = np.log(arr)
print("Original array:", arr)
print("Squared array:", arr_squared)
print("Logarithm of array:", arr_log)

# Compare loop vs vectorized operations
def square_with_loop(array):
    """Square every element of ``array`` one at a time in pure Python.

    Args:
        array: Any iterable whose elements support ``** 2``.

    Returns:
        list: The squared elements, in iteration order.
    """
    return [element ** 2 for element in array]

# Time both implementations on the same input. %timeit is an IPython line
# magic, so these two lines only run inside an IPython/Jupyter session.
# NOTE(review): `arr` holds only five elements, so fixed per-call overhead may
# dominate both timings — consider a larger array to make the gap visible.
print("\nTiming comparison:")
%timeit square_with_loop(arr)  # Loop method
%timeit arr ** 2              # Vectorized method

# %% [markdown]
# ## 2. Dataset Loading

# %%
import pandas as pd

# Load the starter dataset (path is relative to the notebook's working directory)
df = pd.read_csv('data/starter_data.csv')

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None".
print("\nData Info:")
df.info()

print("\nFirst 5 rows:")
display(df.head())

# %% [markdown]
# ## 3. Summary Statistics

# %%
# Overall descriptive statistics
print("Descriptive Statistics:")
display(df.describe())

# Per-category aggregates of the `value` column
print("\nGrouped Statistics (by category):")
grouped = (
    df
    .groupby('category')['value']
    .agg(['mean', 'median', 'std', 'count'])
)
display(grouped)

# %% [markdown]
# ## 4. Save Outputs

# %%
import os

# Ensure the output folder is available before anything is written to it
os.makedirs('data/processed', exist_ok=True)

# Persist the overall description and the per-category aggregates
overall_summary = df.describe()
overall_summary.to_csv('data/processed/summary.csv')
grouped.to_json('data/processed/grouped_stats.json')

# Bonus: bar chart of the mean value per category, saved to disk and then shown
import matplotlib.pyplot as plt
category_means = df.groupby('category')['value'].mean()
category_means.plot(kind='bar')
plt.title('Average Value by Category')
plt.ylabel('Value')
plt.savefig('data/processed/value_by_category.png')
plt.show()

# %% [markdown]
# ## 5. Reusable Functions

# %%
def get_summary_stats(dataframe, group_col, value_col, output_dir='data/processed'):
    """
    Generate and save summary statistics for a DataFrame.

    Groups ``dataframe`` by ``group_col``, computes the mean, median, standard
    deviation and count of ``value_col`` within each group, and writes the
    result to ``<output_dir>/<group_col>_summary.csv``.

    Args:
        dataframe (pd.DataFrame): Input data
        group_col (str): Column to group by
        value_col (str): Numeric column to analyze
        output_dir (str): Folder the CSV is written to; created if missing.
            Defaults to 'data/processed'.

    Returns:
        pd.DataFrame: Grouped statistics, indexed by ``group_col`` with the
        columns 'mean', 'median', 'std' and 'count'.

    Raises:
        KeyError: If ``group_col`` or ``value_col`` is not a column of
            ``dataframe`` (raised by pandas).
    """
    stats = dataframe.groupby(group_col)[value_col].agg(['mean', 'median', 'std', 'count'])
    # Do not rely on an earlier cell having created the folder: make sure it
    # exists so the function also works on a fresh kernel or from a module.
    os.makedirs(output_dir, exist_ok=True)
    stats.to_csv(os.path.join(output_dir, f'{group_col}_summary.csv'))
    return stats

# Smoke-test the helper on the loaded dataset
category_stats = get_summary_stats(dataframe=df, group_col='category', value_col='value')
display(category_stats)

# %% [markdown]
# ### Bonus: Move Function to utils.py

# %%
# Create src directory if it doesn't exist
os.makedirs('src', exist_ok=True)

# Write function to utils.py
# NOTE: mode 'w' overwrites any existing src/utils.py each time this cell runs.
with open('src/utils.py', 'w') as f:
    f.write("""import pandas as pd

def get_summary_stats(dataframe, group_col, value_col):
    \"\"\"Generate and save summary statistics for a DataFrame.\"\"\"
    stats = dataframe.groupby(group_col)[value_col].agg(['mean', 'median', 'std', 'count'])
    stats.to_csv(f'data/processed/{group_col}_summary.csv')
    return stats
""")

# The module file (and possibly the src/ folder) was created after the kernel
# started, so the import system's cached directory listings may not include it
# yet; invalidate them before importing.
import importlib
importlib.invalidate_caches()

# Import and test. Reload so that re-running this cell executes the file just
# written rather than a copy of src.utils already cached in sys.modules.
import src.utils
importlib.reload(src.utils)
from src.utils import get_summary_stats
get_summary_stats(df, 'category', 'value')