# HW3 — Python Fundamentals

In [1]:
import os
import time

import numpy as np

# Compare a Python-level loop against NumPy's vectorized power operator.
#
# The dtype is pinned so the results are the same on every platform: with
# NumPy < 2 on Windows the default integer is 32-bit, and 999_999**2 does not
# fit in 32 bits, so the squares would overflow.
arr = np.arange(1_000_000, dtype=np.int64)

# loop version
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and may jump if the system
# clock is adjusted while the cell is running.
start = time.perf_counter()
squares_loop = [x**2 for x in arr]
print("Loop time:", time.perf_counter() - start)

# vectorized version
start = time.perf_counter()
squares_vec = arr**2
print("Vectorized time:", time.perf_counter() - start)

print("First 5 results:", squares_vec[:5])


Loop time: 0.04809975624084473
Vectorized time: 0.0012989044189453125
First 5 results: [ 0  1  4  9 16]


In [2]:
import pandas as pd

# Load the starter dataset (path is relative to the notebook's directory).
df = pd.read_csv("../data/starter_data.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes
None
  category  value    date
0        A     10  8/1/25
1        B     15  8/2/25
2        A     12  8/3/25
3        B     18  8/4/25
4        C     25  8/5/25


In [3]:
# Overview of the numeric columns, printed under a section banner.
summary_stats = df.describe()
print("=== Summary statistics ===", summary_stats, sep="\n")

# Per-category mean of the "value" column.
value_by_category = df.groupby("category")["value"]
grouped = value_by_category.mean()
print("\n=== Average value by category ===", grouped, sep="\n")


=== Summary statistics ===
           value
count  10.000000
mean   17.600000
std     7.381659
min    10.000000
25%    12.250000
50%    14.500000
75%    23.250000
max    30.000000

=== Average value by category ===
category
A    11.500000
B    15.666667
C    27.666667
Name: value, dtype: float64


In [4]:
import matplotlib.pyplot as plt

# Mean of "value" per category, drawn as a bar chart and written to disk.
grouped = df.groupby("category")["value"].mean()

# Use an explicit figure/axes pair rather than pyplot's implicit "current
# figure", so exactly this figure is laid out, saved and closed.
fig, ax = plt.subplots()
grouped.plot(kind="bar", title="Average Value by Category", ax=ax)
ax.set_ylabel("Average value")

# savefig() does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
plot_path = "../data/processed/grouped_plot.png"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

fig.tight_layout()
fig.savefig(plot_path)
plt.close(fig)  # the figure is only needed on disk; free it

print("Plot saved to data/processed/grouped_plot.png")


Matplotlib is building the font cache; this may take a moment.


Plot saved to data/processed/grouped_plot.png


In [5]:
# Group by category and calculate summary stats (mean, sum and row count of
# "value" for each category).
grouped = df.groupby("category")["value"].agg(["mean", "sum", "count"])
print("Grouped summary:")
print(grouped)

# Save grouped results. to_csv() does not create missing directories, so make
# sure the output folder exists first. The index (category) is written as the
# first CSV column because to_csv() keeps the index by default.
summary_path = "../data/processed/grouped_summary.csv"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
grouped.to_csv(summary_path)
print("Grouped summary saved to data/processed/grouped_summary.csv")


Grouped summary:
               mean  sum  count
category                       
A         11.500000   46      4
B         15.666667   47      3
C         27.666667   83      3
Grouped summary saved to data/processed/grouped_summary.csv


In [6]:
import os

def save_df(df, name, folder="../data/processed/", index=False):
    """
    Save a DataFrame to both CSV and JSON with a given base filename.

    The files are written as <folder>/<name>.csv and <folder>/<name>.json.
    The folder is created if it does not exist yet. JSON is written with
    orient="records" (a list of one object per row).

    Parameters:
    df (pd.DataFrame): The DataFrame to save
    name (str): Base filename (no extension)
    folder (str): Folder to save into (default: ../data/processed/)
    index (bool): If False (default), the index is not written to either
        file. If True, the index is first turned into ordinary column(s)
        via reset_index() so that it appears in both the CSV and the JSON
        output. Use this for frames whose index carries data, e.g. the
        result of a groupby.

    Returns:
    None. A confirmation line with both paths is printed.
    """
    os.makedirs(folder, exist_ok=True)  # make sure folder exists
    csv_path = os.path.join(folder, f"{name}.csv")
    json_path = os.path.join(folder, f"{name}.json")

    # orient="records" never emits the index, so the only way to keep the
    # index in BOTH formats is to promote it to regular columns up front.
    out = df.reset_index() if index else df

    out.to_csv(csv_path, index=False)
    out.to_json(json_path, orient="records", indent=2)

    print(f"Saved {csv_path} and {json_path}")


# Example usage:
# `grouped` is indexed by category and save_df does not write the index, so
# turn the index into a regular column first; otherwise the saved files would
# contain the statistics without the category each row belongs to.
save_df(grouped.reset_index(), "grouped_summary_v2")


Saved ../data/processed/grouped_summary_v2.csv and ../data/processed/grouped_summary_v2.json
