In [4]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global random generator so every draw below is repeatable
np.random.seed(seed=42)


In [6]:
# Number of synthetic people to simulate
sample_size = 15

# Ages: integer draws from [18, 60) -- randint excludes the upper bound --
# stored with pandas' nullable "Int64" dtype so a missing age can be pd.NA
ages = pd.Series(np.random.randint(18, 60, sample_size), dtype="Int64")

# Incomes: integer draws from [20000, 100000), cast to float so NaN can mark gaps
income = np.random.randint(20000, 100000, sample_size).astype(float)

# Blank out a few entries to simulate missing data
income[[2, 7]] = np.nan
ages.iloc[4] = pd.NA  # nullable integers use pd.NA rather than NaN

# Assemble both columns into a single table
data = pd.DataFrame({"Age": ages, "Income": income})

print("Synthetic Data:\n", data)


Synthetic Data:
      Age   Income
0     56  87221.0
1     46  84820.0
2     32      NaN
3     25  79735.0
4   <NA>  82955.0
5     56  84925.0
6     36  87969.0
7     40      NaN
8     28  73707.0
9     28  48693.0
10    41  91932.0
11    53  45658.0
12    57  38431.0
13    41  22747.0
14    20  79150.0


# Problem 1 - Mean, Median, Age-Weighted Mean

In [8]:
# Plain mean and median of Income; the nan-aware reducers skip missing entries
mean_income = np.nanmean(data["Income"])
median_income = np.nanmedian(data["Income"])

# Age-weighted mean: only rows where BOTH Age and Income are present can
# contribute. Restricting dropna to these two columns keeps the result
# independent of any extra columns that other cells add to `data`.
df2 = data.dropna(subset=["Age", "Income"])

# Age-based weights, cast from the nullable dtype to plain float so NumPy
# receives an ordinary float64 array. np.average divides by the sum of the
# weights itself; they are normalised here so they can be inspected directly.
weights = (df2["Age"] / df2["Age"].sum()).astype(float)
weighted_mean = np.average(df2["Income"], weights=weights)

print("Problem 1:")
print("Mean Income:", mean_income)
print("Median Income:", median_income)
print("Age Weighted Mean Income:", weighted_mean)

# Explanation:
# A weighted mean is preferable when some values are more "important" than others.
# In this case, incomes of older people are given more weight, which is
# appropriate only if Age is meant to reflect importance.
# Note: the weighted mean uses only rows with both Age and Income present,
# whereas the plain mean and median use every non-missing Income.


Problem 1:
Mean Income: 69841.76923076923
Median Income: 79735.0
Age Weighted Mean Income: 67812.39219712529


# Problem 2 - Standardize Income & Detect Outliers

In [10]:
# Problem 2: Standardize Income & Detect Outliers

# Mean and (population, ddof=0) standard deviation of Income; the nan-aware
# NumPy reducers skip the missing entries
income_col = data["Income"]
mean = np.nanmean(income_col)
std = np.nanstd(income_col)

# Z-score: distance from the mean, measured in standard deviations
data["Income_Z"] = income_col.sub(mean).div(std)

# Keep the rows whose absolute Z-score exceeds 3
is_outlier = data["Income_Z"].abs().gt(3)
outliers = data.loc[is_outlier]

print("Problem 2:")
print("Number of Outliers:", len(outliers))
print(outliers)

# Explanation:
# - Z-score standardization rescales Income to mean 0 and standard deviation 1.
# - A value counts as an outlier when it lies more than 3 standard deviations
#   from the mean.
# - Rows with missing Income stay in the table: their Z-score is NaN, and a NaN
#   comparison is False, so they are never flagged as outliers.



Problem 2:
Number of Outliers: 0
Empty DataFrame
Columns: [Age, Income, Income_Z]
Index: []


# Problem 3 - Age Binning and Group Statistics

In [12]:
# Age bin edges and their labels. With right=False each bin is closed on the
# left and open on the right: [18, 25), [25, 35), [35, 45), [45, 60)
bins = [18, 25, 35, 45, 60]
labels = ["18-25", "25-35", "35-45", "45-60"]

# Assign each person to an age bin (a missing Age yields a missing bin)
data["AgeBin"] = pd.cut(data["Age"], bins=bins, labels=labels, right=False)

# Group by age bin and summarise Income. `observed=False` is passed explicitly:
# AgeBin is categorical, so this keeps every bin in the table even if empty and
# avoids relying on a pandas default that is being changed.
result = data.groupby("AgeBin", observed=False).agg(
    Count=("Income", "count"),
    Mean=("Income", "mean"),
    Median=("Income", "median")
).sort_index()  # Sort by age bin

print("\nProblem 3 - Age Bin Statistics:\n", result)

# Explanation:
# - Count: number of non-missing Income values in each age bin (a person whose
#   Income is NaN is in the bin but is not counted)
# - Mean: average of the non-missing incomes in that bin
# - Median: middle value of the non-missing incomes in that bin
# - Rows with a missing Age belong to no bin and are excluded from the table



Problem 3 - Age Bin Statistics:
         Count          Mean   Median
AgeBin                              
18-25       1  79150.000000  79150.0
25-35       3  67378.333333  73707.0
35-45       3  67549.333333  87969.0
45-60       5  68211.000000  84820.0


# Problem 4 - Array Operations

In [30]:
# Create a 2D array with 2 rows and 3 columns
arr = np.array([[1, 2, 3],
                [4, 5, 6]])

print("Problem 4 - Array Operations:")

# Shape and size
print("Shape:", arr.shape)   # (rows, columns)
print("Size:", arr.size)     # total elements

# Transpose and flatten
print("Transpose:\n", arr.T)
print("Flatten:", arr.flatten())

# Negative indexing: -1 selects the last row
print("Last row:", arr[-1])

# Intentional error: accessing an out-of-bounds row.
# Only IndexError is caught so that any other, unexpected failure still surfaces.
try:
    print(arr[3])
except IndexError as e:
    print("Error:", e)

# Arithmetic operations
print("Broadcasting +10:\n", arr + 10)
# (2x3) times (3x2) gives a 2x2 matrix product
print("Dot Product:\n", np.dot(arr, arr.T))

# Linear algebra: determinant and inverse of a square matrix
mat = np.array([[4, 7],
                [2, 6]])
print("Matrix:\n", mat)
# The determinant is computed in floating point, so it can differ from the
# exact value (4*6 - 7*2 = 10) by a tiny rounding error
print("Determinant:", np.linalg.det(mat))
print("Inverse:\n", np.linalg.inv(mat))

# Explanation:
# - Broadcasting allows adding a scalar to every element
# - np.dot on two 2D arrays is matrix multiplication (rows times columns),
#   not element-wise multiplication
# - Determinant and inverse are basic linear algebra operations on square matrices


Problem 4 - Array Operations:
Shape: (2, 3)
Size: 6
Transpose:
 [[1 4]
 [2 5]
 [3 6]]
Flatten: [1 2 3 4 5 6]
Last row: [4 5 6]
Error: index 3 is out of bounds for axis 0 with size 2
Broadcasting +10:
 [[11 12 13]
 [14 15 16]]
Dot Product:
 [[14 32]
 [32 77]]
Matrix:
 [[4 7]
 [2 6]]
Determinant: 10.000000000000002
Inverse:
 [[ 0.6 -0.7]
 [-0.2  0.4]]
