In [15]:
# ===============================
# Data Handling Assignment
# ===============================

import numpy as np
import pandas as pd

# Seed NumPy's global random generator so every run draws the same numbers
np.random.seed(42)

# ===============================
# Dataset Creation
# ===============================
# Number of synthetic people (rows) in the dataset
n_people = 15

# Ages are integers drawn from 18 up to (but not including) 60.
# They are stored as float because NaN cannot live in an integer array.
ages = np.random.randint(low=18, high=60, size=n_people).astype(float)

# Incomes are integers drawn from 20,000 up to (but not including) 100,000,
# again stored as float so missing values can be represented.
income = np.random.randint(low=20000, high=100000, size=n_people).astype(float)

# Blank out a few entries to mimic incomplete real-world records
ages[4] = np.nan           # 5th age missing
income[[2, 7]] = np.nan    # 3rd and 8th incomes missing

# One row per person, one column per attribute
data = pd.DataFrame({"Age": ages, "Income": income})
print("Synthetic Data:\n", data)

# ===============================
# Problem 1: Mean, Median, Weighted Mean
# ===============================
income_col = data["Income"]

# Plain mean and median of income; the nan* variants skip missing entries
mean_income = np.nanmean(income_col)
median_income = np.nanmedian(income_col)

# The age-weighted mean needs both Age and Income for a person,
# so keep only the rows where neither is missing
df2 = data.dropna()

# Each person's weight is their share of the total age (weights sum to 1),
# so older people contribute more to the average
weights = df2["Age"].div(df2["Age"].sum())
weighted_mean = np.average(df2["Income"], weights=weights)

# Report the three statistics, one per line
print("\nProblem 1:")
for label, value in (
    ("Mean Income:", mean_income),
    ("Median Income:", median_income),
    ("Age Weighted Mean Income:", weighted_mean),
):
    print(label, value)

# ===============================
# Problem 2: Standardize Income & Detect Outliers
# ===============================
# Centre and spread of income, skipping NaN.
# np.nanstd is called with its default ddof, i.e. the population
# standard deviation (ddof=0).
mean = np.nanmean(data["Income"])
std = np.nanstd(data["Income"])

# Z-score: distance from the mean measured in standard deviations.
# Rows with a missing income keep NaN in the new column.
data["Income_Z"] = data["Income"].sub(mean).div(std)

# A row is an outlier when |Z| > 3. Comparisons against NaN are False,
# so rows with a missing income are never flagged.
is_outlier = data["Income_Z"].abs().gt(3)
outliers = data.loc[is_outlier]

print("\nProblem 2:")
print("Number of Outliers:", len(outliers))
print(outliers)

# ===============================
# Problem 3: Age Binning and Group Statistics
# ===============================
# Edges of the age intervals and the label given to each interval
bins = [18, 25, 35, 45, 60]
labels = ["18-25", "25-35", "35-45", "45-60"]

# Assign each person's age to a bin.
# right=False makes every interval closed on the left and open on the
# right, so an age of 25 lands in "25-35" and an age of exactly 60 would
# fall outside the last bin. Missing ages get a missing bin.
data["AgeBin"] = pd.cut(data["Age"], bins=bins, labels=labels, right=False)

# Group by age bin and calculate, per bin:
# - Count:  number of non-missing incomes (people whose income is NaN
#           are not counted, so this can be lower than the bin's head count)
# - Mean:   mean of the non-missing incomes
# - Median: median of the non-missing incomes
#
# AgeBin is categorical, so `observed` is passed explicitly: observed=False
# keeps every bin in the result even if it is empty, and avoids relying on
# the library default (which warns in recent pandas versions and is
# changing).
result = data.groupby("AgeBin", observed=False).agg(
    Count=("Income", "count"),
    Mean=("Income", "mean"),
    Median=("Income", "median"),
)

print("\nProblem 3:\n", result)

# ===============================
# Problem 4: Array Operations
# ===============================
# Create a 2D array (matrix) with 2 rows and 3 columns
arr = np.array([[1, 2, 3],
                [4, 5, 6]])

print("\nProblem 4:")

# Shape and size of the array
print("Shape:", arr.shape)   # (rows, columns)
print("Size:", arr.size)     # total number of elements

# Transpose and flatten
print("Transpose:\n", arr.T)       # swap rows and columns
print("Flatten:", arr.flatten())   # copy of the data as a 1D array

# Negative indexing: -1 selects the last row
print("Last row:", arr[-1])

# Intentional error: row index 3 does not exist (only rows 0 and 1 do).
# Only IndexError is caught, so any unrelated failure would still surface
# instead of being silently reported as the expected demo error.
try:
    print(arr[3])
except IndexError as e:
    print("Error:", e)

# Arithmetic operation: the scalar 10 is broadcast to every element
print("Broadcasting +10:\n", arr + 10)

# Matrix product of arr (2x3) with its transpose (3x2) -> 2x2 result
print("Dot Product:\n", np.dot(arr, arr.T))

# Linear algebra operations need a square matrix
mat = np.array([[4, 7],
                [2, 6]])

# Display the square matrix
print("Matrix:\n", mat)

# Determinant: a non-zero value means the matrix is invertible.
# The result is a float, so it may carry a small rounding error.
print("Determinant:", np.linalg.det(mat))

# Inverse: the matrix mat_inv such that mat @ mat_inv is the identity
print("Inverse:\n", np.linalg.inv(mat))


Synthetic Data:
      Age   Income
0   56.0  87221.0
1   46.0  84820.0
2   32.0      NaN
3   25.0  79735.0
4    NaN  82955.0
5   56.0  84925.0
6   36.0  87969.0
7   40.0      NaN
8   28.0  73707.0
9   28.0  48693.0
10  41.0  91932.0
11  53.0  45658.0
12  57.0  38431.0
13  41.0  22747.0
14  20.0  79150.0

Problem 1:
Mean Income: 69841.76923076923
Median Income: 79735.0
Age Weighted Mean Income: 67812.39219712527

Problem 2:
Number of Outliers: 0
Empty DataFrame
Columns: [Age, Income, Income_Z]
Index: []

Problem 3:
         Count          Mean   Median
AgeBin                              
18-25       1  79150.000000  79150.0
25-35       3  67378.333333  73707.0
35-45       3  67549.333333  87969.0
45-60       5  68211.000000  84820.0

Problem 4:
Shape: (2, 3)
Size: 6
Transpose:
 [[1 4]
 [2 5]
 [3 6]]
Flatten: [1 2 3 4 5 6]
Last row: [4 5 6]
Error: index 3 is out of bounds for axis 0 with size 2
Broadcasting +10:
 [[11 12 13]
 [14 15 16]]
Dot Product:
 [[14 32]
 [32 77]]
Matrix:
 [[4 7]
