# 5.1 Introduction to Machine Learning Tutorial

This notebook covers key concepts in machine learning including:
- What is Machine Learning?
- ML Workflow
- Feature Engineering
- Bias-Variance Tradeoff

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Seed NumPy's global random generator so the np.random.* draws in the cells
# below produce the same values on every fresh run of the notebook
np.random.seed(42)

## 1. What is Machine Learning?

Let's demonstrate the basic concept of machine learning through a simple example of pattern recognition.

In [None]:
# Synthetic data: 100 evenly spaced inputs on [0, 10] and a target that
# follows y = 2x + 1 plus unit-variance Gaussian noise
X = np.linspace(0, 10, 100).reshape(-1, 1)
noise = np.random.normal(0, 1, 100)
y = 2 * X.ravel() + 1 + noise

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares line on the training split
# (fit() returns the estimator itself, so it can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out inputs
y_pred = model.predict(X_test)

# Plot both splits together with the fitted line
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_train, y_train, color='blue', alpha=0.5, label='Training Data')
ax.scatter(X_test, y_test, color='green', alpha=0.5, label='Testing Data')
ax.plot(X_test, y_pred, color='red', label='Predictions')
ax.set_title('Simple Machine Learning Example')
ax.set_xlabel('Input Feature (X)')
ax.set_ylabel('Target Variable (y)')
ax.legend()
plt.show()

# Compare the learned parameters against the generating equation
slope = model.coef_[0]
intercept = model.intercept_
print(f"Model Equation: y = {slope:.2f}x + {intercept:.2f}")
print(f"True Equation: y = 2x + 1 + noise")

## 2. ML Workflow

Let's walk through a typical machine learning workflow: data collection, preprocessing, model training, and model evaluation.

In [None]:
# 1. Data Collection (simulated)
# Three standard-normal features; the target is a fixed linear combination
# of them (weights 3, 2, -1) plus a small amount of Gaussian noise.
n_samples = 1000
n_features = 3
X = np.random.randn(n_samples, n_features)
noise = np.random.normal(0, 0.1, n_samples)
y = 3*X[:, 0] + 2*X[:, 1] - X[:, 2] + noise

# 2. Data Preprocessing
# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model Training
model = LinearRegression().fit(X_train_scaled, y_train)

# 4. Model Evaluation
train_pred = model.predict(X_train_scaled)
test_pred = model.predict(X_test_scaled)

train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

print("Model Performance:")
print(f"Training MSE: {train_mse:.4f}")
print(f"Testing MSE: {test_mse:.4f}")

# Predicted-vs-actual scatter for each split; the dashed red diagonal marks
# where a perfect prediction would fall
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

panels = [
    (ax1, y_train, train_pred, 'Training: Predicted vs Actual'),
    (ax2, y_test, test_pred, 'Testing: Predicted vs Actual'),
]
for ax, actual, predicted, title in panels:
    lo, hi = actual.min(), actual.max()
    ax.scatter(actual, predicted, alpha=0.5)
    ax.plot([lo, hi], [lo, hi], 'r--')
    ax.set_title(title)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')

plt.tight_layout()
plt.show()

## 3. Feature Engineering

Let's explore three common feature engineering techniques: standardization, one-hot encoding, and log transformation.

In [None]:
# Create sample data with different types of features.
# Re-seed here so this cell yields the same features whenever it is re-run.
np.random.seed(42)
n_samples = 1000
categories = ['A', 'B', 'C']

# Original features
raw_feature1 = np.random.normal(100, 20, n_samples)  # Continuous
raw_feature2 = np.random.choice(categories, n_samples)  # Categorical
raw_feature3 = np.exp(np.random.normal(0, 1, n_samples))  # Skewed, strictly positive

# Feature Engineering Techniques

# 1. Standardization
# StandardScaler expects a 2-D (n_samples, n_features) array, hence the reshape.
standardized = StandardScaler().fit_transform(raw_feature1.reshape(-1, 1))

# 2. One-hot Encoding (one indicator column per category)
onehot = pd.get_dummies(raw_feature2, prefix='category')

# 3. Log Transformation
# Safe without an offset because raw_feature3 is an exponential and thus > 0.
log_transformed = np.log(raw_feature3)

# Visualize transformations: one row per technique, original on the left,
# engineered version on the right
fig, axes = plt.subplots(3, 2, figsize=(15, 12))

# Original vs Standardized
sns.histplot(raw_feature1, ax=axes[0, 0])
axes[0, 0].set_title('Original Continuous Feature')

# Flatten the (n, 1) array: seaborn interprets a 2-D input as wide-form data
# and would add a hue legend for the single column.
sns.histplot(standardized.ravel(), ax=axes[0, 1])
axes[0, 1].set_title('Standardized Feature')

# Original vs One-hot Encoded
# Pass the values as x= explicitly (the first positional parameter of
# countplot is `data` in current seaborn) and fix the bar order so it lines
# up with the sorted one-hot columns in the neighbouring panel.
sns.countplot(x=raw_feature2, order=categories, ax=axes[1, 0])
axes[1, 0].set_title('Original Categorical Feature')

# Column sums of the indicator matrix are the per-category counts
onehot.sum().plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_title('One-hot Encoded Features')

# Original vs Log-transformed
sns.histplot(raw_feature3, ax=axes[2, 0])
axes[2, 0].set_title('Original Skewed Feature')

sns.histplot(log_transformed, ax=axes[2, 1])
axes[2, 1].set_title('Log-transformed Feature')

plt.tight_layout()
plt.show()

## 4. Bias-Variance Tradeoff

Let's demonstrate the bias-variance tradeoff using polynomial regression with different degrees.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Generate data: 30 evenly spaced inputs on [0, 1] and a noisy sine target.
X = np.linspace(0, 1, 30).reshape(-1, 1)
# Flatten X before adding the noise. X is (30, 1) and the noise is (30,), so
# adding them directly broadcasts to a (30, 30) matrix instead of one target
# value per sample, which then breaks the scatter plot below.
y = np.sin(2 * np.pi * X.ravel()) + np.random.normal(0, 0.1, X.shape[0])

# Create and fit models with different complexities
degrees = [1, 3, 15]  # Different polynomial degrees

# Dense grid for drawing smooth fitted curves; it is the same for every
# degree, so build it once outside the loop.
X_test = np.linspace(0, 1, 100).reshape(-1, 1)

plt.figure(figsize=(15, 5))

for i, degree in enumerate(degrees, 1):
    # Polynomial regression = polynomial feature expansion + linear fit
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])

    # Fit model
    model.fit(X, y)

    # Evaluate the fitted curve on the dense grid
    y_pred = model.predict(X_test)

    # One panel per degree, laid out side by side
    plt.subplot(1, len(degrees), i)
    plt.scatter(X, y, color='blue', alpha=0.5, label='Data')
    plt.plot(X_test, y_pred, color='red', label=f'Degree {degree}')
    plt.title(f'Polynomial Degree {degree}')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.legend()

plt.tight_layout()
plt.show()

print("Observations:")
print("- Degree 1: High bias (underfitting)")
print("- Degree 3: Good balance")
print("- Degree 15: High variance (overfitting)")

## Practice Exercises

1. Create your own dataset and apply different feature engineering techniques.

2. Experiment with different data splitting strategies (e.g., different train-test ratios).

3. Implement k-fold cross-validation to evaluate model performance.

4. Create visualizations to explain the concept of overfitting and underfitting.

5. Practice handling missing data and outliers in a machine learning pipeline.