# 4.3 Relationships in Data Tutorial

This notebook covers key concepts in analyzing relationships in data, including:
- Understanding Relationships
- Correlation Analysis
- Simple Linear Regression
- Multiple Linear Regression
- Model Diagnostics

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Seed NumPy's global random number generator so that every np.random.* draw
# in this notebook yields the same numbers on each Restart & Run All.
np.random.seed(42)

## 1. Understanding Relationships

Let's create some example data to demonstrate different types of relationships.

In [None]:
# Generate three example datasets that share the same x grid.
# NOTE: the three np.random.normal calls below all consume NumPy's global
# random stream, so their order must not change -- reordering them would
# alter every dataset (and every statistic computed from them later).
x = np.linspace(0, 10, 100)

# Linear relationship: y = 2x + 1 plus Gaussian noise (sd = 1)
y_linear = 2 * x + 1 + np.random.normal(0, 1, 100)

# Quadratic relationship: y = x^2 plus Gaussian noise (sd = 5)
y_quadratic = x**2 + np.random.normal(0, 5, 100)

# No relationship: y is drawn independently of x (mean 5, sd 2)
y_random = np.random.normal(5, 2, 100)

# Visualize the three relationships side by side
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

panels = [
    (ax1, y_linear, 'Linear Relationship'),
    (ax2, y_quadratic, 'Quadratic Relationship'),
    (ax3, y_random, 'No Relationship'),
]
for ax, y_values, title in panels:
    ax.scatter(x, y_values)
    ax.set_title(title)
    # Label the axes so each panel can be read on its own
    ax.set_xlabel('x')
    ax.set_ylabel('y')

plt.tight_layout()
plt.show()

## 2. Correlation Analysis

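Let's compare three correlation measures and their interpretations: Pearson's r (strength of linear association) and the rank-based Spearman's rho and Kendall's tau (strength of monotonic association).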

In [None]:
# Calculate correlations
def print_correlations(x, y, relationship_type):
    """Print Pearson, Spearman and Kendall correlations between x and y.

    Parameters
    ----------
    x, y : array-like
        Paired observations; passed unchanged to the scipy.stats functions.
    relationship_type : str
        Label inserted into the header line of the report.

    Returns
    -------
    None
        The coefficients and p-values are only printed (4 decimal places).
    """
    # Each scipy result unpacks as (statistic, p-value).
    r, r_pvalue = stats.pearsonr(x, y)
    rho, rho_pvalue = stats.spearmanr(x, y)
    tau, tau_pvalue = stats.kendalltau(x, y)

    report_lines = (
        f"\nCorrelations for {relationship_type} relationship:",
        f"Pearson's r: {r:.4f} (p-value: {r_pvalue:.4f})",
        f"Spearman's rho: {rho:.4f} (p-value: {rho_pvalue:.4f})",
        f"Kendall's tau: {tau:.4f} (p-value: {tau_pvalue:.4f})",
    )
    for line in report_lines:
        print(line)

# Report all three coefficients for each example dataset
for y_values, label in (
    (y_linear, 'Linear'),
    (y_quadratic, 'Quadratic'),
    (y_random, 'Random'),
):
    print_correlations(x, y_values, label)

# Gather the series into one DataFrame so pandas can compute the pairwise
# correlation matrix in a single call
column_names = ['X', 'Linear', 'Quadratic', 'Random']
data = pd.DataFrame(dict(zip(column_names, (x, y_linear, y_quadratic, y_random))))
corr_matrix = data.corr()

# Annotated heatmap, colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3. Simple Linear Regression

Let's implement and analyze a simple linear regression model.

In [None]:
# scikit-learn expects a 2-D feature matrix, so reshape x into a single column
X = x.reshape(-1, 1)
y = y_linear

# Hold out 20% of the observations for evaluation (fixed random_state so the
# split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training portion only
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out observations
y_pred = model.predict(X_test)

# Report the fitted line and its performance on the test set
slope = model.coef_[0]
test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Coefficient (slope): {slope:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print(f"R-squared: {test_r2:.4f}")
print(f"RMSE: {test_rmse:.4f}")

# Overlay the fitted line on the held-out points
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', label='Predicted')
ax.set_title('Simple Linear Regression')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()

## 4. Multiple Linear Regression

Let's explore multiple linear regression, where the response is modeled from several predictors at once (here, three simulated features).

In [None]:
# Simulate 100 observations of 3 standard-normal features.
# The response uses true coefficients (2, 0.5, -1) plus Gaussian noise (sd = 0.5).
# The feature draw must precede the noise draw: both consume NumPy's global
# random stream.
n_samples = 100
X_multi = np.random.normal(size=(n_samples, 3))  # 3 features
signal = 2 * X_multi[:, 0] + 0.5 * X_multi[:, 1] - X_multi[:, 2]
y_multi = signal + np.random.normal(0, 0.5, n_samples)

# 80/20 train/test split with a fixed random_state
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Fit the model on the training portion
model_multi = LinearRegression().fit(X_train_multi, y_train_multi)

# Estimated coefficients (features numbered from 1) and intercept
print("Coefficients:")
for feature_no, coef in enumerate(model_multi.coef_, start=1):
    print(f"Feature {feature_no}: {coef:.4f}")
print(f"\nIntercept: {model_multi.intercept_:.4f}")

# Evaluate on the held-out observations
y_pred_multi = model_multi.predict(X_test_multi)
test_r2_multi = r2_score(y_test_multi, y_pred_multi)
test_rmse_multi = np.sqrt(mean_squared_error(y_test_multi, y_pred_multi))
print(f"\nR-squared: {test_r2_multi:.4f}")
print(f"RMSE: {test_rmse_multi:.4f}")

# Actual vs predicted; the dashed diagonal marks perfect predictions
diagonal = [y_test_multi.min(), y_test_multi.max()]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test_multi, y_pred_multi)
ax.plot(diagonal, diagonal, 'r--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs Predicted Values')
plt.show()

## 5. Model Diagnostics

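Let's use residual plots and statistical tests to check the assumptions behind our regression model, in particular that the residuals are normally distributed and have constant variance.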

In [None]:
# Residuals of the multiple regression on the held-out observations
residuals = y_test_multi - y_pred_multi
sqrt_abs_residuals = np.sqrt(np.abs(residuals))

# 2 x 2 grid of diagnostic plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))

# Residuals vs Fitted: the dashed line marks zero residual
ax1.scatter(y_pred_multi, residuals)
ax1.axhline(y=0, color='r', linestyle='--')
ax1.set(xlabel='Fitted Values', ylabel='Residuals', title='Residuals vs Fitted')

# Q-Q plot against the normal distribution; probplot draws on ax2 and sets
# its own title, which is then replaced
stats.probplot(residuals, dist="norm", plot=ax2)
ax2.set_title('Normal Q-Q Plot')

# Scale-Location: square root of the absolute residuals vs fitted values
ax3.scatter(y_pred_multi, sqrt_abs_residuals)
ax3.set(xlabel='Fitted Values', ylabel='√|Residuals|', title='Scale-Location')

# Histogram of the residuals
ax4.hist(residuals, bins=20)
ax4.set(xlabel='Residuals', ylabel='Frequency', title='Residuals Distribution')

plt.tight_layout()
plt.show()

# Perform statistical tests
def breusch_pagan_test(exog, resid):
    """Koenker's studentized Breusch-Pagan test for heteroscedasticity.

    SciPy does not ship a Breusch-Pagan test, so the LM statistic is computed
    directly: the squared residuals are regressed on the predictors (with an
    intercept) and the statistic is n_obs * R^2 of that auxiliary regression.
    Under the null hypothesis of constant error variance it is asymptotically
    chi-squared with one degree of freedom per predictor.

    Parameters
    ----------
    exog : array-like, shape (n_obs,) or (n_obs, n_features)
        Predictor values used in the original regression (without a constant
        column; the auxiliary regression adds its own intercept).
    resid : array-like, shape (n_obs,)
        Residuals of the original regression.

    Returns
    -------
    lm_stat : float
        Lagrange-multiplier statistic.
    p_value : float
        Upper-tail chi-squared probability of lm_stat; small values indicate
        heteroscedasticity.

    Raises
    ------
    ValueError
        If exog and resid have different lengths, or there are too few
        observations to fit the auxiliary regression.
    """
    exog = np.asarray(exog, dtype=float)
    if exog.ndim == 1:
        exog = exog.reshape(-1, 1)
    sq_resid = np.asarray(resid, dtype=float).ravel() ** 2

    n_obs, n_features = exog.shape
    if sq_resid.shape[0] != n_obs:
        raise ValueError("exog and resid must have the same number of observations")
    if n_obs <= n_features + 1:
        raise ValueError("need more observations than auxiliary-regression parameters")

    # Constant squared residuals carry no evidence of changing variance (and
    # would make R^2 undefined), so report a null result directly.
    if sq_resid.max() == sq_resid.min():
        return 0.0, 1.0

    # Auxiliary regression: squared residuals on the predictors (+ intercept)
    aux_model = LinearRegression().fit(exog, sq_resid)
    aux_r2 = r2_score(sq_resid, aux_model.predict(exog))

    lm_stat = n_obs * aux_r2
    p_value = stats.chi2.sf(lm_stat, df=n_features)
    return float(lm_stat), float(p_value)


print("Shapiro-Wilk test for normality of residuals:")
print(stats.shapiro(residuals))

# NOTE: both tests run on the held-out residuals only, so with a small test
# set they have limited power to detect violations.
print("\nBreusch-Pagan test for homoscedasticity:")
bp_stat, bp_pvalue = breusch_pagan_test(X_test_multi, residuals)
print(f"LM statistic: {bp_stat:.4f} (p-value: {bp_pvalue:.4f})")

## Practice Exercises

1. Create a dataset with non-linear relationships and explore transformation techniques.

2. Implement polynomial regression and compare it with simple linear regression.

3. Analyze multicollinearity in a multiple regression model using the variance inflation factor (VIF).

4. Perform cross-validation to assess model stability.

5. Handle outliers and influential points in regression analysis.