# Linear Regression Analysis

This notebook demonstrates linear regression modeling on your dataset.

## Steps:
1. Import libraries and load data
2. Explore the data
3. Prepare data for modeling
4. Train linear regression model
5. Make predictions and evaluate model performance
6. Visualize results


In [None]:
# Step 1: Import libraries and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load your CSV (replace with actual filename)
# The read must actually run: every statement below and every later cell
# uses `df`, so leaving the load commented out makes a fresh-kernel run
# fail with NameError before any data is shown.
DATA_PATH = 'your_file.csv'  # Replace with the path to your dataset
df = pd.read_csv(DATA_PATH)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
# Bare last expression so the notebook renders the first rows as a table
df.head()


In [None]:
# Step 2: Explore the data
# Requires `df` to have been loaded by the previous cell.
# info() writes its report straight to the cell output, so it is called
# for its side effect rather than displayed as a value.
print("=== Dataset Information ===")
df.info()

# describe() is deliberately left as the cell's final bare expression so
# the notebook displays the resulting summary table.
print("\n=== Statistical Summary ===")
df.describe()


In [None]:
# Check for missing values
print("=== Missing Values ===")
missing = df.isnull().sum()  # per-column count of null entries
columns_with_missing = missing[missing > 0]
if columns_with_missing.empty:
    # Say so explicitly instead of printing an empty Series repr
    print("No missing values found.")
else:
    print(columns_with_missing)

# Visualize correlations
# Correlation is only computed on numeric columns. pandas >= 2.0 raises on
# string/object columns instead of silently skipping them, so select the
# numeric columns explicitly before calling corr().
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()


In [None]:
# Step 3: Prepare data for modeling
# IMPORTANT: Update these column names to match your dataset!
target_column = 'target'  # Replace with your target variable
feature_columns = ['feature1', 'feature2', 'feature3']  # Replace with your features

# Fail early with an actionable message if the placeholder names above were
# not replaced (a plain df[...] lookup would only name the missing keys).
selected_columns = [*feature_columns, target_column]
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns not found in dataset: {missing_columns}. "
        f"Available columns: {df.columns.tolist()}"
    )

# Data preprocessing: LinearRegression rejects NaN input, so keep only rows
# that are complete for the selected columns. `df` itself is left untouched
# so this cell can be re-run safely.
model_df = df[selected_columns].dropna()
n_dropped = len(df) - len(model_df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing values in the selected columns")

# Select features and target
X = model_df[feature_columns]
y = model_df[target_column]

# Split into training and testing sets
# 80/20 split; the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")


In [None]:
# Step 4: Train the linear regression model
# fit() hands back the fitted estimator, so construction and training can
# be written as a single chained expression.
model = LinearRegression().fit(X_train, y_train)

print("✓ Model trained successfully!")

# Report one learned weight per feature, paired in the order the features
# were listed in feature_columns.
print("\nModel Coefficients:")
for name, weight in zip(feature_columns, model.coef_):
    print(f"  {name}: {weight:.4f}")

print(f"\nIntercept: {model.intercept_:.4f}")


In [None]:
# Step 5: Make predictions and evaluate
# Predict on both splits so the fit on training data can be compared with
# the fit on held-out data.
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# Calculate metrics
# R² for each split
train_r2, test_r2 = (
    r2_score(y_train, y_pred_train),
    r2_score(y_test, y_pred_test),
)
# RMSE is the square root of the mean squared error
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(y_train, y_pred_train)),
    np.sqrt(mean_squared_error(y_test, y_pred_test)),
)
# MAE is reported for the test split only
test_mae = mean_absolute_error(y_test, y_pred_test)

# Emit the whole report in one call; sep="\n" puts each entry on its own line.
print(
    "=== Model Performance ===",
    "\nTraining Set:",
    f"  R² Score: {train_r2:.4f}",
    f"  RMSE: {train_rmse:.4f}",
    "\nTesting Set:",
    f"  R² Score: {test_r2:.4f}",
    f"  RMSE: {test_rmse:.4f}",
    f"  MAE: {test_mae:.4f}",
    sep="\n",
)


In [None]:
# Step 6: Visualize results
# Two side-by-side panels: predicted-vs-actual on the left, residuals on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
ax_fit, ax_resid = axes

# Plot 1: Actual vs Predicted
ax_fit.scatter(y_test, y_pred_test, alpha=0.6, s=50)
# Reference diagonal spanning the observed target range; points on this
# line are predictions equal to the actual value.
y_lo, y_hi = y_test.min(), y_test.max()
ax_fit.plot([y_lo, y_hi], [y_lo, y_hi], 'r--', lw=3, label='Perfect Prediction')
ax_fit.set_xlabel('Actual Values', fontsize=12, fontweight='bold')
ax_fit.set_ylabel('Predicted Values', fontsize=12, fontweight='bold')
ax_fit.set_title(
    f'Actual vs Predicted (R² = {test_r2:.4f})',
    fontsize=14,
    fontweight='bold',
)
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

# Plot 2: Residuals
# Residual = actual minus predicted, plotted against the prediction.
residuals = y_test - y_pred_test
ax_resid.scatter(y_pred_test, residuals, alpha=0.6, s=50)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=3, label='Zero Residual')
ax_resid.set_xlabel('Predicted Values', fontsize=12, fontweight='bold')
ax_resid.set_ylabel('Residuals', fontsize=12, fontweight='bold')
ax_resid.set_title('Residual Plot', fontsize=14, fontweight='bold')
ax_resid.legend()
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
