# Outlier Analysis for Linear Regression Model

This notebook analyzes the impact of removing outliers on the Mean Squared Error (MSE) of our linear regression model.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## 1. Load and Explore the Data

In [None]:
# Load the data
df = pd.read_csv('data.csv')

# Display basic statistics
print("Dataset shape:", df.shape)
print("\nBasic statistics:")
print(df.describe())

# Count zero-or-negative entries (which might be errors in this context).
# The test is `<= 0`, so zeros are included; the labels say "non-positive"
# so the printed text matches what is actually counted.
print("\nNon-positive (<= 0) SqrMtr values:", int((df['SqrMtr'] <= 0).sum()))
print("Non-positive (<= 0) Price values:", int((df['Price'] <= 0).sum()))

# NaN compares False against 0, so missing entries are not part of the counts
# above; report them separately.
print("\nMissing values per column:")
print(df[['SqrMtr', 'Price']].isna().sum())

## 2. Visualize the Data to Identify Outliers

In [None]:
# Scatter plot of Price against SqrMtr
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['SqrMtr'], df['Price'], alpha=0.5)
ax.set_title('Price vs Square Meters')
ax.set_xlabel('Square Meters')
ax.set_ylabel('Price')
ax.grid(True, alpha=0.3)
plt.show()

# Side-by-side box plots, one per column, to make extreme values visible
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
for axis, column, title in (
    (ax1, 'SqrMtr', 'Square Meters Distribution'),
    (ax2, 'Price', 'Price Distribution'),
):
    sns.boxplot(y=df[column], ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

## 3. Baseline Model (Without Outlier Removal)

In [None]:
# Single-feature matrix and target vector taken from the unfiltered frame
X = df[['SqrMtr']]
y = df['Price']

# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Fit the linear model on the training rows and predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions
baseline_mse = mean_squared_error(y_test, y_pred)
baseline_rmse = np.sqrt(baseline_mse)
baseline_r2 = r2_score(y_test, y_pred)

print("Baseline Model (No Outlier Removal):")
for label, value, spec in (
    ("Intercept", model.intercept_, ".2f"),
    ("Coefficient", model.coef_[0], ".2f"),
    ("Mean Squared Error", baseline_mse, ".2f"),
    ("Root Mean Squared Error", baseline_rmse, ".2f"),
    ("R-squared", baseline_r2, ".4f"),
):
    print(f"{label}: {value:{spec}}")

# Held-out points (blue) with the fitted line (red) drawn over them
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test, y_test, color='blue', alpha=0.5, label='Actual')
ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
ax.set_title('Baseline Model: Price vs Square Meters')
ax.set_xlabel('Square Meters')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 4. Identify Outliers Using Different Methods

In [None]:
# Method 1: Z-score method (identifies values more than `threshold` standard deviations from the mean)
def identify_outliers_zscore(df, column, threshold=3):
    """Return the index labels of rows whose absolute z-score exceeds `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Name of a numeric column in `df`.
    threshold : float, default 3
        Rows with ``|z| > threshold`` are reported.

    Returns
    -------
    pandas.Index
        Labels (from ``df.index``) of the flagged rows. Rows whose value is
        missing are never flagged.
    """
    values = df[column].to_numpy(dtype=float, na_value=np.nan)
    # With the default nan_policy='propagate', a single NaN turns every z-score
    # into NaN and the comparison below would silently flag nothing. 'omit'
    # computes the mean/std from the non-missing values only; the missing rows
    # themselves still get a NaN score and therefore compare False.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    return df.index[z_scores > threshold]

# Method 2: IQR method (identifies values outside `factor` * IQR from Q1 and Q3)
def identify_outliers_iqr(df, column, factor=1.5):
    """Return the index labels of rows lying outside the IQR fences of `column`.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where
    ``IQR = Q3 - Q1``; values strictly below the lower fence or strictly above
    the upper fence are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Name of a numeric column in `df`.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.Index
        Labels (from ``df.index``) of the flagged rows.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    reach = factor * (q3 - q1)
    outside_fences = (values < q1 - reach) | (values > q3 + reach)
    return df.index[outside_fences.to_numpy()]

# Method 3: Flag zero-or-negative values (which are likely errors in this context)
def identify_negative_values(df, column):
    """Return the index labels of rows where `column` is less than or equal to 0."""
    non_positive = df[column].le(0)
    return df.loc[non_positive].index

# Run each detection method on both columns
sqrmtr_outliers_zscore, price_outliers_zscore = (
    identify_outliers_zscore(df, col) for col in ('SqrMtr', 'Price')
)

sqrmtr_outliers_iqr, price_outliers_iqr = (
    identify_outliers_iqr(df, col) for col in ('SqrMtr', 'Price')
)

negative_sqrmtr, negative_price = (
    identify_negative_values(df, col) for col in ('SqrMtr', 'Price')
)

# A row is treated as an outlier when either column flags it
zscore_outliers = list(set(sqrmtr_outliers_zscore).union(set(price_outliers_zscore)))
iqr_outliers = list(set(sqrmtr_outliers_iqr).union(set(price_outliers_iqr)))
negative_outliers = list(set(negative_sqrmtr).union(set(negative_price)))

for description, flagged in (
    ("outliers identified by Z-score method", zscore_outliers),
    ("outliers identified by IQR method", iqr_outliers),
    ("negative values", negative_outliers),
):
    print(f"Number of {description}: {len(flagged)}")

## 5. Model with Outliers Removed (Z-score method)

In [None]:
# Drop every row flagged by the Z-score method
df_no_zscore_outliers = df.drop(index=zscore_outliers)
print(f"Original dataset shape: {df.shape}")
print(f"Dataset shape after removing Z-score outliers: {df_no_zscore_outliers.shape}")

# Features/target come from the filtered frame. The train/test split is drawn
# from that filtered frame too, so the held-out rows generally differ from the
# baseline's held-out rows.
X_clean = df_no_zscore_outliers[['SqrMtr']]
y_clean = df_no_zscore_outliers['Price']
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, random_state=42, test_size=0.25
)

# Fit on the training rows, then predict the held-out rows
model_clean = LinearRegression().fit(X_train_clean, y_train_clean)
y_pred_clean = model_clean.predict(X_test_clean)

# Score the held-out predictions
zscore_mse = mean_squared_error(y_test_clean, y_pred_clean)
zscore_rmse = np.sqrt(zscore_mse)
zscore_r2 = r2_score(y_test_clean, y_pred_clean)

print("\nModel with Z-score Outliers Removed:")
for label, value, spec in (
    ("Intercept", model_clean.intercept_, ".2f"),
    ("Coefficient", model_clean.coef_[0], ".2f"),
    ("Mean Squared Error", zscore_mse, ".2f"),
    ("Root Mean Squared Error", zscore_rmse, ".2f"),
    ("R-squared", zscore_r2, ".4f"),
):
    print(f"{label}: {value:{spec}}")

# Held-out points (blue) with the fitted line (red) drawn over them
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test_clean, y_test_clean, color='blue', alpha=0.5, label='Actual')
ax.plot(X_test_clean, y_pred_clean, color='red', linewidth=2, label='Predicted')
ax.set_title('Model with Z-score Outliers Removed: Price vs Square Meters')
ax.set_xlabel('Square Meters')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 6. Model with Outliers Removed (IQR method)

In [None]:
# Drop every row flagged by the IQR method
df_no_iqr_outliers = df.drop(index=iqr_outliers)
print(f"Original dataset shape: {df.shape}")
print(f"Dataset shape after removing IQR outliers: {df_no_iqr_outliers.shape}")

# Features/target come from the filtered frame. The train/test split is drawn
# from that filtered frame too, so the held-out rows generally differ from the
# baseline's held-out rows.
X_clean = df_no_iqr_outliers[['SqrMtr']]
y_clean = df_no_iqr_outliers['Price']
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, random_state=42, test_size=0.25
)

# Fit on the training rows, then predict the held-out rows
model_clean = LinearRegression().fit(X_train_clean, y_train_clean)
y_pred_clean = model_clean.predict(X_test_clean)

# Score the held-out predictions
iqr_mse = mean_squared_error(y_test_clean, y_pred_clean)
iqr_rmse = np.sqrt(iqr_mse)
iqr_r2 = r2_score(y_test_clean, y_pred_clean)

print("\nModel with IQR Outliers Removed:")
for label, value, spec in (
    ("Intercept", model_clean.intercept_, ".2f"),
    ("Coefficient", model_clean.coef_[0], ".2f"),
    ("Mean Squared Error", iqr_mse, ".2f"),
    ("Root Mean Squared Error", iqr_rmse, ".2f"),
    ("R-squared", iqr_r2, ".4f"),
):
    print(f"{label}: {value:{spec}}")

# Held-out points (blue) with the fitted line (red) drawn over them
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test_clean, y_test_clean, color='blue', alpha=0.5, label='Actual')
ax.plot(X_test_clean, y_pred_clean, color='red', linewidth=2, label='Predicted')
ax.set_title('Model with IQR Outliers Removed: Price vs Square Meters')
ax.set_xlabel('Square Meters')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 7. Model with Only Negative Values Removed

In [None]:
# Drop the rows flagged as zero-or-negative in either column
df_no_negatives = df.drop(index=negative_outliers)
print(f"Original dataset shape: {df.shape}")
print(f"Dataset shape after removing negative values: {df_no_negatives.shape}")

# Features/target come from the filtered frame. The train/test split is drawn
# from that filtered frame too, so the held-out rows generally differ from the
# baseline's held-out rows.
X_clean = df_no_negatives[['SqrMtr']]
y_clean = df_no_negatives['Price']
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, random_state=42, test_size=0.25
)

# Fit on the training rows, then predict the held-out rows
model_clean = LinearRegression().fit(X_train_clean, y_train_clean)
y_pred_clean = model_clean.predict(X_test_clean)

# Score the held-out predictions
neg_mse = mean_squared_error(y_test_clean, y_pred_clean)
neg_rmse = np.sqrt(neg_mse)
neg_r2 = r2_score(y_test_clean, y_pred_clean)

print("\nModel with Negative Values Removed:")
for label, value, spec in (
    ("Intercept", model_clean.intercept_, ".2f"),
    ("Coefficient", model_clean.coef_[0], ".2f"),
    ("Mean Squared Error", neg_mse, ".2f"),
    ("Root Mean Squared Error", neg_rmse, ".2f"),
    ("R-squared", neg_r2, ".4f"),
):
    print(f"{label}: {value:{spec}}")

# Held-out points (blue) with the fitted line (red) drawn over them
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test_clean, y_test_clean, color='blue', alpha=0.5, label='Actual')
ax.plot(X_test_clean, y_pred_clean, color='red', linewidth=2, label='Predicted')
ax.set_title('Model with Negative Values Removed: Price vs Square Meters')
ax.set_xlabel('Square Meters')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 8. Compare All Models

In [None]:
# Metric lists, each ordered to line up with the labels in `models`
models = [
    'Baseline (No Outlier Removal)',
    'Z-score Outlier Removal',
    'IQR Outlier Removal',
    'Negative Values Removal',
]
mse_values = [baseline_mse, zscore_mse, iqr_mse, neg_mse]
rmse_values = [baseline_rmse, zscore_rmse, iqr_rmse, neg_rmse]
r2_values = [baseline_r2, zscore_r2, iqr_r2, neg_r2]

# One row per model, one column per metric
comparison_df = pd.DataFrame(
    list(zip(models, mse_values, rmse_values, r2_values)),
    columns=['Model', 'MSE', 'RMSE', 'R-squared'],
)

print("Model Comparison:")
print(comparison_df)

# One bar chart per metric, using the same colour for a given model in both charts
bar_colors = ['blue', 'green', 'orange', 'red']
for metric_name, metric_values in (('RMSE', rmse_values), ('R-squared', r2_values)):
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(models, metric_values, color=bar_colors)
    ax.set_title(f'{metric_name} Comparison Across Models')
    ax.set_ylabel(metric_name)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 9. Conclusion

Based on the analysis above, we can draw the following conclusions:

1. **Impact of Outlier Removal**: The comparison shows how removing outliers affects the model's performance metrics (MSE, RMSE, and R-squared).

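2. **Best Method**: The method that produces the lowest MSE/RMSE and highest R-squared in the Section 8 comparison is likely the most effective for this dataset. Keep in mind that each variant is scored on a test split drawn from its own filtered data, so part of any improvement can come from the test set no longer containing the removed points rather than from a better fit.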

3. **Trade-offs**: Removing outliers reduces the dataset size, which might affect the model's generalizability. However, it can significantly improve the model's accuracy for the majority of cases.

4. **Recommendation**: Use the comparison table and bar charts in Section 8 to decide whether outlier removal is beneficial for this particular dataset and, if so, which method is most effective.