In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load dataset
df = pd.read_csv("/Users/kruthikasaisree/Downloads/Civil_Engineering_Regression_Dataset.csv")

# Define independent (X) and dependent (Y) variables
X_multi = df[['Building_Height', 'Material_Quality_Index', 'Labor_Cost', 'Concrete_Strength', 'Foundation_Depth']]
y_multi = df['Construction_Cost']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)

# Train Multiple Linear Regression model
multi_model = LinearRegression()
multi_model.fit(X_train, y_train)

# Predictions
y_pred = multi_model.predict(X_test)

# Model Evaluation
r2_multi = r2_score(y_test, y_pred)
mse_multi = mean_squared_error(y_test, y_pred)

# Compute Adjusted R-squared
n, p = X_test.shape  # Number of observations & predictors
adjusted_r2_multi = 1 - (1 - r2_multi) * (n - 1) / (n - p - 1)

# Compute VIF for each predictor
vif_data = pd.DataFrame()
vif_data["Feature"] = X_multi.columns
vif_data["VIF"] = [variance_inflation_factor(X_multi.values, i) for i in range(X_multi.shape[1])]

# Print Results
print("\n🔹 Model Performance")
print(f"Multiple Linear Regression R²: {r2_multi:.4f}")
print(f"Adjusted R²: {adjusted_r2_multi:.4f}")
print(f"Mean Squared Error (MSE): {mse_multi:.2f}")

print("\n🔹 Key Cost Factors (Regression Coefficients)")
coef_dict = dict(zip(X_multi.columns, multi_model.coef_))
for feature, coef in coef_dict.items():
    print(f"{feature}: {coef:.2f}")

# Identify most impactful variable
most_impactful_var = max(coef_dict, key=lambda k: abs(coef_dict[k]))
print(f"\n💡 Most impactful variable on Construction Cost: {most_impactful_var}")

print("\n🔹 Multicollinearity Check (VIF Scores)")
print(vif_data)

# Suggested Improvements
print("\n🚀 Suggestions for Model Improvement")
print("1️⃣ Include additional variables: market conditions, weather, project complexity.")
print("2️⃣ Handle multicollinearity by combining correlated factors (e.g., Material Quality & Concrete Strength).")
print("3️⃣ Try non-linear models like Polynomial Regression or Decision Trees for better cost prediction.")

# Model Interpretation & Conclusion
print("\n📌 Key Takeaways")
print("- Multiple Linear Regression significantly improves cost estimation accuracy compared to Simple Linear Regression.")
print("- Building Height, Foundation Depth, and Concrete Strength are the most influential factors in Construction Cost.")
print("- Multicollinearity exists between Material Quality Index and Concrete Strength, which should be addressed.")

print("\n🏗️ Applications for Construction Companies")
print("- Helps estimate costs effectively and optimize budgets.")
print("- Identifies key cost-driving factors for better financial planning.")
print("- Reduces risks by analyzing historical cost trends.")

print("\n⚠️ Limitations")
print("- The model assumes linear relationships, which might not always hold true.")
print("- External factors like inflation, location, and market conditions are not included.")
print("- Multicollinearity may distort coefficient interpretations.")

print("\n🔍 Future Improvements")
print("- Incorporate more variables like labor productivity, material supply chain data, and real estate market trends.")
print("- Use machine learning models such as Random Forest or Neural Networks for improved predictions.")

print("\n📢 Conclusion: The Role of Data Science in Construction Cost Optimization")
print("- Regression analysis helps make data-driven decisions, improving cost efficiency.")
print("- Combining statistical analysis with real-world data enhances project planning and reduces budget overruns.")
print("- The integration of data science in civil engineering ensures cost-effective, optimized, and sustainable construction projects.")



🔹 Model Performance
Multiple Linear Regression R²: 0.9998
Adjusted R²: 0.9997
Mean Squared Error (MSE): 113.50

🔹 Key Cost Factors (Regression Coefficients)
Building_Height: 49.81
Material_Quality_Index: 10.33
Labor_Cost: 0.53
Concrete_Strength: 20.20
Foundation_Depth: 30.14

💡 Most impactful variable on Construction Cost: Building_Height

🔹 Multicollinearity Check (VIF Scores)
                  Feature       VIF
0         Building_Height  4.808162
1  Material_Quality_Index  8.837148
2              Labor_Cost  7.128474
3       Concrete_Strength  9.242819
4        Foundation_Depth  6.072498

🚀 Suggestions for Model Improvement
1️⃣ Include additional variables: market conditions, weather, project complexity.
2️⃣ Handle multicollinearity by combining correlated factors (e.g., Material Quality & Concrete Strength).
3️⃣ Try non-linear models like Polynomial Regression or Decision Trees for better cost prediction.

📌 Key Takeaways
- Multiple Linear Regression significantly improves cost est