# EDA_and_Model_v2.ipynb 

# Cell 1: Imports



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Cell 2: Load the extended student-scores dataset and preview it
csv_path = "../data/student_scores_extended.csv"  # relative to the current working directory
df = pd.read_csv(csv_path)
df.head()  # preview of the leading rows (rendered when this ends a notebook cell)

# Cell 3: Dataset overview
df.info()      # prints a concise summary of the frame
df.describe()  # descriptive statistics (rendered when this ends a notebook cell)

# Cell 4: Correlation Heatmap
# Correlate numeric/boolean columns only. Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises if, for example, a
# string column is present. select_dtypes() is used instead of
# corr(numeric_only=True) so the cell also runs on older pandas releases that
# do not accept that argument.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.xticks(rotation=45)  # slant the column labels on the x axis
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Cell 5: Top Positive/Negative Correlations
# Correlation of every numeric/boolean feature with the target, sorted
# ascending. Non-numeric columns are excluded up front because pandas >= 2.0
# raises in DataFrame.corr() when they are present.
correlations = (
    df.select_dtypes(include=["number", "bool"])
    .corr()['Marks_Obtained']
    .drop('Marks_Obtained')
    .sort_values()
)

# Filter by sign so a feature is only listed under the heading that matches
# the direction of its correlation. Taking head(3)/tail(3) of the full series
# would list positively correlated features as "negative" (and vice versa)
# whenever fewer than three features have that sign, and NaN correlations
# (e.g. from constant columns) sort last and would end up in tail(3).
negative_corr = correlations[correlations < 0]
positive_corr = correlations[correlations > 0]

print("\nTop Features Affecting Marks Negatively:")
print(negative_corr.head(3))            # ascending order: most negative first
print("\nTop Features Affecting Marks Positively:")
print(positive_corr.tail(3).iloc[::-1])  # reversed: most positive first

# Cell 6: Feature Selection & Split
target_col = 'Marks_Obtained'
X = df.drop(columns=[target_col])  # every remaining column is used as a feature
y = df[target_col]                 # regression target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cell 7: Fit a linear regression model on the training split
model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator

# Cell 8: Predict on the held-out split and report the error metrics
y_pred = model.predict(X_test)
for label, metric in (("MSE:", mean_squared_error), ("R^2 Score:", r2_score)):
    # Each metric is called as metric(y_true, y_pred) and printed after its label.
    print(label, metric(y_test, y_pred))

# Cell 9: Plot predictions vs actual
# Draw on a dedicated figure/axes instead of the implicit "current" one. If
# the heatmap figure from the earlier cell is still open (e.g. headless
# backend or a non-closing show()), plt.scatter() would otherwise add the
# points to that figure's axes.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, color='green')
ax.set_xlabel("Actual Marks")
ax.set_ylabel("Predicted Marks")
ax.set_title("Actual vs Predicted Marks")
ax.grid(True)
plt.show()
