In [1]:
# Standard-library helpers used to locate the dataset, plus pandas to read
# and manipulate it.
import os
from pathlib import Path

import pandas as pd

# Location of the student performance CSV.
# It is taken from the STUDENT_DATA_PATH environment variable when that is set;
# otherwise the file is looked up by name in the current working directory.
# This replaces a hard-coded absolute, user-specific Windows path so the
# notebook can be run on other machines without editing the code.
DATA_PATH = Path(os.environ.get("STUDENT_DATA_PATH", "student-mat.csv"))

# Reading the dataset. The separator is ';' because the file uses semicolons,
# not commas, to separate columns.
data = pd.read_csv(DATA_PATH, sep=';')

In [2]:
# Keep only the columns used in this analysis: the three grade columns
# (G1, G2, G3) plus study time, past failures, absences and age.
# NOTE: `data` is rebound here, so the full frame read from the CSV is no
# longer reachable afterwards; re-run the loading cell to get it back.
selected_columns = ['G1', 'G2', 'G3', 'studytime', 'failures', 'absences', 'age']
data = data[selected_columns]

In [3]:
# Quick look at the top of the selected columns (5 rows, which is also
# head()'s default).
data.head(n=5)

Unnamed: 0,G1,G2,G3,studytime,failures,absences,age
0,5,6,6,2,0,6,18
1,5,5,6,2,0,4,17
2,7,8,10,2,3,10,15
3,15,14,15,3,0,2,15
4,6,10,10,2,0,4,16


In [4]:
# Renaming the columns for better clarity.
# The renamed frame is assigned back to `data` instead of using inplace=True,
# so this cell no longer mutates the DataFrame object created by an earlier
# cell; it simply rebinds the name to the renamed result.
column_labels = {
    'G1': 'Note 1',             # First period grade
    'G2': 'Note 2',             # Second period grade
    'G3': 'Final',              # Final grade
    'studytime': 'Study Time',  # Time spent studying
    'failures': 'Failures',     # Number of past failures
    'absences': 'Absences',     # Number of absences
    'age': 'Age',               # Age of the student
}
data = data.rename(columns=column_labels)

In [5]:
# Confirm the new column labels by showing the top 5 rows again
# (5 is also head()'s default).
data.head(n=5)

Unnamed: 0,Note 1,Note 2,Final,Study Time,Failures,Absences,Age
0,5,6,6,2,0,6,18
1,5,5,6,2,0,4,17
2,7,8,10,2,3,10,15
3,15,14,15,3,0,2,15
4,6,10,10,2,0,4,16


In [6]:
# Inspecting the dtype of every column. These columns are converted to NumPy
# arrays for model training in a later cell, so this is a quick check of what
# kind of values those arrays will hold.
data.dtypes

Note 1        int64
Note 2        int64
Final         int64
Study Time    int64
Failures      int64
Absences      int64
Age           int64
dtype: object

In [7]:
# Importing numpy to handle arrays for model training
import numpy as np

# Target vector (y): the values of the 'Final' column (the renamed G3 grade).
target_column = 'Final'
y = np.array(data[target_column])

# Feature matrix (x): every remaining column, in the frame's existing column
# order, with the target column removed.
x = np.array(data.drop(columns=[target_column]))

In [8]:
# Importing train_test_split from scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state (2) makes the split reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 2}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [9]:
# Importing the LinearRegression model from scikit-learn
from sklearn.linear_model import LinearRegression

# Creating a LinearRegression model instance. No arguments are passed, so the
# estimator uses its default settings. It is not trained here; fitting happens
# in a later cell.
lr = LinearRegression()

In [10]:
# Train the regression on the training split only; the test split is kept
# aside for evaluation.
lr.fit(X=x_train, y=y_train)

In [11]:
# R^2 score of the fitted model on the same rows it was trained on.
train_r2 = lr.score(x_train, y_train)
train_r2

0.8261275475197141

In [12]:
# R^2 score of the fitted model on the held-out test rows.
test_r2 = lr.score(x_test, y_test)
test_r2

0.8325898318712226

In [13]:
# Show the fitted parameters: one coefficient per feature column (in the
# column order of the feature matrix), followed by the intercept.
for label, value in (('Coefs: \n', lr.coef_), ('Intercept: \n', lr.intercept_)):
    print(label, value)

Coefs: 
 [ 0.19575962  0.95558174 -0.24215827  0.12730705  0.03566925 -0.23224281]
Intercept: 
 2.1549097406751443


In [14]:
# Show the top 5 rows once more (5 is also head()'s default). The columns
# other than 'Final' appear in the same order as the features the model was
# trained on, which is the order prediction inputs must follow.
data.head(n=5)

Unnamed: 0,Note 1,Note 2,Final,Study Time,Failures,Absences,Age
0,5,6,6,2,0,6,18
1,5,5,6,2,0,4,17
2,7,8,10,2,3,10,15
3,15,14,15,3,0,2,15
4,6,10,10,2,0,4,16


In [15]:
# Making a prediction for a new data point (with example feature values).
# The values are written by feature name and then arranged in the same column
# order that was used to build the training matrix (every column of `data`
# except 'Final'). A value therefore cannot silently land in the wrong feature
# slot, and a missing feature raises a KeyError instead of mispredicting.
new_student = {
    'Note 1': 10,
    'Note 2': 14,
    'Study Time': 3,
    'Failures': 0,
    'Absences': 4,
    'Age': 16,
}
feature_order = [column for column in data.columns if column != 'Final']

# Shape (1, n_features): a single row, as predict() expects a 2-D input.
new_data = np.array([[new_student[column] for column in feature_order]])

# Predicting the final grade (G3) based on the input data
lr.predict(new_data)

array([13.19096755])

In [16]:
# Making a prediction for another student with different feature values.
# (The variable is named `student3`, although this is the second prediction
# made in this notebook.)
# As with the training matrix, the features are every column of `data` except
# 'Final', in that column order. Writing the values by name and ordering them
# from the frame keeps each value in the right feature slot; a missing feature
# raises a KeyError instead of mispredicting.
student3_features = {
    'Note 1': 15,
    'Note 2': 14,
    'Study Time': 3,
    'Failures': 0,
    'Absences': 2,
    'Age': 15,
}
feature_order = [column for column in data.columns if column != 'Final']

# Shape (1, n_features): a single row, as predict() expects a 2-D input.
student3 = np.array([[student3_features[column] for column in feature_order]])

# Predicting the final grade for this student
lr.predict(student3)

array([14.33066999])