# Code Generating a Model Using Linear Regression

Dependencies: numpy, pandas, sklearn

# Fits a linear regression model and searches for the pair of thresholds that best separates the classes

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reads data stored in the student_dropout.csv file which contains our pruned data set
raw_data = pd.read_csv('student_dropout.csv')

# Separates the label column (y) from the feature columns (X) and keeps the feature names
y = raw_data['Target']
raw_data = raw_data.drop(columns=['Target'])
column_names = raw_data.columns
X = raw_data.loc[:, column_names]

# Creates the train test split for the data using a 75:25 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardizes the features. The scaler is fitted on the training split only and then
# re-used on the test split, so the test data does not influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Adds a leading column of ones to the input variables to act as the bias term
X_train_scaled = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_test_scaled = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

# Creates the Linear Regression model for the data.
# The model is fitted on the scaled, bias-augmented matrix built above; fitting on the raw
# X_train would leave the normalization and bias steps without any effect.
# fit_intercept=False because the column of ones already plays the role of the intercept.
model = LinearRegression(fit_intercept=False)
model.fit(X_train_scaled, y_train)

# Gets the predictions from our fitted model (test data goes through the same preprocessing)
y_pred = model.predict(X_test_scaled)

# Prints the coefficients: the first entry belongs to the bias column, the remaining
# entries follow the order of column_names
print(model.coef_)

# Candidate thresholds: lower bound 0.1 .. 0.9 and higher bound 1.1 .. 1.9 in steps of 0.1.
# They are computed from the step number and rounded instead of repeatedly adding 0.1,
# which would accumulate floating point error (e.g. 1.2000000000000002).
lower_bounds = [round(0.1 * step, 1) for step in range(1, 10)]
upper_bounds = [round(1 + 0.1 * step, 1) for step in range(1, 10)]

# Actual labels as a plain array so they line up positionally with y_pred
y_true = y_test.to_numpy()

# Keeps track of the best pair of thresholds seen so far (first pair wins on ties)
best_correct = -1
best_lower = None
best_upper = None

# Tries every combination of lower and higher threshold
for threshold in lower_bounds:
    for thresh2 in upper_bounds:
        # Maps every prediction to a class with the given thresholds:
        #   prediction <= threshold            -> class 0
        #   threshold < prediction < thresh2   -> class 1
        #   prediction >= thresh2              -> class 2
        predicted_class = np.where(y_pred <= threshold, 0,
                                   np.where(y_pred < thresh2, 1, 2))

        # Counts how many predictions match / do not match the actual value
        correct = int(np.count_nonzero(predicted_class == y_true))
        incorrect = len(y_true) - correct

        if correct > best_correct:
            best_correct = correct
            best_lower = threshold
            best_upper = thresh2

        print("With lower bound =", threshold)
        print("With higher bound =", thresh2)
        print("Correct:", correct)
        print("Incorrect:", incorrect)
        print()

# Reports the best pair so it does not have to be read off the log above by hand
print("Best lower bound =", best_lower)
print("Best higher bound =", best_upper)
print("Correct:", best_correct)
print("Incorrect:", len(y_true) - best_correct)


Index(['Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (enrolled)', 'Tuition fees up to date',
       'Curricular units 2nd sem (evaluations)', 'Age at enrollment',
       'Unemployment rate', 'Course'],
      dtype='object')
[ 5.25532738e-02  3.38445161e-02 -7.91143716e-02  4.05875766e-01
  2.55109958e-02 -6.90196708e-03 -6.56108605e-03 -5.81592808e-05]
With lower bound = 0.1
With higher bound = 1.1
Correct: 414
Incorrect: 507

With lower bound = 0.1
With higher bound = 1.2000000000000002
Correct: 505
Incorrect: 416

With lower bound = 0.1
With higher bound = 1.3000000000000003
Correct: 519
Incorrect: 402

With lower bound = 0.1
With higher bound = 1.4000000000000004
Correct: 520
Incorrect: 401

With lower bound = 0.1
With higher bound = 1.5000000000000004
Correct: 520
Incorrect: 401

With lower bound = 0.1
With higher bound = 1.6000000000000005
Correct: 519
Incorrect: 402

With lower bound = 0.1
With higher bound = 1.

The best thresholds are 0.9 (lower bound) and 1.5 (higher bound).

With these thresholds: correct = 654, incorrect = 267

# Final Linear Regression model using the optimal thresholds found in the previous code cell

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Reads data stored in the student_dropout.csv file which contains our pruned data set
raw_data = pd.read_csv('student_dropout.csv')

# Separates the label column (y) from the feature columns (X) and keeps the feature names
y = raw_data['Target']
raw_data = raw_data.drop(columns=['Target'])
column_names = raw_data.columns
X = raw_data.loc[:, column_names]

# Creates the train test split for the data using a 75:25 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardizes the features. The scaler is fitted on the training split only and then
# re-used on the test split, so the test data does not influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Adds a leading column of ones to the input variables to act as the bias term
X_train_scaled = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_test_scaled = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

# Creates the Linear Regression model for the data.
# The model is fitted on the scaled, bias-augmented matrix built above; fitting on the raw
# X_train would leave the normalization and bias steps without any effect.
# fit_intercept=False because the column of ones already plays the role of the intercept.
model = LinearRegression(fit_intercept=False)
model.fit(X_train_scaled, y_train)

# Gets the continuous predictions from our fitted model
y_pred = model.predict(X_test_scaled)

# Sets the best thresholds found from the previous cell
thresh1 = 0.9
thresh2 = 1.5

# Assigns a class to every prediction based on the thresholds:
#   prediction <= thresh1             -> class 0
#   thresh1 < prediction < thresh2    -> class 1
#   prediction >= thresh2             -> class 2
# A prediction exactly equal to thresh2 is placed in class 2, matching the rule used when
# the thresholds were searched; otherwise it would keep its continuous value and not be a
# valid class label.
y_pred = np.where(y_pred <= thresh1, 0, np.where(y_pred < thresh2, 1, 2))

# Finds and prints the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.7101
