In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle

# **fit(), transform(), fit_transform()**

In [7]:
# Read the admissions dataset from the working directory and preview the
# first five rows to confirm the columns were parsed as expected.
data = pd.read_csv(filepath_or_buffer='Admission_Prediction.csv')
data.head(n=5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [8]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [9]:
# Drop the row-identifier column; `columns=` already selects the axis, so the
# extra `axis=1` argument is not needed.
data = data.drop(columns=['Serial No.'])

# Forward-fill the three columns reported as containing missing values: each
# NaN takes the value of the closest preceding non-missing row.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in recent
# pandas releases. A NaN in the very first row has no predecessor and would
# stay NaN.
columns_with_missing = ['GRE Score', 'TOEFL Score', 'University Rating']
data[columns_with_missing] = data[columns_with_missing].ffill()

In [10]:
# Feature matrix: every column except the target.
X = data.drop(columns=['Chance of Admit'])
# Target vector: the value the regression will predict.
Y = data.loc[:, 'Chance of Admit']

In [21]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=155,
)

In [22]:
# Apply feature scaling only after the split.
# The scaler learns its statistics from the training split alone; the test
# split is then transformed with those same training statistics.

scalar = StandardScaler()
x_train_scaled = scalar.fit(x_train).transform(x_train)
x_test_scaled = scalar.transform(x_test)

In [23]:
# Overall mean and standard deviation of the scaled training data
# (computed over all elements of the array, not per feature).
np.mean(x_train_scaled), np.std(x_train_scaled)

(2.8557050903882124e-16, 1.0)

In [24]:
# Overall mean and standard deviation of the scaled test data
# (computed over all elements of the array, not per feature).
np.mean(x_test_scaled), np.std(x_test_scaled)

(0.09301432655881314, 1.02951862108369)

In [25]:
# Build the model: fit an ordinary linear regression on the scaled training
# features, then display its predictions for the scaled test features.

regression_model = LinearRegression().fit(x_train_scaled, y_train)
regression_model.predict(x_test_scaled)

array([0.71294446, 0.57206431, 0.5204978 , 0.72174353, 0.91282926,
       0.64389547, 0.59855718, 0.94943576, 0.70430261, 0.57545354,
       0.75274616, 0.84554981, 0.72468097, 0.77806564, 0.93438904,
       0.82149464, 0.80955992, 0.73648893, 0.69606187, 0.74289015,
       0.72985204, 0.7028358 , 0.59605122, 0.61209012, 0.86653854,
       0.66348958, 0.88661898, 0.8051504 , 0.74095603, 0.72454538,
       0.57152626, 0.62348754, 0.78440587, 0.6044038 , 0.81680677,
       0.59963444, 0.79592632, 0.99983367, 0.7435772 , 0.57769779,
       0.76782679, 0.77964102, 0.85505229, 0.49473367, 0.63465677,
       0.79886854, 0.88311211, 0.42136308, 0.7864413 , 0.6097748 ,
       0.58741813, 0.84613425, 0.94419784, 0.64125364, 0.89340647,
       0.60270652, 0.49796634, 0.58535127, 0.44185485, 0.88440771,
       0.69230315, 0.45074964, 0.69856695, 0.87943815, 0.63759465,
       0.59260414, 0.77185408, 0.77391621, 0.78522043, 0.51225833,
       0.84865906, 0.52318639, 0.69673027, 0.67331944, 0.77476

In [26]:
# R-squared of the fitted model on the held-out test split.

y_predict_for_test = regression_model.predict(x_test_scaled)
r2_score_test_data = r2_score(y_true=y_test, y_pred=y_predict_for_test)
print(f"Accuracy For Test data (R-Square):  {r2_score_test_data}")

Accuracy For Test data (R-Square):  0.8412555362396706


In [27]:
# R-squared of the fitted model on the training split, for comparison with
# the test-split score.

y_predict_for_train = regression_model.predict(x_train_scaled)
r2_score_train_data = r2_score(y_train, y_predict_for_train)
# The label says "Train" because this score is computed on the training split.
print(f"Accuracy For Train data (R-Square):  {r2_score_train_data}")

Accuracy For Test data (R-Square):  0.8130686598488777


**Note:**

Here the accuracy improves slightly (after applying the transformation technique) compared to the previous example (Linear Regression 01.ipynb file).