In [8]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
datapath="D:/Machine Learning Datasets/Linear Regression/Admission_Predict.csv"
data=pd.read_csv(datapath)
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [12]:
data.columns=data.columns.str.strip(" ")
data.columns=data.columns.str.replace(" ","_")
data.columns

Index(['Serial_No.', 'GRE_Score', 'TOEFL_Score', 'University_Rating', 'SOP',
       'LOR', 'CGPA', 'Research', 'Chance_of_Admit'],
      dtype='object')

In [3]:
# Normalization and Standardization
def normalize(X, columns):
    """
     Applies feature scaling to the dataframe.
    
    :param X: unnormalized features - data frame of floats
    :param columns: columns to be scaled - list of strings
    
    :return: normalized features - data frame of floats
    
    """
    
    for column in columns:
        
        # Use this if you want Z-Score Normalization (or Standardization).
        # Note that you must play with the learning rate
        # and convergence threshold for better results.        
        # X[column] = (X[column] - X[column].mean()) / X[column].std()
        
        # Use this if you want Mean Normalization.
        # Note that you must play with the learning rate
        # and convergence threshold for better results.        
        # X[column] = (X[column] - X[column].mean()) / (X[column].max() - X[column].min()) or
                
        # Use this if you want Min-Max Scaling (or Min-Max Normalization).
        # Note that you must play with the learning rate
        # and convergence threshold for better results.  
        # X[column] = (X[column] - X[column].min()) / (X[column].max() - X[column].min())
        
        # We will use Min-Max Scaling.
        X[column] = (X[column] - X[column].min()) / (X[column].max() - X[column].min())
        
    return X

In [45]:
# hypothesis function
def h(W,X,b,i):
    y_= np.dot(W,X[i]) + b
#     print("from h {}".format(y_))
    return y_
    

In [104]:
# cost function
def J(W,X,b,Y):
    cost=0
    m,n=X.shape
    for i in range(m):
        cost= cost+(h(W,X,b,i) - Y[i])**2
    cost = cost/(2*m)
#     print("from J {}".format(cost))
    return cost

In [105]:
def G(W,X,b,Y,alpha):
    m,n=X.shape
    dj_dw=np.zeros((n,))
    dj_db=0
    for i in range(m):
        e = h(W,X,b,i) - Y[i]
        for j in range(n):
            dj_dw[j]+=e*X[i][j]
        dj_db+=e
    dj_dw=dj_dw/m
    dj_db=dj_db/m
    
    #now calculate the w,b note w is a vector 
    W= W- alpha*dj_dw
    b= b- alpha*dj_db
#     print("from G {}{}".format(W,b))
    return W,b
    

In [142]:
def fit_line(X,Y):
    m,n=X.shape
    w=np.zeros((n,))
    b=0
    alpha=0.01
    convergence_cost=-1
    i=0
    threshold = 0.01
    cost_diff=J(w,X,b,Y)
    print(cost_diff)
    while cost_diff >= threshold:
        
        i+=1
        i_cost=J(w,X,b,Y)
        w,b=G(w,X,b,Y,alpha)
        n_cost=J(w,X,b,Y)
        cost_diff = i_cost - n_cost
        print("Iteration {} : J={} new_cost={}".format(i,cost_diff,n_cost))
        if i%1000==0:
            print("Iteration {} : J={} new_cost={}".format(i,cost_diff,n_cost))
#             showLineFit(w,b)
#             plt.pause(1)
#             updatePlot(w,b)
#             plt.show()
    return w,b,cost_diff

In [138]:
# predict function

# def fit_line(X,Y):
#     m,n=X.shape
#     w=np.zeros((n,))
#     b=0
#     alpha=0.01
#     convergence_cost=-1
#     i=0
#     while True:
#         i+=1
#         w,b=G(w,X,b,Y,alpha)
#         cost=J(w,X,b,Y)
#         print("Iteration {} : J={}".format(i,cost))
#         if convergence_cost==cost:
#             break
#         else:
# #             convergence=cost
#         if i%1000==0:
#             print("Iteration {} : J={}".format(i,cost))
# #             showLineFit(w,b)
# #             plt.pause(1)
# #             updatePlot(w,b)
# #             plt.show()
#     return w,b,cost

In [143]:
from sklearn.model_selection import train_test_split

X= data.drop(['Chance_of_Admit','Serial_No.'],axis=1)
Y= data['Chance_of_Admit'].to_numpy()

# Select columns to be scaled
columns = ['GRE_Score', 'TOEFL_Score', 'CGPA']

# Min-max scaling
X = normalize(X, columns)
X=X.to_numpy()
# Instead of finding probabilities, we want to calculate the percentages.
Y = Y * 100
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(X_train)
print(y_train)

[[0.64       0.64285714 3.         ... 2.5        0.59935897 1.        ]
 [0.56       0.64285714 3.         ... 3.         0.64102564 0.        ]
 [1.         1.         5.         ... 4.5        0.99679487 1.        ]
 ...
 [0.32       0.46428571 2.         ... 3.         0.45512821 1.        ]
 [0.24       0.25       1.         ... 2.         0.14423077 0.        ]
 [0.48       0.5        2.         ... 3.5        0.46474359 0.        ]]
[80. 63. 97. 74. 58. 56. 80. 66. 93. 64. 64. 83. 47. 76. 71. 66. 79. 78.
 59. 72. 71. 61. 66. 61. 79. 68. 62. 71. 73. 62. 77. 53. 56. 46. 86. 48.
 46. 93. 74. 80. 65. 68. 65. 42. 97. 85. 73. 75. 64. 81. 84. 81. 77. 82.
 71. 77. 82. 71. 84. 80. 64. 78. 56. 77. 73. 94. 42. 69. 78. 69. 75. 71.
 78. 93. 52. 86. 57. 96. 54. 67. 38. 64. 92. 64. 63. 74. 47. 68. 83. 71.
 34. 78. 61. 48. 97. 63. 86. 70. 68. 95. 58. 80. 57. 68. 57. 70. 67. 74.
 58. 78. 64. 75. 89. 72. 72. 75. 94. 59. 73. 65. 75. 97. 80. 82. 81. 92.
 52. 84. 81. 89. 76. 62. 73. 69. 71. 45. 51. 

In [144]:
#get the training data into X,Y

X= X_train
Y= y_train
m,n=X.shape
w=np.zeros((n))
# print(m,n,w)
w,b,cost=fit_line(X,Y)
# print(cost)

2724.6546875
Iteration 1 : J=1611.3996959686085 new_cost=1113.2549915313914
Iteration 2 : J=631.4811682303174 new_cost=481.773823301074
Iteration 3 : J=247.5018195387158 new_cost=234.2720037623582
Iteration 4 : J=97.0400533688265 new_cost=137.2319503935317
Iteration 5 : J=38.08168860826241 new_cost=99.15026178526928
Iteration 6 : J=14.978751092888075 new_cost=84.1715106923812
Iteration 7 : J=5.925692915780914 new_cost=78.24581777660029
Iteration 8 : J=2.3780535264119607 new_cost=75.86776425018833
Iteration 9 : J=0.9877030551465822 new_cost=74.88006119504175
Iteration 10 : J=0.44268346505414513 new_cost=74.4373777299876
Iteration 11 : J=0.2289065984061267 new_cost=74.20847113158148
Iteration 12 : J=0.1449280213734454 new_cost=74.06354311020803
Iteration 13 : J=0.11181212104544613 new_cost=73.95173098916258
Iteration 14 : J=0.09862812221209083 new_cost=73.8531028669505
Iteration 15 : J=0.09325584985937496 new_cost=73.75984701709112
Iteration 16 : J=0.09094602532063334 new_cost=73.6689009

Iteration 170 : J=0.05682395373444393 new_cost=62.79891758758722
Iteration 171 : J=0.056692715228074064 new_cost=62.742224872359145
Iteration 172 : J=0.0565621476363134 new_cost=62.68566272472283
Iteration 173 : J=0.05643224583305084 new_cost=62.62923047888978
Iteration 174 : J=0.05630300473543315 new_cost=62.57292747415435
Iteration 175 : J=0.05617441930470335 new_cost=62.516753054849644
Iteration 176 : J=0.056046484544481245 new_cost=62.46070657030516
Iteration 177 : J=0.05591919550133184 new_cost=62.40478737480383
Iteration 178 : J=0.05579254726412586 new_cost=62.348994827539705
Iteration 179 : J=0.05566653496320839 new_cost=62.2933282925765
Iteration 180 : J=0.05554115377092472 new_cost=62.23778713880557
Iteration 181 : J=0.05541639890050476 new_cost=62.18237073990507
Iteration 182 : J=0.05529226560585698 new_cost=62.12707847429921
Iteration 183 : J=0.0551687491813766 new_cost=62.071909725117834
Iteration 184 : J=0.055045844961725265 new_cost=62.01686388015611
Iteration 185 : J=0.0

Iteration 344 : J=0.04089270006704737 new_cost=54.469207686993954
Iteration 345 : J=0.04082799418578986 new_cost=54.428379692808164
Iteration 346 : J=0.0407634891555162 new_cost=54.38761620365265
Iteration 347 : J=0.04069918376360704 new_cost=54.34691701988904
Iteration 348 : J=0.04063507680725564 new_cost=54.306281943081785
Iteration 349 : J=0.040571167093141014 new_cost=54.265710775988644
Iteration 350 : J=0.04050745343683815 new_cost=54.225203322551806
Iteration 351 : J=0.04044393466382701 new_cost=54.18475938788798
Iteration 352 : J=0.04038060960871093 new_cost=54.14437877827927
Iteration 353 : J=0.04031747711485423 new_cost=54.104061301164414
Iteration 354 : J=0.04025453603514961 new_cost=54.063806765129264
Iteration 355 : J=0.04019178523136446 new_cost=54.0236149798979
Iteration 356 : J=0.04012922357397741 new_cost=53.98348575632392
Iteration 357 : J=0.04006684994252652 new_cost=53.943418906381396
Iteration 358 : J=0.04000466322538898 new_cost=53.90341424315601
Iteration 359 : J=

Iteration 512 : J=0.0321356370664887 new_cost=48.39049687516504
Iteration 513 : J=0.03209307225827018 new_cost=48.35840380290677
Iteration 514 : J=0.03205059290662149 new_cost=48.32635321000015
Iteration 515 : J=0.03200819865993765 new_cost=48.29434501134021
Iteration 516 : J=0.03196588916910059 new_cost=48.26237912217111
Iteration 517 : J=0.0319236640870173 new_cost=48.230455458084094
Iteration 518 : J=0.03188152306931613 new_cost=48.19857393501478
Iteration 519 : J=0.03183946577367891 new_cost=48.1667344692411
Iteration 520 : J=0.031797491860068305 new_cost=48.13493697738103
Iteration 521 : J=0.03175560099075625 new_cost=48.103181376390275
Iteration 522 : J=0.03171379283039499 new_cost=48.07146758355988
Iteration 523 : J=0.03167206704551262 new_cost=48.03979551651437
Iteration 524 : J=0.03163042330529464 new_cost=48.00816509320907
Iteration 525 : J=0.03158886128065319 new_cost=47.97657623192842
Iteration 526 : J=0.03154738064514362 new_cost=47.945028851283276
Iteration 527 : J=0.0315

Iteration 688 : J=0.02571715922954354 new_cost=43.33125323024505
Iteration 689 : J=0.02568576577886006 new_cost=43.30556746446619
Iteration 690 : J=0.025654420066757666 new_cost=43.279913044399436
Iteration 691 : J=0.025623121968017415 new_cost=43.25428992243142
Iteration 692 : J=0.025591871358386697 new_cost=43.22869805107303
Iteration 693 : J=0.0255606681142595 new_cost=43.20313738295877
Iteration 694 : J=0.02552951211227139 new_cost=43.1776078708465
Iteration 695 : J=0.025498403230145072 new_cost=43.152109467616356
Iteration 696 : J=0.02546734134587325 new_cost=43.12664212627048
Iteration 697 : J=0.025436326338116544 new_cost=43.101205799932366
Iteration 698 : J=0.025405358086196372 new_cost=43.07580044184617
Iteration 699 : J=0.025374436470080752 new_cost=43.05042600537609
Iteration 700 : J=0.02534356137014271 new_cost=43.025082444005946
Iteration 701 : J=0.0253127326673237 new_cost=42.99976971133862
Iteration 702 : J=0.02528195024358837 new_cost=42.974487761095034
Iteration 703 : 

Iteration 862 : J=0.0208838637397335 new_cost=39.29656816780123
Iteration 863 : J=0.020859306854703163 new_cost=39.27570886094653
Iteration 864 : J=0.02083478255433846 new_cost=39.25487407839219
Iteration 865 : J=0.020810290778101148 new_cost=39.23406378761409
Iteration 866 : J=0.020785831465531146 new_cost=39.21327795614856
Iteration 867 : J=0.020761404556594698 new_cost=39.192516551591964
Iteration 868 : J=0.0207370099911941 new_cost=39.17177954160077
Iteration 869 : J=0.020712647709871135 new_cost=39.1510668938909
Iteration 870 : J=0.02068831765280521 new_cost=39.130378576238094
Iteration 871 : J=0.020664019760673114 new_cost=39.10971455647742
Iteration 872 : J=0.020639753974599273 new_cost=39.08907480250282
Iteration 873 : J=0.020615520235367057 new_cost=39.068459282267455
Iteration 874 : J=0.020591318484378007 new_cost=39.04786796378308
Iteration 875 : J=0.020567148662969714 new_cost=39.02730081512011
Iteration 876 : J=0.020543010712920307 new_cost=39.00675780440719
Iteration 877 

Iteration 1037 : J=0.017035884859559758 new_cost=35.993066184067985
Iteration 1038 : J=0.017016254318157564 new_cost=35.97604992974983
Iteration 1039 : J=0.016996648131062386 new_cost=35.959053281618765
Iteration 1040 : J=0.01697706626138995 new_cost=35.942076215357375
Iteration 1041 : J=0.01695750867252599 new_cost=35.92511870668485
Iteration 1042 : J=0.016937975328076504 new_cost=35.90818073135677
Iteration 1043 : J=0.016918466191214065 new_cost=35.89126226516556
Iteration 1044 : J=0.01689898122590705 new_cost=35.87436328393965
Iteration 1045 : J=0.016879520395491454 new_cost=35.85748376354416
Iteration 1046 : J=0.016860083664013814 new_cost=35.840623679880146
Iteration 1047 : J=0.016840670995186713 new_cost=35.82378300888496
Iteration 1048 : J=0.0168212823530709 new_cost=35.80696172653189
Iteration 1049 : J=0.016801917701549485 new_cost=35.79015980883034
Iteration 1050 : J=0.016782577004860855 new_cost=35.77337723182548
Iteration 1051 : J=0.01676326022710839 new_cost=35.756613971598

Iteration 1204 : J=0.014069562273178349 new_cost=33.40564023925523
Iteration 1205 : J=0.014053546778029613 new_cost=33.3915866924772
Iteration 1206 : J=0.014037550491408979 new_cost=33.37754914198579
Iteration 1207 : J=0.014021573387367425 new_cost=33.36352756859842
Iteration 1208 : J=0.014005615440133568 new_cost=33.34952195315829
Iteration 1209 : J=0.013989676623815228 new_cost=33.33553227653447
Iteration 1210 : J=0.01397375691269076 new_cost=33.32155851962178
Iteration 1211 : J=0.013957856281059833 new_cost=33.30760066334072
Iteration 1212 : J=0.013941974703158166 new_cost=33.293658688637564
Iteration 1213 : J=0.013926112153512804 new_cost=33.27973257648405
Iteration 1214 : J=0.013910268606451837 new_cost=33.2658223078776
Iteration 1215 : J=0.01389444403648099 new_cost=33.25192786384112
Iteration 1216 : J=0.013878638418134415 new_cost=33.238049225422984
Iteration 1217 : J=0.013862851725953362 new_cost=33.22418637369703
Iteration 1218 : J=0.013847083934564353 new_cost=33.210339289762

Iteration 1378 : J=0.011551221329295203 new_cost=31.185428762490467
Iteration 1379 : J=0.011538194813688563 new_cost=31.17389056767678
Iteration 1380 : J=0.011525183613191103 new_cost=31.162365384063587
Iteration 1381 : J=0.011512187708405008 new_cost=31.150853196355182
Iteration 1382 : J=0.011499207080134966 new_cost=31.139353989275047
Iteration 1383 : J=0.01148624170893342 new_cost=31.127867747566114
Iteration 1384 : J=0.011473291575747169 new_cost=31.116394455990367
Iteration 1385 : J=0.011460356661160631 new_cost=31.104934099329206
Iteration 1386 : J=0.011447436946305345 new_cost=31.0934866623829
Iteration 1387 : J=0.011434532411680465 new_cost=31.08205212997122
Iteration 1388 : J=0.011421643038399765 new_cost=31.07063048693282
Iteration 1389 : J=0.011408768807314118 new_cost=31.059221718125507
Iteration 1390 : J=0.011395909699352558 new_cost=31.047825808426154
Iteration 1391 : J=0.011383065695394379 new_cost=31.03644274273076
Iteration 1392 : J=0.011370236776691911 new_cost=31.025

In [145]:
print(w,b,cost)

[8.59474288 7.86073983 2.44910918 2.02142552 5.85293494 8.59898379
 2.74338256] 21.877047733799674 0.009990797660517359


In [146]:
#plot the line
# check the prediction and calculate accuracy
# learn feature scaling
# check convergence
# model performance using curve or cost performance
# documentation
