In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the admissions dataset: the first CSV row supplies the column names
# and the first CSV column becomes the row index (so it is not a feature).
data = pd.read_csv("adm_data.csv", header=0, index_col=0)
# Regression target shared by every model below. The column name in this
# file ends with a space, and that space must be kept in the lookup key.
y = np.array(data['Chance of Admit '])

In [2]:
# Show the exact column labels; some of them (e.g. 'LOR ' and
# 'Chance of Admit ') end with a space that lookups must reproduce.
data.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [3]:
# Part (a): linear regression of the admission chance on the GRE score alone.
# scikit-learn expects a 2-D design matrix, so the single feature is turned
# into one column of shape (n_samples, 1).
x_a = np.array(data['GRE Score']).reshape(-1, 1)

print("Model with GRE Score feature only")
for test_size in (0.1, 0.25, 0.5):
    # random_state is pinned so every split is reproducible between runs.
    x_train_a, x_test_a, y_train_a, y_test_a = train_test_split(
        x_a, y, test_size=test_size, random_state=0
    )

    # fit() returns the estimator itself, so a second handle is unnecessary.
    model_a = LinearRegression().fit(x_train_a, y_train_a)

    # With a 1-D target, coef_ is already a 1-D array of per-feature weights;
    # transposing it would be a no-op.
    w_a = model_a.coef_
    b_a = model_a.intercept_
    print(f"With test_size = {test_size}")
    print(f"w: {w_a}, b: {b_a}")

    predicted_y_train_a = model_a.predict(x_train_a)
    predicted_y_test_a = model_a.predict(x_test_a)

    # Root-mean-square error on the fitted split and on the held-out split.
    rms_train_a = np.sqrt(np.mean((y_train_a - predicted_y_train_a) ** 2))
    rms_test_a = np.sqrt(np.mean((y_test_a - predicted_y_test_a) ** 2))
    print(f"RMS train: {rms_train_a}")
    print(f"RMS test : {rms_test_a}\n")


Model with GRE Score feature only
With test_size = 0.1
w: [0.01002557], b: -2.451049501931883
RMS train: 0.08441050324396342
RMS test : 0.08981132566467258

With test_size = 0.25
w: [0.01013045], b: -2.48167469526203
RMS train: 0.083021022942522
RMS test : 0.09085098944853723

With test_size = 0.5
w: [0.01014446], b: -2.488992432700206
RMS train: 0.08580677867657648
RMS test : 0.08415315339607578



In [4]:
# Part (b): linear regression on two features. The column order below is the
# order of the weights printed as `w`.
feature_names_b = ['GRE Score', 'University Rating']
x_b = np.column_stack([np.array(data[name]) for name in feature_names_b])

print("Model with GRE Score, University Rating features")
for test_size in (0.1, 0.25, 0.5):
    # random_state is pinned so every split is reproducible between runs.
    x_train_b, x_test_b, y_train_b, y_test_b = train_test_split(
        x_b, y, test_size=test_size, random_state=0
    )

    # fit() returns the estimator itself, so a second handle is unnecessary.
    model_b = LinearRegression().fit(x_train_b, y_train_b)

    # With a 1-D target, coef_ is already a 1-D array of per-feature weights;
    # transposing it would be a no-op.
    w_b = model_b.coef_
    b_b = model_b.intercept_
    print(f"With test_size = {test_size}")
    print(f"w: {w_b}, b: {b_b}")

    predicted_y_train_b = model_b.predict(x_train_b)
    predicted_y_test_b = model_b.predict(x_test_b)

    # Root-mean-square error on the fitted split and on the held-out split.
    rms_train_b = np.sqrt(np.mean((y_train_b - predicted_y_train_b) ** 2))
    rms_test_b = np.sqrt(np.mean((y_test_b - predicted_y_test_b) ** 2))
    print(f"RMS train: {rms_train_b}")
    print(f"RMS test : {rms_test_b}\n")

Model with GRE Score, University Rating features
With test_size = 0.1
w: [0.00722614 0.04077061], b: -1.6903642328905972
RMS train: 0.07727599733246558
RMS test : 0.0854296668738817

With test_size = 0.25
w: [0.00740224 0.03959183], b: -1.7410227940803304
RMS train: 0.07608153388498747
RMS test : 0.08403752784766393

With test_size = 0.5
w: [0.00755529 0.03825881], b: -1.7881302976881885
RMS train: 0.07831472114325826
RMS test : 0.07796852504725114



In [11]:
# Part (c): linear regression on every predictor column. The names are listed
# explicitly (trailing space in 'LOR ' included) so the order of the printed
# weights `w` is unambiguous: w[i] belongs to feature_names_c[i].
feature_names_c = [
    'GRE Score',
    'TOEFL Score',
    'University Rating',
    'SOP',
    'LOR ',
    'CGPA',
    'Research',
]
x_c = np.column_stack([np.array(data[name]) for name in feature_names_c])

print("Model with all available features")
for test_size in (0.1, 0.25, 0.5):
    # random_state is pinned so every split is reproducible between runs.
    x_train_c, x_test_c, y_train_c, y_test_c = train_test_split(
        x_c, y, test_size=test_size, random_state=0
    )

    # fit() returns the estimator itself, so a second handle is unnecessary.
    model_c = LinearRegression().fit(x_train_c, y_train_c)

    # With a 1-D target, coef_ is already a 1-D array of per-feature weights;
    # transposing it would be a no-op.
    w_c = model_c.coef_
    b_c = model_c.intercept_
    print(f"With test_size = {test_size}")
    print(f"w: {w_c}, b: {b_c}")

    predicted_y_train_c = model_c.predict(x_train_c)
    predicted_y_test_c = model_c.predict(x_test_c)

    # Root-mean-square error on the fitted split and on the held-out split.
    rms_train_c = np.sqrt(np.mean((y_train_c - predicted_y_train_c) ** 2))
    rms_test_c = np.sqrt(np.mean((y_test_c - predicted_y_test_c) ** 2))
    print(f"RMS train: {rms_train_c}")
    print(f"RMS test : {rms_test_c}\n")


Model with all available features
With test_size = 0.1
w: [ 0.00184368  0.0027815   0.00782288 -0.00537863  0.02146922  0.1214326
  0.02145131], b: -1.2944241798749028
RMS train: 0.06203596955517815
RMS test : 0.0727361613459112

With test_size = 0.25
w: [ 0.00191797  0.00231299  0.00481521 -0.0057828   0.02135158  0.13035403
  0.02141325], b: -1.3339694610647306
RMS train: 0.06113112049751964
RMS test : 0.06929284939167152

With test_size = 0.5
w: [0.00151761 0.00170993 0.00070853 0.00217073 0.0192217  0.14201253
 0.01807171], b: -1.2512703159215266
RMS train: 0.06186814213177347
RMS test : 0.06560652615131152



\begin{table}
\begin{tabular}{|l|c|c|c|c|c|c|}
\hline
           & \multicolumn{2}{c|}{GRE\_Score} & \multicolumn{2}{c|}{GRE\_Score, Univ\_Rating} & \multicolumn{2}{c|}{All\_Features} \\ \hline
test\_size & RMS train    & RMS test    & RMS train    & RMS test    & RMS train    & RMS test   \\ \hline
0.1        & 0.08441      & 0.08981     & 0.07728      & 0.08543     & 0.06204      & 0.07274    \\ \hline
0.25       & 0.08302      & 0.09085     & 0.07608      & 0.08404     & 0.06113      & 0.06929    \\ \hline
0.5        & 0.08581      & 0.08415     & 0.07831      & 0.07797     & 0.06187      & 0.06561    \\ \hline
\end{tabular}
\end{table}

* The RMS values for both training and testing decrease with the addition of more features. The model with all available features consistently shows the lowest RMS values across all test sizes, indicating better performance.
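* For all three models the RMS test value is lowest at the largest test size (0.5), and for the two-feature and all-feature models it decreases steadily as the test size grows. This should not be read as the models generalizing better when given less training data: a test set holding only 10% or 25% of the rows yields a noisier error estimate than one holding 50%, and every figure here comes from a single split (random_state=0), so the differences may largely reflect sampling variation. Repeating the experiment over several random splits, or using cross-validation, would be needed to confirm any trend.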
* The RMS train values are relatively consistent across different test sizes for each model, suggesting stable training performance regardless of the size of the test set.
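* The model with all available features has the lowest RMS values on both the training and the test data at every test size. Its train–test gap, however, is not the smallest: it is about 0.011, 0.008 and 0.004 at test sizes 0.1, 0.25 and 0.5, which is no smaller than the gaps of the two simpler models (whose test RMS at test size 0.5 is even slightly below their training RMS). The gap does shrink as the test set grows, and the absolute test error remains the best of the three, so the model is still the preferred one, but the larger gap is a mild sign that the extra parameters fit the training data somewhat more closely than they generalize.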
* These insights demonstrate the importance of feature selection in model performance and suggest that the inclusion of more relevant features can significantly enhance predictive accuracy.