# Q5. Multiple Linear Regression

In [10]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [5]:
# Load the Q5 regression data (predictors X1, X2 and response Y).
# The folder defaults to the original author's downloads directory, but it can
# be redirected by setting the NOTEBOOK_DATA_DIR environment variable so the
# notebook is not tied to one machine's home directory.
data_dir = os.environ.get("NOTEBOOK_DATA_DIR", "~chetandeshpande/downloads")
df = pd.read_excel(os.path.join(data_dir, "MLR_data.xlsx"))

In [6]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,X1,X2,Y
0,15.31,57.3,74.8
1,15.2,63.8,74.0
2,16.25,65.4,72.9
3,14.33,57.0,70.0
4,14.57,63.8,74.9


In [7]:
# Print column dtypes and non-null counts to check for missing values before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      20 non-null     float64
 1   X2      20 non-null     float64
 2   Y       20 non-null     float64
dtypes: float64(3)
memory usage: 608.0 bytes


In [8]:
# Design matrix (predictors X1 and X2) and response vector (Y) for the regression.
predictor_cols = ['X1', 'X2']
X, y = df[predictor_cols], df['Y']

In [11]:
# Prepend an intercept column to the design matrix before fitting.
# The keyword arguments spell out the library defaults: with 'skip', X is left
# untouched if it already contains a constant column, so re-running this cell
# does not add a second one.
X = sm.add_constant(X, prepend=True, has_constant='skip')

In [12]:
# Specify the regression of Y on the design matrix (constant, X1, X2), then
# estimate it by ordinary least squares.
ols_spec = sm.OLS(endog=y, exog=X)
model = ols_spec.fit()

In [13]:
# Print the full regression table (coefficients, R-squared, F-statistic, ...).
summary_table = model.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.847
Model:                            OLS   Adj. R-squared:                  0.829
Method:                 Least Squares   F-statistic:                     46.99
Date:                Wed, 07 May 2025   Prob (F-statistic):           1.19e-07
Time:                        20:54:59   Log-Likelihood:                -50.526
No. Observations:                  20   AIC:                             107.1
Df Residuals:                      17   BIC:                             110.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         33.1094      7.253      4.565      0.0

In [14]:
# R-squared = 0.847: the model explains about 84.7% of the variance in Y (adjusted R-squared = 0.829)

In [19]:
# In-sample RMSE: predictions are made on the same design matrix X (including
# the constant column) that the model was fitted on.
# The `metrics` module imported at the top of the notebook is used directly,
# so no additional import is needed in the middle of the analysis.
y_pred = model.predict(X)
rmse = np.sqrt(metrics.mean_squared_error(y, y_pred))
print("RMSE:", rmse)

RMSE: 3.026422801154147


In [20]:
# RMSE = 3.0264 (in-sample: evaluated on the same 20 observations used to fit the model)

In [21]:
# F-statistic and test of overall model significance at α = 0.05

In [22]:
# Overall F-test of the fitted OLS model, read from the statsmodels results.
f_stat = model.fvalue
f_pval = model.f_pvalue

# Print the results
print("F-statistic:", f_stat)
print("p-value for F-statistic:", f_pval)

# Compare the p-value with the chosen significance level. The level shown in
# the message is formatted from `alpha`, so the text cannot drift from the
# threshold actually used in the comparison.
alpha = 0.05
if f_pval < alpha:
    print(f"Reject H0: Model is statistically significant at α = {alpha}")
else:
    print("Fail to reject H0: Model is not statistically significant")

F-statistic: 46.98986914191224
p-value for F-statistic: 1.1864362651573306e-07
Reject H0: Model is statistically significant at α = 0.05


In [23]:
# p-value = 1.186e-07 ≪ 0.05, so the model is statistically significant at α = 0.05

# Q7. Discriminant Analysis

In [43]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [26]:
# Load the Q7 discriminant-analysis data (Group label plus GRE and GMAT columns).
# The folder defaults to the original author's downloads directory, but it can
# be redirected by setting the NOTEBOOK_DATA_DIR environment variable so the
# notebook is not tied to one machine's home directory.
data_dir = os.environ.get("NOTEBOOK_DATA_DIR", "~chetandeshpande/downloads")
data = pd.read_csv(os.path.join(data_dir, "Discriminant_Analysis_Data_Q7.csv"))

In [27]:
# Preview the first five rows of the admissions data.
data.head(n=5)

Unnamed: 0,Group,GRE,GMAT
0,1,2.96,596
1,1,3.14,473
2,1,3.22,482
3,1,3.29,527
4,1,3.69,505


In [28]:
# Print column dtypes and non-null counts to check for missing values before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Group   85 non-null     int64  
 1   GRE     85 non-null     float64
 2   GMAT    85 non-null     int64  
dtypes: float64(1), int64(2)
memory usage: 2.1 KB


In [33]:
# Count distinct values per column; for Group this gives the number of classes.
data.nunique()

Group     3
GRE      67
GMAT     72
dtype: int64

In [59]:
# (i) Data Splitting

In [39]:
# The two score columns are the predictors; the Group code is the class label.
# NOTE: this rebinds X and y, which held the regression inputs in the Q5 section.
score_cols = ['GRE', 'GMAT']
X, y = data[score_cols], data['Group']

In [40]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified by Group. With such a small test
# set, stratify=y may be worth considering; it is left unchanged here so the
# recorded results stay valid.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Report how many rows ended up in each partition.
n_train, n_test = len(X_train), len(X_test)
print("Training set size:", n_train)
print("Testing set size:", n_test)

Training set size: 68
Testing set size: 17


In [42]:
# Full data set dimensions (rows, columns), for comparison with the split sizes.
data.shape

(85, 3)

In [60]:
# (ii) Linear Discriminant Analysis (LDA)

In [44]:
# Fit LDA with default settings on the training partition only.
# The fit call is kept as the cell's last expression so the estimator repr is displayed.
lda = LinearDiscriminantAnalysis()
lda.fit(X=X_train, y=y_train)

LinearDiscriminantAnalysis()

In [45]:
# Predicted Group labels for the held-out test rows.
y_pred_lda = lda.predict(X=X_test)

In [61]:
# (iii) Confusion Matrix and Accuracy

In [46]:
# Rows are true groups and columns are predicted groups.
cm_lda = confusion_matrix(y_true=y_test, y_pred=y_pred_lda)
print("LDA Confusion Matrix:")
print(cm_lda)

LDA Confusion Matrix:
[[6 0 0]
 [0 6 0]
 [0 0 5]]


In [47]:
# Per-class precision, recall and F1 for the LDA predictions on the test set.
report_lda = classification_report(y_true=y_test, y_pred=y_pred_lda)
print("\nLDA Classification Report:")
print(report_lda)


LDA Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00         5

    accuracy                           1.00        17
   macro avg       1.00      1.00      1.00        17
weighted avg       1.00      1.00      1.00        17



In [49]:
# Share of test rows whose predicted group matches the true group.
accuracy_lda = accuracy_score(y_true=y_test, y_pred=y_pred_lda)
print("LDA Accuracy:", accuracy_lda)

LDA Accuracy: 1.0


In [62]:
# (iv) Quadratic Discriminant Analysis (QDA)

In [52]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [53]:
# Fit QDA with default settings on the same training partition used for LDA.
# The fit call is kept as the cell's last expression so the estimator repr is displayed.
qda = QuadraticDiscriminantAnalysis()
qda.fit(X=X_train, y=y_train)

QuadraticDiscriminantAnalysis()

In [54]:
# Predicted Group labels from QDA for the held-out test rows.
y_pred_qda = qda.predict(X=X_test)

In [55]:
# Rows are true groups and columns are predicted groups.
cm_qda = confusion_matrix(y_true=y_test, y_pred=y_pred_qda)
print("QDA Confusion Matrix:\n", cm_qda)

QDA Confusion Matrix:
 [[6 0 0]
 [0 6 0]
 [0 0 5]]


In [56]:
# Share of test rows whose QDA prediction matches the true group.
accuracy_qda = accuracy_score(y_true=y_test, y_pred=y_pred_qda)
print("\nQDA Accuracy:", accuracy_qda)


QDA Accuracy: 1.0


In [58]:
# Per-class precision, recall and F1 for the QDA predictions on the test set.
# The label order is pinned explicitly so each display name is matched to its
# Group code even if a class happens to be missing from the test split;
# without `labels`, a missing class makes the three names mismatch the classes
# found and the call fails.
# NOTE(review): the meaning attached to codes 1/2/3 comes from the original
# author's names; confirm against the data dictionary.
class_labels = [1, 2, 3]
class_names = ["Admit (1)", "Not Admit (2)", "Borderline (3)"]
report_qda = classification_report(
    y_test,
    y_pred_qda,
    labels=class_labels,
    target_names=class_names,
)
print("\nQDA Classification Report:\n", report_qda)


QDA Classification Report:
                 precision    recall  f1-score   support

     Admit (1)       1.00      1.00      1.00         6
 Not Admit (2)       1.00      1.00      1.00         6
Borderline (3)       1.00      1.00      1.00         5

      accuracy                           1.00        17
     macro avg       1.00      1.00      1.00        17
  weighted avg       1.00      1.00      1.00        17



In [1]:
# Leftover scratch cell cleared: it evaluated the undefined name `test` and raised NameError.

In [2]:
# Leftover scratch cell cleared: it evaluated the undefined name `test` and raised NameError.