<h1>CS 556 Project</h1>

<h2>1. Imports</h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
# Apply the ggplot style sheet to every figure drawn below.
plt.style.use(['ggplot'])
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

<h2>2. Reading CSV </h2>

In [None]:
# Load the admissions dataset (path is relative to the current working directory).
df = pd.read_csv('College_Admissions.csv')

# Call head() so the first rows are rendered; the bare attribute `df.head`
# only displays the bound method's repr instead of a preview of the data.
df.head()

<h2>3. Data and Distribution</h2>

In [None]:
# Feature columns explored in this section (also reused later as the model
# inputs) and the histogram colour assigned to each, position by position.
datasets = ['GRE Score', 'CGPA', 'University Rating']
colors = ['purple', 'green', 'brown']

# One histogram per feature column.
for idx, dataset in enumerate(datasets):
    c = colors[idx]
    plt.hist(df[dataset], color=c, edgecolor='black', bins=30)
    plt.title(dataset)
    plt.xlabel(f'{dataset} Distribution')
    plt.ylabel('Count')
    plt.show()

# Summary statistics for the selected features.
feature_summary = df[datasets].describe()
print(feature_summary)

# Correlation matrix of the features together with the target column
# (the target's header ends with a trailing space).
corr_columns = datasets + ['Chance of Admit ']
print(df[corr_columns].corr())



<h2>4. Splitting Datasets</h2>

In [None]:
# Feature matrix and target vector (the target header ends with a trailing space).
X = df[datasets]
y = df['Chance of Admit ']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h2>5. Linear Regression Model 1: Without PCA </h2>

In [None]:
# Fit the baseline linear regression on the standardized (non-PCA) features.
model_original = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred_original = model_original.predict(X_train_scaled)
y_test_pred_original = model_original.predict(X_test_scaled)

# Mean squared error on each split, reported as soon as it is computed.
mse_train_original = mean_squared_error(y_train, y_train_pred_original)
print(f'Mean Squared Error (Training) for Original Model: {mse_train_original}')
mse_test_original = mean_squared_error(y_test, y_test_pred_original)
print(f'Mean Squared Error (Testing) for Original Model: {mse_test_original}')

# Write the test-set actual vs. predicted values to CSV.
original_predictions_df = pd.DataFrame(
    {'Actual': y_test.values, 'Predicted': y_test_pred_original}
)
original_predictions_df.to_csv('predictions_original.csv', index=False)


<h2>6. Linear Regression Model 2: With PCA </h2>

In [None]:
# Project the standardized features onto two principal components.
# The projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Scatter the training points in component space, coloured by the target value.
train_pc1 = X_train_pca[:, 0]
train_pc2 = X_train_pca[:, 1]
plt.scatter(train_pc1, train_pc2, c=y_train, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Admission Chances (Training Data)')
plt.colorbar(label='Chance of Admit')
plt.show()


<h2>7. Training Linear Regression Models </h2>

In [None]:
# Fit a linear regression on the two PCA components.
model_pca = LinearRegression().fit(X_train_pca, y_train)

# Predictions for both splits.
y_train_pred_pca = model_pca.predict(X_train_pca)
y_test_pred_pca = model_pca.predict(X_test_pca)

# Mean squared error on each split, reported as soon as it is computed.
mse_train_pca = mean_squared_error(y_train, y_train_pred_pca)
print(f'Mean Squared Error (Training) for PCA Model: {mse_train_pca}')
mse_test_pca = mean_squared_error(y_test, y_test_pred_pca)
print(f'Mean Squared Error (Testing) for PCA Model: {mse_test_pca}')


<h2>8. Creating Scatter Plot </h2>

In [None]:
# Write the PCA model's test-set actual vs. predicted values to CSV.
pca_predictions_df = pd.DataFrame(
    {'Actual': y_test.values, 'Predicted': y_test_pred_pca}
)
pca_predictions_df.to_csv('predictions_pca.csv', index=False)

# Test points in component space (blue markers with black edges).
test_pc1 = X_test_pca[:, 0]
test_pc2 = X_test_pca[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(test_pc1, test_pc2, c='blue', edgecolor='k', s=40)

# 100 x 100 grid covering the test points with a margin of 1 on every side.
x_min, x_max = test_pc1.min() - 1, test_pc1.max() + 1
y_min, y_max = test_pc2.min() - 1, test_pc2.max() + 1
x_axis = np.linspace(x_min, x_max, 100)
y_axis = np.linspace(y_min, y_max, 100)
xx, yy = np.meshgrid(x_axis, y_axis)

# Evaluate the PCA regression at every grid node and restore the grid shape.
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
grid_predictions = model_pca.predict(grid_points).reshape(xx.shape)

# The "decision boundary" drawn here is the contour line along which the
# regression predicts exactly 0.5.
plt.contour(xx, yy, grid_predictions, levels=[0.5], colors='black')
plt.title('PCA Scatter Plot with Decision Boundary')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

<h2>9. Summary Comparison </h2>

In [None]:
# Side-by-side report of the four MSE values computed in the earlier cells.
print("====== Model Performance Summary ======")
print(f'MSE (Training) Original Model: {mse_train_original}')
print(f'MSE (Testing) Original Model: {mse_test_original}')
print(f'MSE (Training) PCA Model: {mse_train_pca}')
print(f'MSE (Testing) PCA Model: {mse_test_pca}')

# The PCA model is declared better only on a strictly lower test MSE;
# a tie is reported in favour of the original model.
verdict = (
    "PCA-based model performed better on test data."
    if mse_test_pca < mse_test_original
    else "Original model performed better on test data."
)
print(verdict)