# Predictive Model for Student Starting Salaries

Authors: Eliska Patockova, Roshan Ravi, Alexa Muratyan

In [18]:
from itertools import combinations

import numpy as np
import pandas as pd
from matplotlib import pylab as plt
from scipy.stats import multivariate_normal as mvn

from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

Link to our dataset: https://www.kaggle.com/datasets/benroshan/factors-affecting-campus-placement

## Load the Dataset

In [19]:
# Read the campus-placement CSV (expected next to this notebook) and preview its first rows.
placement_csv = "./Placement_Data_Full_Class.csv"
data = pd.read_csv(placement_csv)
print(data.head())

   sl_no gender  ssc_p    ssc_b  hsc_p    hsc_b     hsc_s  degree_p  \
0      1      M  67.00   Others  91.00   Others  Commerce     58.00   
1      2      M  79.33  Central  78.33   Others   Science     77.48   
2      3      M  65.00  Central  68.00  Central      Arts     64.00   
3      4      M  56.00  Central  52.00  Central   Science     52.00   
4      5      M  85.80  Central  73.60  Central  Commerce     73.30   

    degree_t workex  etest_p specialisation  mba_p      status    salary  
0   Sci&Tech     No     55.0         Mkt&HR  58.80      Placed  270000.0  
1   Sci&Tech    Yes     86.5        Mkt&Fin  66.28      Placed  200000.0  
2  Comm&Mgmt     No     75.0        Mkt&Fin  57.80      Placed  250000.0  
3   Sci&Tech     No     66.0         Mkt&HR  59.43  Not Placed       NaN  
4  Comm&Mgmt     No     96.8        Mkt&Fin  55.50      Placed  425000.0  


## Preprocessing

Modify existing columns (One Hot Encoding)

In [20]:
# Column groups, one per preprocessing branch.
scaled_score_columns = ["ssc_p", "hsc_p", "degree_p", "etest_p", "mba_p"]
one_hot_columns = ["gender", "ssc_b", "hsc_b", "hsc_s", "degree_t", "workex", "specialisation", "status"]
passthrough_columns = ["sl_no", "salary"]

# Continuous scores are rescaled to [0, 1] with MinMaxScaler.
# Median imputation is currently switched off (left below as a comment); the null
# count displayed at the end of this cell reports no missing values in these columns.
score_scaling = Pipeline([
    # ('median_imputer', SimpleImputer(strategy='median')),
    ('standard_scaler', MinMaxScaler())
])

# Categorical string columns are one-hot encoded so that they become numeric 0/1 features.
category_encoding = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('rescale_continous', score_scaling, scaled_score_columns),
        ('encode_categorical', category_encoding, one_hot_columns),
        # sl_no and salary are forwarded unchanged (salary keeps its NaN entries).
        ('passthrough_metadata', 'passthrough', passthrough_columns),
    ],
    sparse_threshold=0,
    verbose_feature_names_out=False
)
preprocessor.fit(data)

# Apply the fitted preprocessor to the full dataset and label the resulting columns.
transformed_values = preprocessor.transform(data)
transformed_train_data = pd.DataFrame(
    transformed_values,
    columns=preprocessor.get_feature_names_out()
)

# Show summary statistics, a sample of rows, and the per-column null counts (largest first).
summary_stats = transformed_train_data.describe()
sample_rows = transformed_train_data.head()
null_counts = transformed_train_data.isnull().sum(axis=0).sort_values(ascending=False)
display(summary_stats, sample_rows, null_counts)

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender_F,gender_M,ssc_b_Central,ssc_b_Others,hsc_b_Central,...,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed,sl_no,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,...,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,0.544494,0.483248,0.399273,0.460428,0.41485,0.353488,0.646512,0.539535,0.460465,0.390698,...,0.051163,0.274419,0.655814,0.344186,0.55814,0.44186,0.311628,0.688372,108.0,288655.405405
std,0.223195,0.179531,0.179482,0.276582,0.218643,0.479168,0.479168,0.499598,0.499598,0.489045,...,0.220844,0.447262,0.476211,0.476211,0.497767,0.497767,0.46424,0.46424,62.209324,93457.45242
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,200000.0
25%,0.406308,0.39374,0.268293,0.208333,0.252436,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.5,240000.0
50%,0.53824,0.461285,0.390244,0.4375,0.404423,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,108.0,265000.0
75%,0.717584,0.593081,0.536585,0.697917,0.563906,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,161.5,300000.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,215.0,940000.0


Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender_F,gender_M,ssc_b_Central,ssc_b_Others,hsc_b_Central,...,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed,sl_no,salary
0,0.53824,0.889621,0.195122,0.104167,0.284483,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,270000.0
1,0.792414,0.68089,0.670244,0.760417,0.564843,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,200000.0
2,0.497011,0.510708,0.341463,0.520833,0.247001,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,3.0,250000.0
3,0.311482,0.247117,0.04878,0.333333,0.308096,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,4.0,
4,0.925788,0.602965,0.568293,0.975,0.160795,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,5.0,425000.0


salary                    67
hsc_s_Science              0
sl_no                      0
status_Placed              0
status_Not Placed          0
specialisation_Mkt&HR      0
specialisation_Mkt&Fin     0
workex_Yes                 0
workex_No                  0
degree_t_Sci&Tech          0
degree_t_Others            0
degree_t_Comm&Mgmt         0
ssc_p                      0
hsc_p                      0
hsc_s_Arts                 0
hsc_b_Others               0
hsc_b_Central              0
ssc_b_Others               0
ssc_b_Central              0
gender_M                   0
gender_F                   0
mba_p                      0
etest_p                    0
degree_p                   0
hsc_s_Commerce             0
dtype: int64

In [21]:
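# Earlier manual one-hot encoding of the categorical features via pd.get_dummies.
# Kept for reference only: the ColumnTransformer above now encodes these same columns.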
# for categorical_feature in ["gender", "ssc_b", "hsc_b", "hsc_s", "degree_t", "workex", "specialisation", "status"]:
#     one_hot_encoded = pd.get_dummies(train[categorical_feature], prefix=categorical_feature)
#     train = pd.concat([train, one_hot_encoded], axis=1)

# train.isnull().sum(axis=0).sort_values(ascending=False)

Analyze data distributions

In [22]:
# First, we want to see the distributions of the data to identify any variables with uninformative distributions



Discard the least relevant columns

In [23]:
# code

## KNN Algorithm

In [24]:
# Split dataset into our 3 versions
# Salary is only recorded for placed students (it is NaN otherwise), so restrict the
# modelling frame to rows with a known salary; a NaN target cannot be fitted.
placed_data = transformed_train_data.dropna(subset=["salary"])

# Target: select the column by name. (Indexing with data[data["salary"]] would use the
# salary *values* as column labels and raise a KeyError.)
# NOTE(review): salary is continuous, while the KNN cells below use KNeighborsClassifier,
# which treats every distinct salary as its own class - confirm this is intended.
y = placed_data["salary"]

# NOTE(review): the original cell used placeholder names ("col1" ... "col9") on an
# undefined `df`. The three groupings below are a proposed starting point built from
# real preprocessed columns - confirm or adjust before comparing the models.
# Multiple columns must be selected with a list (double brackets), not a tuple.
X1 = placed_data[["ssc_p", "hsc_p", "degree_p"]]
X2 = placed_data[["etest_p", "mba_p", "workex_Yes"]]
X3 = placed_data[["gender_M", "specialisation_Mkt&Fin", "degree_t_Sci&Tech"]]

In [None]:
# Do 3 different versions of KNN with 3 sets of parameters,
# compare using some graphs, etc, choose most effective one

# One hold-out fraction and one seed for every feature set, so the three KNN
# variants are compared under the same split settings.
TEST_FRACTION = 0.2
SPLIT_SEED = 4

# The former `dataN = data["..."]` placeholders were dropped: "..." is not a column of
# `data`, so each of them raised a KeyError, and nothing in this cell used the results.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

Choose the optimal number of neighbors for each of our 3 versions of KNN

In [None]:
def optimal_k(X_train, y_train, X_test, y_test, max_k=25):
    """Score a KNN classifier on the test split for every k from 1 to ``max_k``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``KNeighborsClassifier.fit``.
    X_test, y_test
        Held-out features and labels used to compute the accuracy for each k.
    max_k : int, default 25
        Largest number of neighbours to try (inclusive).

    Returns
    -------
    k_values : list of int
        The tried values of k, in increasing order.
    scores_list : list of float
        Test accuracy for each entry of ``k_values`` (same order, same length).
    """
    # A range, not the tuple (1, 25): iterating the tuple would only try k=1 and k=25.
    k_values = list(range(1, max_k + 1))
    scores_list = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        # Compute the accuracy once per k and keep it in plotting order.
        scores_list.append(metrics.accuracy_score(y_test, y_pred))
    return k_values, scores_list

# Accuracy-versus-k curve for the first feature set (the X1 split).
k_range, scores = optimal_k(X_train1, y_train1, X_test1, y_test1)
plt.plot(k_range, scores)  # was `plt,plot(...)`: a tuple expression that raised NameError
plt.xlabel("Value of K for Dataset 1")  # label now matches the split actually plotted
plt.ylabel("Accuracy")  # was `pltylabel(...)`: NameError
plt.show()

Determine the optimal column configuration for the algorithm