In [9]:
import pandas as pd
import numpy as np

# Load data
file_path = r'C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\Predicted_data.xlsx'
df = pd.read_excel(file_path)

# Verify data
print("Original columns:", df.columns.tolist())
print("\nMissing values before processing:")
print(df.isnull().sum())

Original columns: ['s_id', 'name', 'profile_link', 'tier', 'gender', 'branch', 'cgpa', 'inter_gpa', 'ssc_gpa', 'internships', 'no_of_projects', 'is_participate_hackathon', 'is_participated_extracurricular', 'no_of_programming_languages', 'dsa', 'mobile_dev', 'web_dev', 'Machine Learning', 'cloud', 'other_skills', 'is_placed', 'salary_as_fresher']

Missing values before processing:
s_id                                10
name                               145
profile_link                        11
tier                                 0
gender                               8
branch                               2
cgpa                                13
inter_gpa                           61
ssc_gpa                             60
internships                          5
no_of_projects                       5
is_participate_hackathon             5
is_participated_extracurricular      5
no_of_programming_languages          5
dsa                                  5
mobile_dev                     

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Define categorical columns to encode
categorical_cols = ['tier', 'gender', 'branch']

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Apply one-hot encoding
encoded_data = encoder.fit_transform(df[categorical_cols])

# Get new column names
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Create DataFrame with encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=encoded_cols)

# Drop original categorical columns and add encoded ones
df = df.drop(categorical_cols, axis=1)
df = pd.concat([df, encoded_df], axis=1)

print("\nAfter one-hot encoding:")
print(df.head())


After one-hot encoding:
   s_id  name                                       profile_link  cgpa  \
0   1.0   NaN  https://www.linkedin.com/in/shubham-yadav-b862...   NaN   
1   2.0   NaN  https://www.linkedin.com/in/pranav-patel-97780...   NaN   
2   3.0   NaN          https://www.linkedin.com/in/kshitij-garg/   NaN   
3   4.0   NaN   https://www.linkedin.com/in/akash-garg-5757a290/  9.59   
4   5.0   NaN  https://www.linkedin.com/in/shubham-jadhav-493...   NaN   

   inter_gpa  ssc_gpa  internships  no_of_projects  is_participate_hackathon  \
0        NaN     10.0          2.0             5.0                       1.0   
1        NaN     10.0          1.0             4.0                       1.0   
2        NaN      NaN          1.0             5.0                       1.0   
3        NaN     10.0          3.0             8.0                       1.0   
4        NaN      9.8          3.0             4.0                       1.0   

   is_participated_extracurricular  ...  tier_2  

In [12]:
from sklearn.impute import SimpleImputer

# Define numeric columns for mean imputation
numeric_cols = ['cgpa', 'inter_gpa', 'ssc_gpa', 'internships', 
                'no_of_projects', 'no_of_programming_languages']

# Initialize imputer with mean strategy
imputer = SimpleImputer(strategy='mean')

# Apply mean imputation
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Verify no missing values remain
print("\nMissing values after imputation:")
print(df.isnull().sum())


Missing values after imputation:
s_id                                10
name                               145
profile_link                        11
cgpa                                 0
inter_gpa                            0
ssc_gpa                              0
internships                          0
no_of_projects                       0
is_participate_hackathon             5
is_participated_extracurricular      5
no_of_programming_languages          0
dsa                                  5
mobile_dev                           5
web_dev                              5
Machine Learning                     5
cloud                                5
other_skills                       123
is_placed                            5
salary_as_fresher                    5
tier_1                               0
tier_2                               0
tier_3                               0
gender_F                             0
gender_M                             0
gender_nan                    

In [13]:
df = df.drop(columns=['s_id', 'name', 'profile_link', 'other_skills', 'branch_nan'])
df = df.fillna(0)

In [14]:
print(df.isnull().sum())

cgpa                               0
inter_gpa                          0
ssc_gpa                            0
internships                        0
no_of_projects                     0
is_participate_hackathon           0
is_participated_extracurricular    0
no_of_programming_languages        0
dsa                                0
mobile_dev                         0
web_dev                            0
Machine Learning                   0
cloud                              0
is_placed                          0
salary_as_fresher                  0
tier_1                             0
tier_2                             0
tier_3                             0
gender_F                           0
gender_M                           0
gender_nan                         0
branch_CSE                         0
branch_ECE                         0
branch_EEE                         0
branch_MECH                        0
dtype: int64


In [15]:
# Save the cleaned DataFrame to CSV
output_path = r'C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\cleaned_placement_dataset.csv'
df.to_csv(output_path, index=False)  # index=False prevents saving row numbers

print(f"Dataset successfully saved to: {output_path}")
print(f"Total records saved: {len(df)}")

Dataset successfully saved to: C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\cleaned_placement_dataset.csv
Total records saved: 145


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load cleaned data
df = pd.read_csv(r'C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\cleaned_placement_dataset.csv')

# Separate features and targets
X = df.drop([ 'is_placed', 'salary_as_fresher'], axis=1)
y_placement = df['is_placed']
y_salary = df['salary_as_fresher']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_place_train, y_place_test, y_salary_train, y_salary_test = train_test_split(
    X_scaled, y_placement, y_salary, test_size=0.2, random_state=42
)

In [48]:
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np

# Assuming X_train is your input feature matrix
scaler = StandardScaler()  # or MinMaxScaler() if you used that
X_scaled = scaler.fit_transform(X_train)

# Save the input scaler
joblib.dump(scaler, 'input_scaler.pkl')


['input_scaler.pkl']

In [39]:
from tensorflow import keras
from tensorflow.keras import layers

input_shape = X_train.shape[1]
inputs = keras.Input(shape=(input_shape,))

# Deep shared layers
x = layers.Dense(256, activation='relu')(inputs)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64, activation='relu')(x)

# Output layers
placement_out = layers.Dense(1, activation='sigmoid', name='placement')(x)
salary_out = layers.Dense(1, name='salary')(x)

model = keras.Model(inputs=inputs, outputs=[placement_out, salary_out])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss={
        'placement': 'binary_crossentropy',
        'salary': 'mse'
    },
    metrics={
        'placement': 'accuracy',
        'salary': ['mae', 'mse']
    }
)
model.summary()


In [40]:
from tensorflow.keras import callbacks

early_stopping = callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
    monitor='val_loss'
)

history = model.fit(
    X_train,
    {
        'placement': y_place_train,
        'salary': y_salary_train
    },
    validation_data=(
        X_test,
        {
            'placement': y_place_test,
            'salary': y_salary_test
        }
    ),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 150ms/step - loss: 187.9388 - placement_accuracy: 0.6015 - placement_loss: 0.8229 - salary_loss: 183.1101 - salary_mae: 10.5789 - salary_mse: 187.1226 - val_loss: 168.6797 - val_placement_accuracy: 0.3448 - val_placement_loss: 0.7320 - val_salary_loss: 167.9477 - val_salary_mae: 9.0307 - val_salary_mse: 167.9477
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 166.1128 - placement_accuracy: 0.5787 - placement_loss: 0.8815 - salary_loss: 163.5827 - salary_mae: 10.3572 - salary_mse: 165.2299 - val_loss: 164.0660 - val_placement_accuracy: 0.3448 - val_placement_loss: 0.7449 - val_salary_loss: 163.3211 - val_salary_mae: 8.9827 - val_salary_mse: 163.3211
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 122.0653 - placement_accuracy: 0.5153 - placement_loss: 0.8851 - salary_loss: 126.7763 - salary_mae: 8.9947 - salary_mse: 121.1798 - 

In [41]:
# Evaluate on test set
results = model.evaluate(
    X_test,
    {
        'placement': y_place_test,
        'salary': y_salary_test
    },
    verbose=0
)

# Print metrics
print("\nEvaluation Metrics:")
print(f"Placement Accuracy: {results[3]:.4f}")
print(f"Salary MAE: {results[4]:.2f}")
print(f"Salary MSE: {results[5]:.2f}")

# Generate predictions
placement_pred, salary_pred = model.predict(X_test[:5])
print("\nSample Predictions:")
for i in range(5):
    print(f"Student {i+1}: Placement Prob: {placement_pred[i][0]:.2f} | Pred Salary: {salary_pred[i][0]:.2f}")


Evaluation Metrics:
Placement Accuracy: 0.8966
Salary MAE: 3.41
Salary MSE: 33.24
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step

Sample Predictions:
Student 1: Placement Prob: 0.79 | Pred Salary: 4.52
Student 2: Placement Prob: 0.24 | Pred Salary: 0.82
Student 3: Placement Prob: 1.00 | Pred Salary: 16.55
Student 4: Placement Prob: 0.98 | Pred Salary: 9.15
Student 5: Placement Prob: 1.00 | Pred Salary: 14.94


In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Evaluate Keras model on test set (to get Keras metrics too)
results = model.evaluate(
    X_test,
    {
        'placement': y_place_test,
        'salary': y_salary_test
    },
    verbose=0
)

# Get predictions
placement_pred_probs, salary_pred = model.predict(X_test)
placement_pred_classes = (placement_pred_probs > 0.5).astype(int)  # Convert probabilities to 0 or 1

# ----------- Classification Metrics (Placement) -----------
acc = accuracy_score(y_place_test, placement_pred_classes)
precision = precision_score(y_place_test, placement_pred_classes)
recall = recall_score(y_place_test, placement_pred_classes)
f1 = f1_score(y_place_test, placement_pred_classes)
conf_matrix = confusion_matrix(y_place_test, placement_pred_classes)
class_report = classification_report(y_place_test, placement_pred_classes)

# ----------- Regression Metrics (Salary) -----------
mae = mean_absolute_error(y_salary_test, salary_pred)
mse = mean_squared_error(y_salary_test, salary_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_salary_test, salary_pred)

# ----------- Print Evaluation Metrics -----------
print("\n🧠 Evaluation Metrics:")
print(f"Placement Accuracy : {acc:.4f}")
print(f"Precision           : {precision:.4f}")
print(f"Recall              : {recall:.4f}")
print(f"F1 Score            : {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print("\nClassification Report:\n", class_report)

print("\n📈 Salary Prediction Metrics:")
print(f"MAE   : {mae:.2f}")
print(f"MSE   : {mse:.2f}")
print(f"RMSE  : {rmse:.2f}")
print(f"R²    : {r2:.4f}")

# ----------- Sample Predictions -----------
print("\n🔍 Sample Predictions:")
for i in range(5):
    print(f"Student {i+1}: Placement Prob: {placement_pred_probs[i][0]:.2f} | Pred Salary: {salary_pred[i][0]:.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step

🧠 Evaluation Metrics:
Placement Accuracy : 0.8966
Precision           : 0.8696
Recall              : 1.0000
F1 Score            : 0.9302
Confusion Matrix:
[[ 6  3]
 [ 0 20]]

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.67      0.80         9
         1.0       0.87      1.00      0.93        20

    accuracy                           0.90        29
   macro avg       0.93      0.83      0.87        29
weighted avg       0.91      0.90      0.89        29


📈 Salary Prediction Metrics:
MAE   : 3.41
MSE   : 33.24
RMSE  : 5.77
R²    : 0.6373

🔍 Sample Predictions:
Student 1: Placement Prob: 0.79 | Pred Salary: 4.52
Student 2: Placement Prob: 0.24 | Pred Salary: 0.82
Student 3: Placement Prob: 1.00 | Pred Salary: 16.55
Student 4: Placement Prob: 0.98 | Pred Salary: 9.15
Student 5: Placement Prob: 1.00 | Pred Salary: 14.94


In [44]:
model.save("placement_salary_ann_model.h5")


