# Placement prediction model

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ruchikakumbhar/placement-prediction-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\dhanu\.cache\kagglehub\datasets\ruchikakumbhar\placement-prediction-dataset\versions\1


In [2]:
import pandas as pd

try:
       df = pd.read_csv(path+'/placementdata.csv', encoding='utf-8')
except UnicodeDecodeError:
       df = pd.read_csv(path+'/placementdata.csv', encoding='latin-1')

In [3]:
df.columns

Index(['StudentID', 'CGPA', 'Internships', 'Projects',
       'Workshops/Certifications', 'AptitudeTestScore', 'SoftSkillsRating',
       'ExtracurricularActivities', 'PlacementTraining', 'SSC_Marks',
       'HSC_Marks', 'PlacementStatus'],
      dtype='object')

In [4]:
df.head

<bound method NDFrame.head of       StudentID  CGPA  Internships  Projects  Workshops/Certifications  \
0             1   7.5            1         1                         1   
1             2   8.9            0         3                         2   
2             3   7.3            1         2                         2   
3             4   7.5            1         1                         2   
4             5   8.3            1         2                         2   
...         ...   ...          ...       ...                       ...   
9995       9996   7.5            1         1                         2   
9996       9997   7.4            0         1                         0   
9997       9998   8.4            1         3                         0   
9998       9999   8.9            0         3                         2   
9999      10000   8.4            0         1                         1   

      AptitudeTestScore  SoftSkillsRating ExtracurricularActivities  \
0         

## Handling Missing Data

In [5]:
# Checking for missing values
print(df.isnull().sum())

StudentID                    0
CGPA                         0
Internships                  0
Projects                     0
Workshops/Certifications     0
AptitudeTestScore            0
SoftSkillsRating             0
ExtracurricularActivities    0
PlacementTraining            0
SSC_Marks                    0
HSC_Marks                    0
PlacementStatus              0
dtype: int64


In [6]:
# you can fill missing values with a specific value
df.fillna("missing", inplace=True)

### Or you can fill the previous non-null value

Lets convert our Placement column to a numeric bcz of placement column have values called Placed and NotPlace

In [7]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Encoding categorical variables for PlacementStatus, ExtracurricularActivities, and PlacementTraining
df['PlacementStatus'] = label_encoder.fit_transform(df['PlacementStatus'])
df['ExtracurricularActivities'] = label_encoder.fit_transform(df['ExtracurricularActivities'])
df['PlacementTraining'] = label_encoder.fit_transform(df['PlacementTraining'])

In [8]:
print(df[['PlacementStatus','ExtracurricularActivities','PlacementTraining']].head(20))

    PlacementStatus  ExtracurricularActivities  PlacementTraining
0                 0                          0                  0
1                 1                          1                  1
2                 0                          1                  0
3                 1                          1                  1
4                 1                          1                  1
5                 0                          1                  0
6                 0                          0                  0
7                 0                          1                  1
8                 0                          0                  1
9                 1                          1                  1
10                1                          1                  1
11                1                          1                  1
12                1                          1                  1
13                0                          0                  1
14        

Lets consider our X and Y values

X will be our input
y will be our output and prediction

In [9]:
# Define features (X) and target (y)
X = df.drop(columns=['StudentID', 'PlacementStatus'])
y = df['PlacementStatus']

Lets split our data into 2 sets 1 for training and other for testing

In [10]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Standardize

Let's say you and your friends decided to compare heights, but you have friends from different parts of the world, and the heights are measured in different units (some in inches, some in centimeters, etc.). It becomes hard to compare who is taller or shorter directly because the units are different.

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [12]:
X_train

array([[ 0.47139169, -0.06556468,  1.12252363, ..., -1.66274047,
         1.32455626,  1.51440106],
       [-0.1536667 , -0.06556468, -0.0307047 , ...,  0.60141677,
        -0.78588907,  1.17849265],
       [ 0.78392088, -0.06556468,  1.12252363, ...,  0.60141677,
         0.65305093, -1.06089672],
       ...,
       [-0.46619589, -0.06556468, -1.18393304, ...,  0.60141677,
        -1.36146508, -0.94892725],
       [ 0.47139169, -0.06556468, -1.18393304, ..., -1.66274047,
        -1.36146508, -0.94892725],
       [ 0.78392088, -0.06556468,  1.12252363, ...,  0.60141677,
         1.22862693, -0.27711044]])

### Random Forest Classifier.

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'RandomForestClassifier Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

RandomForestClassifier Accuracy: 0.781
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82      1172
           1       0.75      0.71      0.73       828

    accuracy                           0.78      2000
   macro avg       0.78      0.77      0.77      2000
weighted avg       0.78      0.78      0.78      2000



In [14]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)

# Make predictions
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Logistic Regression Accuracy: {accuracy}')
print(f'Logistic Regression Classification Report:\n{report}')


Logistic Regression Accuracy: 0.7945
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1172
           1       0.74      0.77      0.76       828

    accuracy                           0.79      2000
   macro avg       0.79      0.79      0.79      2000
weighted avg       0.80      0.79      0.79      2000



You should never go with the single model. You need to compair and check which model performs best

In [15]:
!pip install xgboost



In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

# Evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy}')
    print(f'{name} Classification Report:\n{report}')


Random Forest Accuracy: 0.781
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82      1172
           1       0.75      0.71      0.73       828

    accuracy                           0.78      2000
   macro avg       0.78      0.77      0.77      2000
weighted avg       0.78      0.78      0.78      2000

Logistic Regression Accuracy: 0.7945
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1172
           1       0.74      0.77      0.76       828

    accuracy                           0.79      2000
   macro avg       0.79      0.79      0.79      2000
weighted avg       0.80      0.79      0.79      2000

SVM Accuracy: 0.793
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1172
           1       0.75      0.75      0.75

# Results

In [17]:
import pandas as pd

# Sample Input Data
new_student_data = {
    'CGPA': 7.0,
    'Internships': 7,
    'Projects': 10,
    'Workshops/Certifications': 8,
    'AptitudeTestScore': 75,
    'SoftSkillsRating': 3,
    'ExtracurricularActivities': 'Yes',
    'PlacementTraining': 'No',
    'SSC_Marks': 80,
    'HSC_Marks': 75
}

def predict_placement(input_data):
    # Convert input data to DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Encode categorical variables
    input_df['ExtracurricularActivities'] = label_encoder.transform(input_df['ExtracurricularActivities'])
    input_df['PlacementTraining'] = label_encoder.transform(input_df['PlacementTraining'])
    
    # Standardize features
    input_scaled = scaler.transform(input_df)
    
    # Make prediction using the trained Logistic Regression model
    prediction = logistic_model.predict(input_scaled)
    
    # Convert prediction back to original category
    placement_status = label_encoder.inverse_transform(prediction)[0]
    
    return placement_status

# Predict for the new student
predicted_status = predict_placement(new_student_data)
print(f'The predicted placement status for the new student is: {predicted_status}')


The predicted placement status for the new student is: Yes
