In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import pickle

%matplotlib inline

In [2]:
data = pd.read_csv("Placement_Data_Full_Class.csv")
data.drop("sl_no",axis=True,inplace=True)
data.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [3]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          215 non-null    object 
 1   ssc_p           215 non-null    float64
 2   ssc_b           215 non-null    object 
 3   hsc_p           215 non-null    float64
 4   hsc_b           215 non-null    object 
 5   hsc_s           215 non-null    object 
 6   degree_p        215 non-null    float64
 7   degree_t        215 non-null    object 
 8   workex          215 non-null    object 
 9   etest_p         215 non-null    float64
 10  specialisation  215 non-null    object 
 11  mba_p           215 non-null    float64
 12  status          215 non-null    object 
 13  salary          148 non-null    float64
dtypes: float64(6), object(8)
memory usage: 23.6+ KB
None


In [4]:
report = data.profile_report(title="Campus Placement Data - Report",
                             progress_bar=False)
report



In [5]:
data["salary"].fillna(value=0,inplace=True)
data["salary"].isnull().sum()

0

In [6]:
from sklearn.preprocessing import LabelEncoder

object_cols = ["gender","workex","specialisation","status"]

label_encoder = LabelEncoder()

for col in object_cols:
    data["col"] = label_encoder.fit_transform(data[col])
data.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary,col
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0,1
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0,1
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0,1
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0,0
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0,1


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [8]:
X = data[["ssc_p","hsc_p","degree_p","etest_p"]]
y = data["status"]

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=4)

In [10]:
dtree = DecisionTreeClassifier(criterion="entropy")
dtree.fit(X_train,y_train)

y_pred = dtree.predict(X_test)

print("Accuracy Score:",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy Score: 0.8153846153846154
              precision    recall  f1-score   support

  Not Placed       0.72      0.65      0.68        20
      Placed       0.85      0.89      0.87        45

    accuracy                           0.82        65
   macro avg       0.79      0.77      0.78        65
weighted avg       0.81      0.82      0.81        65



In [11]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train,y_train)

y_pred = random_forest.predict(X_test)

print("Accuracy Score:",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy Score: 0.8769230769230769
              precision    recall  f1-score   support

  Not Placed       0.83      0.75      0.79        20
      Placed       0.89      0.93      0.91        45

    accuracy                           0.88        65
   macro avg       0.86      0.84      0.85        65
weighted avg       0.88      0.88      0.88        65



In [12]:
LogR = LogisticRegression(solver="lbfgs")
LogR.fit(X_train,y_train)

y_pred = LogR.predict(X_test)

print("Accuracy Score:",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy Score: 0.8461538461538461
              precision    recall  f1-score   support

  Not Placed       0.78      0.70      0.74        20
      Placed       0.87      0.91      0.89        45

    accuracy                           0.85        65
   macro avg       0.83      0.81      0.81        65
weighted avg       0.84      0.85      0.84        65



In [13]:
pickle.dump(random_forest,open("model_classifier.pkl","wb"))