In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

%matplotlib inline

In [26]:
df = pd.read_csv("Placement_Data_Full_Class.csv")

In [27]:
df.drop(['sl_no'], axis=1, inplace=True)

***EDA ON PLACEMENT DATA SET***


In [None]:
#getting the number of rows and columns in the dataset
df.shape

In [None]:
df.info()#info about dataset

In [None]:
df.isnull().sum() #check nulls

In [29]:
df.duplicated().sum() #check for there is duplicated values in the dataset

In [28]:
#labelling the target column
df['status'].values[df['status']=='Not Placed'] = 0 
df['status'].values[df['status']=='Placed'] = 1
df.status = df.status.astype('int')

In [30]:
X = df.drop(['status'], axis=1)
Y = df.status

In [31]:
def seperate_cols(X):
  cc = [cname for cname in X.columns if X[cname].dtype == "object"]
  nc = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
  return cc, nc
#Preprocessing for numerical columns
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

#Preprocessing pipeline for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
categorical_cols, numerical_cols = seperate_cols(X)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [32]:
my_pip = Pipeline(steps=[
                         ('preprocessor', preprocessor),
                         ('model', RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=0))
])

In [23]:
def train_model(X, Y):
  X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, train_size=0.8, 
                                                                test_size=0.2, random_state=12)
  my_pip.fit(X_train, Y_train)
  preds = my_pip.predict(X_valid)
  acc = accuracy_score(Y_valid, preds)
  return acc

def cross_val_test(X, y):
  scores = cross_val_score(my_pip, X, Y, cv=5)
  avg_score = sum(scores)/len(scores)
  return avg_score

In [24]:
a1 = train_model(X, Y)
a2 = cross_val_test(X, Y)
print(a1, a2)

1.0 1.0
