In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
file = 'data.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0,EducationSector,IndividualProject,Age,Gender,City,Influenced,Perseverance,DesireToTakeInitiative,Competitiveness,SelfReliance,StrongNeedToAchieve,SelfConfidence,GoodPhysicalHealth,MentalDisorder,KeyTraits,ReasonsForLack,y
0,Engineering Sciences,No,19,Male,Yes,No,2,2,3,3,2,2,3,Yes,Passion,,1
1,Engineering Sciences,Yes,22,Male,No,Yes,3,3,3,4,4,3,4,Yes,Vision,Just not interested! (Want to work in the corp...,0
2,Engineering Sciences,No,18,Male,Yes,No,3,4,3,3,3,4,4,No,Passion,Not willing to start a venture in India and wa...,0
3,Engineering Sciences,Yes,20,Male,Yes,Yes,3,3,3,3,4,3,3,No,Resilience,Not able to take a Financial Risk,0
4,Engineering Sciences,Yes,19,Male,Yes,Yes,2,3,3,3,4,3,2,Yes,Vision,,1


In [6]:
df['ReasonsForLack'].unique()

array([nan,
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else)',
       'Not willing to start a venture in India and waiting for future relocation',
       'Not able to take a Financial Risk',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else), Academic Pressure, Lack of Knowledge, Not able to take a Financial Risk',
       'Academic Pressure, Lack of Knowledge, Not able to take a Financial Risk',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else), Academic Pressure, Parental Pressure',
       'Academic Pressure, Lack of Knowledge, Mental Block',
       'Lack of Knowledge',
       'Academic Pressure, Unwillingness to take risk, Lack of Knowledge',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or s

In [7]:
#Filling Null values with No reaso
df['ReasonsForLack'] = df['ReasonsForLack'].fillna('No Reason')

In [8]:
#Cleaning up the column for further preprocessing
df['ReasonsForLack'] = df['ReasonsForLack'].str.replace(',,',
                                 ',',regex=False)
df['ReasonsForLack'] = df['ReasonsForLack'].str.replace('interested, ',
                                                       'interested')

In [9]:
df['ReasonsForLack'].unique()

array(['No Reason',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else)',
       'Not willing to start a venture in India and waiting for future relocation',
       'Not able to take a Financial Risk',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else), Academic Pressure, Lack of Knowledge, Not able to take a Financial Risk',
       'Academic Pressure, Lack of Knowledge, Not able to take a Financial Risk',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else), Academic Pressure, Parental Pressure',
       'Academic Pressure, Lack of Knowledge, Mental Block',
       'Lack of Knowledge',
       'Academic Pressure, Unwillingness to take risk, Lack of Knowledge',
       'Just not interested! (Want to work in the corporate sector, or for the government or pursue resea

In [10]:
#Splitting with , delimeter to get the unique values of the column
df['ReasonsForLack'].str.split(', ', expand=True)[0].unique()

array(['No Reason',
       'Just not interested! (Want to work in the corporate sector',
       'Not willing to start a venture in India and waiting for future relocation',
       'Not able to take a Financial Risk', 'Academic Pressure',
       'Lack of Knowledge', 'Unwillingness to take risk',
       'Parental Pressure', 'Mental Block'], dtype=object)

In [11]:
df['ReasonsForLack'] = df['ReasonsForLack'].str.replace('Just not interested! (Want to work in the corporate sector, or for the government or pursue research or something else)',
                                 'Just not interested, ',regex=False)

In [12]:
#Making new columns with bool type data for each unique element in ReasonsForLack
df['No Reason'] = df['ReasonsForLack'].str.contains('No Reason')
df['Just not interested'] = df['ReasonsForLack'].str.contains('Just not interested')
df['waiting for future relocation'] = df['ReasonsForLack'].str.contains('waiting for future relocation')
df['Financial Risk'] = df['ReasonsForLack'].str.contains('Not able to take a Financial Risk')
df['Academic Pressure'] = df['ReasonsForLack'].str.contains('Academic Pressure')
df['Lack of Knowledge'] = df['ReasonsForLack'].str.contains('Lack of Knowledge')
df['Unwillingness to take risk'] = df['ReasonsForLack'].str.contains('Unwillingness to take risk')
df['Parental Pressure'] = df['ReasonsForLack'].str.contains('Parental Pressure')
df['Mental Block'] = df['ReasonsForLack'].str.contains('Mental Block')

In [13]:
#Original column not required, so dropped
df = df.drop(['ReasonsForLack'],axis=1)

In [None]:
Checking label imbalance
plt.figure(figsize=(20,10))
plt.title("Label Count")
sns.countplot(data=df,x='y')
plt.show()

In [15]:
X = df.drop(['y'],axis=1)
y = df['y']
X = pd.get_dummies(X,drop_first=True)

In [16]:
#Making imports and scaling data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()
train_X,test_X,train_y,test_y = train_test_split(X,y,train_size=0.8,random_state=0)
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [19]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=10000)
model.fit(train_X,train_y)
preds = model.predict(test_X)

In [24]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score
acc = accuracy_score(test_y,preds)
cmatrix = confusion_matrix(test_y,preds)
recall = recall_score(test_y,preds)
precision = precision_score(test_y,preds)
print(acc,cmatrix,recall,precision,sep='\n')

1.0
[[28  0]
 [ 0 16]]
1.0
1.0
