Task_a1: Import the dataset into a notebook environment using the Python library “pandas”.

In [59]:
import pandas as pd
# Read the raw CSV (path is relative to the notebook's working directory) into a DataFrame.
DATASET_PATH = "dataset.csv"
dataframe = pd.read_csv(DATASET_PATH)

Task_a2: Show the number of attributes (columns) and number of records (rows)

In [None]:
# .shape is a (rows, columns) tuple: the first value is the number of records,
# the second value is the number of attributes.
dataframe.shape

In [None]:
# Display the column labels (attribute names) of the dataset.
dataframe.columns

Task_a3: Show the statistics of the dataset (column-wise mean, standard deviation, max,
min, etc.)

In [None]:
# describe() summarises each numeric column with 8 statistics:
# count, mean, std, min, the 25% / 50% / 75% quartiles and max.
# (The original author counted 26 numeric columns for this dataset — verify against the output.)
dataframe.describe()

Task_a4: Count the number of missing values in the dataset

In [None]:
# Column-wise count of missing values: isnull() flags each missing cell, sum() adds the flags per column.
dataframe.isnull().sum()

Task_a5: Count the number of duplicate values in the dataset.

In [None]:
# Number of redundant rows: duplicated() flags every repeat of an earlier identical row
# (the first occurrence is not flagged), and sum() counts the flags.
dataframe.duplicated().sum()

In [None]:
# Show the rows that are duplicated. The expression inside the square brackets is a boolean mask,
# not a list of indices; keep=False flags every copy of a duplicated row, including its first occurrence.
dataframe[dataframe.duplicated(keep=False)]

Task_b1: If you find any missing values (NaN values) in the dataset, replace them with
the column-wise mean.

In [66]:
# Impute missing numeric values with the mean of their own column.
# Only numeric columns have a mean, so non-numeric columns are left as they are.
numeric_means = dataframe.mean(numeric_only=True)
dataframe = dataframe.fillna(numeric_means)

In [None]:
# Validate that no numeric attribute has missing values any more.
dataframe.isnull().sum()
# String attributes are not imputed at this point: the target variable is a string column,
# and rows where the target is missing have to be dropped rather than filled.


Task_b2: If you find any duplicates in the dataset, keep just one copy of the data

In [68]:
# Keep exactly one copy (the first occurrence) of every duplicated row.
dataframe = dataframe.drop_duplicates(keep="first")

Task_b3: If any row in the target column (Attrition) is missing, you must drop that row


In [69]:
# Drop every row that has no target label.
# This is the last step that removes rows, so the index is renumbered to a clean 0..n-1 here.
# Without that, the index keeps gaps where rows were dropped, and any frame built later from a
# plain array (which gets a default 0..n-1 index) would no longer line up with this one by label.
dataframe = dataframe.dropna(subset=['Attrition']).reset_index(drop=True)

In [None]:
# Validate that the target attribute has no missing values left (its count should be 0).
dataframe.isnull().sum()

In [None]:
# Replace missing values of the string column 'Department' with its most frequent value (the mode).
# fillna() returns a new Series; the result has to be assigned back, otherwise the
# DataFrame is left unchanged and the missing values are still there.
department_mode = dataframe['Department'].mode()[0]
dataframe['Department'] = dataframe['Department'].fillna(department_mode)

In [None]:
# Validate that no column in the full dataset has missing values (every count should be 0).
dataframe.isnull().sum()

Task_3a: You need to split the data into two parts. The “Features” variable will consist of all
the columns in the dataset except the target column, and the “Labels” variable
will contain only the target column (Attrition).

In [None]:
# Every column except the target becomes a feature.
Features = dataframe.drop(columns=['Attrition'])
Features

In [None]:
# The target column on its own, as a Series.
Labels = dataframe.loc[:, 'Attrition']
Labels

Task_d2: To do that, you first need to convert the columns that are not of a numeric
type into categorical types. Then you need to perform one-hot encoding on each such
column, which splits it into multiple one-hot columns.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Split the non-numeric feature columns into two groups:
#   * 3 or more distinct values -> remembered in `categorical_columns` for one-hot encoding later
#   * fewer than 3 distinct values -> label-encoded right here into integer codes
categorical_columns = []
# Created before the loop so the name always exists afterwards, even when no column
# takes the label-encoding branch. fit_transform() refits on every call, so reuse is safe.
encoder = LabelEncoder()
for col in Features.columns:
	if pd.api.types.is_numeric_dtype(Features[col]):
		continue
	if Features[col].nunique() >= 3:
		categorical_columns.append(col)
	else:
		# fit_transform() returns a plain array, so this assignment is positional (row i gets code i).
		# Wrapping the array in a DataFrame first would give it a fresh 0..n-1 index; pandas would
		# then align on index labels and produce NaN / shifted values wherever Features' own
		# index is not exactly 0..n-1.
		Features[col] = encoder.fit_transform(Features[col])
		print(col)


categorical_columns

In [None]:
# Inspect the feature frame after the label-encoding cell above.
Features


In [77]:
# Mark every column collected in `categorical_columns` as a pandas categorical in one call.
Features = Features.astype({col: 'category' for col in categorical_columns})
# Expand the categorical columns into integer one-hot columns with pandas get_dummies.
Features = pd.get_dummies(Features, dtype=int)

In [None]:
# Encode the string target into integer class codes.
# A dedicated encoder is created here rather than reusing a variable left behind by an earlier
# cell, so this cell does not depend on which branches that earlier cell happened to execute.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
labels_col = label_encoder.fit_transform(Labels)
labels_col

In [None]:
# Wrap the encoded target in a DataFrame that carries the same index as `Labels`
# (and therefore as the feature rows). With the default 0..n-1 index, label-based operations
# such as corrwith() or boolean .loc masks would pair targets with the wrong feature rows.
Labels_df = pd.DataFrame(labels_col, columns=['Attrition'], index=Labels.index)
Labels_df

In [None]:
# Inspect the feature frame after one-hot encoding.
Features

In [None]:
# Check the dtype of every feature column after encoding.
Features.dtypes

Task_e: Scaling of the features

In [82]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scaling(dataframe, preference):
	"""Return a scaled copy of ``dataframe``, leaving one-hot columns untouched.

	A column is treated as a one-hot column when its name starts with
	``"<name>_"`` for some ``name`` in the notebook-level list
	``categorical_columns``. Every other column is scaled.

	Parameters
	----------
	dataframe : pandas.DataFrame
		Feature frame to scale. It is not modified.
	preference : int
		``1`` selects ``MinMaxScaler``; any other value selects ``StandardScaler``.

	Returns
	-------
	pandas.DataFrame
		A new frame with the same index and column order as the input, in
		which the non-one-hot columns hold the scaled values.
	"""
	# Prefix match instead of substring match: a one-hot column is named "<original>_<value>",
	# so the marker can only legitimately appear at the start of the name.
	one_hot_prefixes = tuple(str(name) + "_" for name in categorical_columns)
	other_columns = [
		col for col in dataframe.columns
		if not str(col).startswith(one_hot_prefixes)
	]

	scaler = MinMaxScaler() if preference == 1 else StandardScaler()

	# Work on a copy so the caller's frame survives and two calls with different
	# preferences return two independent results.
	scaled = dataframe.copy()
	if other_columns:
		# fit_transform() returns a bare array; assigning it directly is positional and cannot
		# be misaligned by the frame's index (unlike going through a freshly indexed DataFrame).
		scaled[other_columns] = scaler.fit_transform(scaled[other_columns])
	return scaled


In [None]:
# Min-max scaled features (preference 1).
# scaling() is handed its own copy so that `Features` itself is never overwritten and
# Features_1 / Features_2 are guaranteed to be two distinct objects.
Features_1 = scaling(Features.copy(), 1)
Features_1

In [None]:
# Standardised features (any preference other than 1 selects the standard scaler).
# scaling() is handed its own copy so that `Features` itself is never overwritten and
# Features_1 / Features_2 are guaranteed to be two distinct objects.
Features_2 = scaling(Features.copy(), 2)
Features_2

Task_e: Correlation Analysis

In [None]:
# Correlation analysis of the features with the target.
# A constant column has zero variance, which makes its correlation undefined, so such
# columns are listed and dropped first.
zero_var_columns = [col for col in Features_1.columns if Features_1[col].var() == 0]
for col in zero_var_columns:
	print(col)

Features_1_cleaned = Features_1.drop(columns=zero_var_columns)

# corrwith() pairs rows by index label, not by position. The encoded target is therefore
# re-indexed onto the feature frame's own index (row i of the target belongs to row i of the
# features). A length mismatch raises immediately instead of silently producing NaNs.
target_labels = pd.Series(
	Labels_df['Attrition'].to_numpy(),
	index=Features_1_cleaned.index,
	name='Attrition',
)
correlations = Features_1_cleaned.corrwith(target_labels)
correlations

In [None]:
# Rank the features by the absolute value of their correlation with the target, strongest first,
# then keep the first 20 column names and the matching slice of the cleaned feature frame.
sorted_correlations = correlations.abs().sort_values(ascending=False)
top_20_columns = sorted_correlations.index[:20]
top_20_features = Features_1_cleaned.loc[:, top_20_columns]
top_20_columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlation between the 20 selected features, drawn as an annotated heatmap
# on an explicitly created figure/axes pair.
top_20_features_matrix = top_20_features.corr()
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(top_20_features_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Top 20 Features')
plt.show()



Plotting

In [None]:
# Split the cleaned feature rows by target class: class_dict[c] holds the rows whose label is c.
target_vals = Labels_df['Attrition'].unique()
print(target_vals)
# Plain array of labels: a boolean array mask is applied by position, so it works even when
# the label frame and the feature frame do not carry identical index labels (a boolean
# Series mask with a different index is rejected by .loc).
label_values = Labels_df['Attrition'].to_numpy()
class_dict={}
for val in target_vals:
	print(val)
	class_dict[val]=Features_1_cleaned.loc[label_values == val]

print(class_dict[1])


In [None]:
# For each of the top-20 features, draw a 1-D scatter plot of its values (all points at y = 0),
# one marker series per target class, and save the figure as "<column name>.png".
import matplotlib.pyplot as plt
import numpy as np

for col in top_20_columns:
	fig, ax = plt.subplots()
	# Iterate over whatever classes are present instead of hard-coding 0 and 1.
	for class_value in sorted(class_dict):
		values = class_dict[class_value][col]
		ax.plot(values, np.zeros_like(values), 'o', label=f'Class {class_value}')
	ax.legend()
	ax.set_xlabel(col)
	ax.set_title(f'1D Scatter Plot of {col} by Numeric Classes')
	fig.savefig(f"{col}.png")
	plt.show()
	# Release the figure explicitly so the loop does not accumulate open figures.
	plt.close(fig)


In [None]:
# Import necessary libraries
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline classifier on the 20 selected features.
# NOTE(review): the features were scaled and the top-20 selection was made using every row,
# including the rows that end up in the test split below, so the reported accuracy is
# likely optimistic. Fitting the scaler / selecting features on the training split only
# would remove that leakage.
X = top_20_features
y = Labels_df['Attrition']

TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 48    # fixed seed so the split is reproducible

# Step 1: Split the data into training and testing sets.
# stratify=y keeps the class proportions the same in both splits, so the test accuracy is
# not distorted by an unlucky share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
# Step 2: Initialize the Logistic Regression classifier
clf = LogisticRegression()
# Step 3: Train the classifier on the training data
clf.fit(X_train, y_train)
# Step 4: Make predictions on the test set
y_pred = clf.predict(X_test)
# Step 5: Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Logistic Regression classifier: {accuracy:.2f}")