In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Load the HR dataset into the working frame used by every later cell.
# NOTE(review): `.crdownload` is the suffix Chromium-based browsers give a download that has not
# finished — presumably this should point at the completed `HR_comma_sep.csv`; confirm the file is complete.
df = pd.read_csv('./HR_comma_sep.csv.crdownload',encoding='utf-8')

In [None]:
df.head()

In [None]:
# satisfaction_level is the employee's satisfaction with working at the company
# number_project is the number of projects the employee has worked on
# time_spend_company is the time the employee has spent at the company

# Clean data

In [None]:
df.duplicated().sum()

In [None]:
# we have 9 duplicates so we need to drop them

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
cols=df.columns.tolist()

In [None]:
df.columns= [x.lower() for x in cols]
df.columns

In [None]:
# writing the functions for the above code

def reading_cleaning(df):
    """Remove duplicate rows and lower-case every column name.

    The frame is modified in place and the same object is returned, so the
    result can be used either through the return value or the original name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; its column labels must support ``.lower()``.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, de-duplicated and with lower-case columns.
    """
    # Drop exact duplicate rows directly on the caller's frame.
    df.drop_duplicates(inplace=True)

    # Normalise the header so later cells can rely on lower-case names.
    lowered_names = [name.lower() for name in df.columns.tolist()]
    df.columns = lowered_names

    return df
df=reading_cleaning(df)
df.head()

# Evaluating Employee Details

In [None]:
# Overall mean satisfaction level. It is printed explicitly because a notebook cell
# only renders its last expression, so a bare expression here would be discarded.
print("Average satisfaction level:", df['satisfaction_level'].mean())
df['department'].value_counts() # Number of employees in each department


In [None]:
df.groupby('department')['satisfaction_level'].mean()    # Group the employees by department and compute the mean satisfaction level of each group

In [None]:
df.groupby('salary')['satisfaction_level'].mean()  # Group the employees by salary and compute the mean satisfaction level of each group

In [None]:
df['left'].value_counts()   # 0<--employee not left   #1<--employee left

In [None]:
len(df[df['left']==0])           # Total number of employees who have not left the company

In [None]:
len(df[df['left']==1])           # The total employee that has left the company

In [None]:
# Now automating the above task with the help of the function

In [None]:
def employee_important_info(df):
    """Summarise satisfaction and attrition figures for the employee frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``satisfaction_level``, ``department``,
        ``salary`` and ``left`` (``1`` marks an employee who left, ``0`` one
        who stayed).

    Returns
    -------
    tuple
        ``(average_satisfaction, department_satisfaction, salary_satisfaction,
        left_employees, stayed_employees)`` where the two ``*_satisfaction``
        group results are Series indexed by the grouping column and the two
        employee figures are plain integer row counts.
    """

    def _mean_satisfaction_by(key):
        # Mean satisfaction level within each distinct value of `key`.
        return df.groupby(key)['satisfaction_level'].mean()

    overall_mean = df['satisfaction_level'].mean()
    per_department = _mean_satisfaction_by('department')
    per_salary = _mean_satisfaction_by('salary')

    # Count rows by the attrition flag; int() keeps the result a plain Python int.
    left_flag = df['left']
    n_left = int((left_flag == 1).sum())
    n_stayed = int((left_flag == 0).sum())

    return overall_mean, per_department, per_salary, n_left, n_stayed

    

In [None]:
average_satisfaction,department_satisfaction,salary_satisfaction,left_employees,stayed_employees = employee_important_info(df)

In [None]:
# Report every figure returned by employee_important_info, including the
# count of employees who left (previously unpacked but never shown).
print("Average Satisfaction level:",average_satisfaction)
print("Department-wise Average Satisfaction level:\n",department_satisfaction)
print("Salary-wise Average Satisfaction Level :\n",salary_satisfaction)
print("Employee who left :",left_employees)
print("Employee who stayed :",stayed_employees)

# Pie Plot for employee different features

In [None]:
def plots(df,col):
    """Draw a pie chart of how often each value of ``col`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    col : str
        Name of the column whose value frequencies are plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Compute the frequencies once and derive everything from them, so the
    # wedges, the explode offsets and the legend labels share one ordering.
    # (Labels taken from `unique()` follow order of appearance, while
    # `value_counts()` is sorted by frequency, so the two can disagree.)
    counts = df[col].value_counts()
    plt.figure(figsize=(15,8))

    # Pull every wedge out slightly, unless there is only a single wedge.
    explode = [0.1 if len(counts) > 1 else 0] * len(counts)
    plt.pie(counts,explode=explode,startangle=40,autopct='%1.1f%%',shadow=True)
    labels = [f'{value} ({col})' for value in counts.index]
    plt.legend(labels=labels,loc='upper right')

    plt.title(f"distribution of {col}")
    plt.show()
    
plots(df,'left')

In [None]:
plots(df,'salary')

In [None]:
plots(df,'number_project')

In [None]:
plots(df,'department')

# Employee distribution

In [None]:
# We will use the seaborn library.
# `x` is the column counted along the x-axis (salary) and `hue` splits each bar by the `left` column.
# NOTE(review): the original note stated that for low salary more employees left than remained — confirm against the rendered plot.
sns.countplot(x=df['salary'],hue='left',data=df)

In [None]:
def distribution(df,col):
    """Draw a count plot of ``col`` with every bar split by the ``left`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; must contain ``col`` and ``left``.
    col : str
        Name of the column placed on the x-axis.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(15,8))
    ax = sns.countplot(x=col,hue='left',palette='Set1',data=df)
    # The legend entries describe the hue (`left`) levels, not the x-axis
    # categories, so keep seaborn's own entries and only reposition the legend.
    # Overwriting them with the values of `col` would mislabel the colours.
    if ax.get_legend() is not None:
        sns.move_legend(ax, "upper right")
    plt.title(f"distribution of {col}")
    plt.xticks(rotation=90)
    plt.show()
distribution(df,'salary')

In [None]:
distribution(df,'department')

In [None]:
# comparison of each department individual based on the satisfaction level
def comparison(df,x,y):
    """Show a grouped bar chart of ``y`` against ``x``, split by ``left``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; must contain ``x``, ``y`` and ``left``.
    x : str
        Column used for the categories on the x-axis.
    y : str
        Column whose per-category bar height is drawn (no error bars).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    heading = f'{x} vs {y}'
    bar_options = dict(data=df, x=x, y=y, hue='left', errorbar=None)

    plt.figure(figsize=(15,8))
    sns.barplot(**bar_options)
    plt.title(heading)
    plt.show()
comparison(df,'department', 'satisfaction_level')

In [None]:
def corr_with_left(df):
    """Plot the correlation of every (one-hot encoded) feature with ``left``.

    Non-numeric columns are expanded with ``pd.get_dummies`` before the
    correlation matrix is computed. Bars are sorted ascending; non-negative
    correlations are drawn in sky blue and all others in salmon.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; must contain a numeric ``left`` column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    KeyError
        If the encoded frame has no ``left`` column.
    """
    df_encoded = pd.get_dummies(df)
    # Remove the self-correlation by label rather than by position: after
    # sorting, NaN correlations (e.g. from a constant column) are placed last,
    # so slicing off the final entry would drop a NaN and keep `left` itself.
    correlations = df_encoded.corr()['left'].drop('left').sort_values()
    colors = ['skyblue' if corr>=0 else 'salmon' for corr in correlations]
    plt.figure(figsize=(10,8))
    correlations.plot(kind='barh', color=colors)
    # Add title and labels
    plt.title('Correlation with Left')
    plt.xlabel('Correlation')
    plt.ylabel('Features')

    # Show the plot
    plt.show()
corr_with_left(df)

# Employee Churn Analysis

In [None]:
# The code generates a figure with two side-by-side plots for visualization purposes. The first is a histogram displaying the distribution of the column
# specified by the variable col in the dataframe df, with bins set to 20. The bars are coloured according to the 'left' column. The second is a
# two-dimensional kernel density estimation plot showing the relationship between satisfaction_level and last_evaluation, also coloured by the 'left' column.
# Finally, plt.tight_layout() ensures that the plots are properly arranged and displayed without overlapping.

In [None]:
def histogram(df, col):
    """Show a histogram of ``col`` next to a satisfaction/evaluation density plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; must contain ``col``, ``left``,
        ``satisfaction_level`` and ``last_evaluation``.
    col : str
        Column drawn in the left-hand histogram (20 bins, coloured by ``left``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # One row, two panels: histogram on the left, density plot on the right.
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    hist_ax = axes[0]
    kde_ax = axes[1]

    # Left panel: distribution of the requested column.
    sns.histplot(data=df, x=col, hue='left', bins=20, ax=hist_ax)
    hist_ax.set_title(f"Histogram of {col}")

    # Right panel: filled 2-D density of satisfaction level against last evaluation.
    sns.kdeplot(
        data=df,
        x='satisfaction_level',
        y='last_evaluation',
        hue='left',
        fill=True,
        ax=kde_ax,
    )
    kde_ax.set_title("Kernel Density Estimation")

    # Keep the two panels from overlapping before rendering.
    plt.tight_layout()
    plt.show()

histogram(df, 'satisfaction_level')

In [None]:
# K-Means