In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import openai
import scipy.stats as stats
import os

# Creating PDF
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.enums import TA_CENTER

# Creating Word
from docx import Document
from docx.shared import Inches

# To add date to the title of PDF file
from datetime import datetime

df = pd.read_csv('HR_dataset.csv')
df.drop_duplicates(inplace=True)
df.reset_index(drop=True,inplace=True)
df.rename(columns={'Departments ':'departments'}, inplace = True)
df_statistical_test = df.drop(columns=['left','Work_accident','promotion_last_5years'])

In [2]:
def test_single_person(data: pd.DataFrame, department: str, column: str, person_value: float, alpha: float = 0.05):
    """
    Perform a one-sample t-test to compare a single person's value to the mean value for a specified department.

    Args:
        data (pd.DataFrame): The DataFrame containing the data to be tested.
        department (str): The name of the department to test (must match the value in the 'department' column).
        column (str): The name of the column containing the data to be tested.
        person_value (float): The value for the single person to be tested.
        alpha (float, optional): The significance level for the test (default is 0.05).

    Returns:
        A string indicating whether the single person's value is statistically different from the department mean.
    """
    department_data = data[data['departments'] == department][column]
    t_stat, p_value = stats.ttest_1samp(department_data, person_value)

    if p_value < alpha:
        return f"The single person's {column} ({person_value:.2f}) is statistically different from the {department} department mean ({department_data.mean():.2f}). The p-value is {p_value:.4f}."
    else:
        return f"The single person's {column} ({person_value:.2f}) is not statistically different from the {department} department mean ({department_data.mean():.2f}). The p-value is {p_value:.4f}."
    

def apply_test_single_person(data: pd.DataFrame, person_data: pd.DataFrame, alpha: float = 0.05):
    """
    Apply the test_single_person() function to all numeric columns in a given DataFrame for the single person's department.

    Args:
        data (pd.DataFrame): The DataFrame containing the data to be tested.
        person_data (pd.DataFrame): A DataFrame containing the single person's values for each column to be tested.
        alpha (float, optional): The significance level for the test (default is 0.05).

    Returns:
        A dictionary containing the question and result of the one-sample t-test for each numeric column.
    """
    results = {}
    numeric_cols = data.select_dtypes(include='number').columns
    person_dept = person_data.iloc[0]['departments']

    for col in numeric_cols:
        result = test_single_person(data=data, department=person_dept, column=col, person_value=person_data.iloc[0][col], alpha=alpha)
        question = f"Is the single person's {col} significantly different from the {person_dept} department mean?"
        results[f"{col} - {person_dept}"] = {'question': question, 'result': result}

    return results

In [5]:
df_sample = pd.DataFrame(df.iloc[0]).T
df_sample

# df = pd.DataFrame.from_dict([my_dict])

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low


In [6]:
churn_sample = df_sample # load_trans.transform(df_sample)

In [7]:
apply_test_single_person(df.drop(columns=['left','Work_accident','promotion_last_5years']),churn_sample)

{'satisfaction_level - sales': {'question': "Is the single person's satisfaction_level significantly different from the sales department mean?",
  'result': "The single person's satisfaction_level (0.38) is statistically different from the sales department mean (0.63). The p-value is 0.0000."},
 'last_evaluation - sales': {'question': "Is the single person's last_evaluation significantly different from the sales department mean?",
  'result': "The single person's last_evaluation (0.53) is statistically different from the sales department mean (0.71). The p-value is 0.0000."},
 'number_project - sales': {'question': "Is the single person's number_project significantly different from the sales department mean?",
  'result': "The single person's number_project (2.00) is statistically different from the sales department mean (3.78). The p-value is 0.0000."},
 'average_montly_hours - sales': {'question': "Is the single person's average_montly_hours significantly different from the sales dep

In [8]:
dict = df[(df.departments == 'technical') & (df.left==1)][['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company']].mean().to_dict()

In [52]:
def calculate_department_stats(data, sample_df, left=None):
    """
    Calculates the mean values of several employee performance metrics for a specific department in a dataframe.
    
    Args:
        data (pandas.DataFrame): The dataframe containing employee data. 
        sample_df (pandas.DataFrame): A separate dataframe containing information about the department of interest.
        left (bool or None): If left is None, calculate stats for all employees in the department (both left and not left).
                             If left is True, calculate stats only for employees who have left the company.
                             If left is False, calculate stats only for employees who have not yet left the company.
                             
    Returns:
        dict: A dictionary containing the mean values of the following employee performance metrics for the specified department: 
            - satisfaction_level     
            - last_evaluation
            - number_project       
            - average_montly_hours
            - time_spend_company
    """
    
    filtered_df = np.nan
    department = sample_df.departments.iloc[0]
    if left is None:
        filtered_df = data[data.departments == department]   
    else:
        filtered_df = data[(data.departments == department) & (data.left == int(left))]
        
    metrics = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']
    stats_dict = filtered_df[metrics].mean().to_dict()
        
    return stats_dict

def explain_department_stats(stats_dict, department_name, left=None):
    """
    Generates a string explaining the meaning of the values in a dictionary of department statistics.

    Args:
        stats_dict (dict): A dictionary containing the mean values of several employee performance metrics for a department.
        department_name (str): The name of the department the stats_dict corresponds to.
        left (bool or None): If left is None, generate an explanation for all employees in the department (both left and not left).
                             If left is True, generate an explanation only for employees who have left the company.
                             If left is False, generate an explanation only for employees who have not yet left the company.

    Returns:
        str: A string explaining the meaning of the values in the stats_dict dictionary.
    """
    if left is None:
        explanation = f"These are the mean values for the {department_name} department:"
    elif left:
        explanation = f"These are the mean values for employees who is churn of the {department_name} department:"
    else:
        explanation = f"These are the mean values for employees who is not churn of the {department_name} department:"
    for metric, value in stats_dict.items():
        explanation += f" {metric.replace('_', ' ')}: {value:.2f}. "
    # explanation += f"The employee is from this department."
    return explanation

In [60]:
filtered_df = df[(df.departments == 'sales') & (df.left == int(1))]
metrics = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']
stats_dict = filtered_df[metrics].mean().to_dict()
stats_dict

{'satisfaction_level': 0.45039999999999997,
 'last_evaluation': 0.7162363636363637,
 'number_project': 3.7636363636363637,
 'average_montly_hours': 206.2709090909091,
 'time_spend_company': 3.82}

In [67]:
explain_department_stats(calculate_department_stats(df,churn_sample,0),'sales',0)

'These are the mean values for employees who is not churn of the sales department: satisfaction level: 0.67.  last evaluation: 0.71.  number project: 3.78.  average montly hours: 199.01.  time spend company: 3.29. '

In [20]:
statistical_findings = [
  "There is a significant difference in average values between employees who left and those who stayed for column satisfaction_level.",
  "There is no significant difference in average values between employees who left and those who stayed for column last_evaluation.",
  "There is no significant difference in average values between employees who left and those who stayed for column number_project.",
  "There is a significant difference in average values between employees who left and those who stayed for column average_montly_hours.",
  "There is a significant difference in average values between employees who left and those who stayed for column time_spend_company.",
  "There is a significant difference in average values between employees who left and those who stayed for column Work_accident.",
  "There is a significant difference in average values between employees who left and those who stayed for column promotion_last_5years.",
  "There is a significant difference in average values between employees who left and those who stayed for column left.",
  "There is evidence to suggest a significant difference in the proportion of employees who left the company based on whether they had a work accident or not.",
  "There is a significant difference in the average satisfaction level between employees who had a work accident and those who didn't.",
  "There is a statistically significant association between the salary level of employees and the likelihood of them leaving the company."
]

In [27]:
show_df = {'Informations':{'Departments': 'Sales',
        'Salary': 'Low',
        'Satisfaction Level': 0.09,
        'Last Evaluation': 0.79,
        'Assigned Project': 6,
        'Monthly Working Time': 293,
        'Time in the Company': 5,
        'Work Accident': True,
        'Get Promoted': True}}

In [12]:
result = 1

In [16]:
result_proba = (0.05,0.95)

In [None]:
churn_sample.departments.iloc[0]

'technical'

In [18]:
model_df = churn_sample

In [68]:
leave_text = ''
if result == 1:
    leave_text = f'This employee is churn according to ml model with {result_proba[1]} score'
    department_info = explain_department_stats(calculate_department_stats(df,model_df,1),model_df.departments.iloc[0],1)
    department_info += explain_department_stats(calculate_department_stats(df,model_df,0),model_df.departments.iloc[0],0)
    department_info += explain_department_stats(calculate_department_stats(df,model_df),model_df.departments.iloc[0])
else:
    leave_text = f'This employee is not churn according to ml model with {result_proba[0]} score'
    department_info = explain_department_stats(calculate_department_stats(df,model_df,1),model_df.departments.iloc[0],1)
    department_info += explain_department_stats(calculate_department_stats(df,model_df,0),model_df.departments.iloc[0],0)
    department_info += explain_department_stats(calculate_department_stats(df,model_df),model_df.departments.iloc[0])

show_df['Informations']['Monthly Working Time'] = str(show_df['Informations']['Monthly Working Time'])  + ' hours'
message = f"How can I increase the productivity of this employee? Employee information: {show_df}. {leave_text}. These are statistical test results based on hypothesis tests:{' '.join(statistical_findings)} {department_info} Consider employee information and evaluate each information. Also comment on churn with the ML score rounded. Write engaging conclusion."
        

In [69]:
message

"How can I increase the productivity of this employee? Employee information: {'Informations': {'Departments': 'Sales', 'Salary': 'Low', 'Satisfaction Level': 0.09, 'Last Evaluation': 0.79, 'Assigned Project': 6, 'Monthly Working Time': '293 hours hours hours hours hours hours hours hours hours', 'Time in the Company': 5, 'Work Accident': True, 'Get Promoted': True}}. This employee is churn according to ml model with 0.95 score. These are statistical test results based on hypothesis tests:There is a significant difference in average values between employees who left and those who stayed for column satisfaction_level. There is no significant difference in average values between employees who left and those who stayed for column last_evaluation. There is no significant difference in average values between employees who left and those who stayed for column number_project. There is a significant difference in average values between employees who left and those who stayed for column average_

In [None]:
# !pip install reportlab


Collecting reportlab
  Downloading reportlab-4.0.4-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: reportlab
Successfully installed reportlab-4.0.4


In [None]:
# !pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25ldone
[?25h  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184491 sha256=29c794a04d5311bf3249c8d8c69af7383f8d0a09184ccadb3e74109fd194374d
  Stored in directory: /Users/mac/Library/Caches/pip/wheels/80/27/06/837436d4c3bd989b957a91679966f207bfd71d358d63a8194d
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11
