In [1]:
import pandas as pd
import statistics as stat
from scipy.stats import ttest_ind

# Spark Fortress Inc. HR data
On the basis of HR data, the following hypotheses should be tested:
- Is there a gender bias in Spark Fortress?

## Data cleaning

In [11]:
data = pd.read_csv("data-sets/hr-data.csv", sep=";")
data.columns = [col.strip() for col in data.columns]
data.head(10)

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Sweetwater,Alex,51,Male,United States,White,2011.08.15,Software Engineering,Software Engineering Manager,"$56 160,00"
1,Carabbio,Judith,30,Female,United States,White,2013.11.11,Software Engineering,Software Engineer,"$116 480,00"
2,Saada,Adell,31,Female,United States,White,2012.11.05,Software Engineering,Software Engineer,"$102 440,00"
3,Szabo,Andrew,34,Male,United States,White,2014.07.07,Software Engineering,Software Engineer,"$99 840,00"
4,Andreola,Colby,38,Female,United States,White,2014.11.10,Software Engineering,Software Engineer,"$99 008,00"
5,Daneault,Lynn,27,Female,United States,White,2014.05.05,Sales,Sales Manager,"$112 320,00"
6,Houlihan,Debra,51,Female,United States,White,2014.05.05,Sales,Director of Sales,"$124 800,00"
7,Onque,Jasmine,27,Female,United States,White,2013.09.30,Sales,Area Sales Manager,"$118 560,00"
8,Jeremy,Peter,43,Male,United States,White,2014.05.12,Sales,Area Sales Manager,"$116 480,00"
9,Gonzales,Ricardo,63,Male,United States,White,2014.05.12,Sales,Area Sales Manager,"$115 440,00"


In [3]:
# Create a copy and format columns
convert_data = data.copy()

def convert_accounting_to_float(column_names, dataframe):
    for column_name in column_names:
        dataframe[column_name] = dataframe[column_name].str.replace("$", "", regex=False).str.replace(",", ".", regex=True)
        dataframe[column_name] = dataframe[column_name].apply(lambda x: float("".join(x.split())))
    return dataframe

convert_data["Age"] = convert_data["Age"].astype(int)

columns_to_float = ["Salary"]

convert_data = convert_accounting_to_float(columns_to_float, convert_data)

convert_data.head(10)

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Sweetwater,Alex,51,Male,United States,White,2011.08.15,Software Engineering,Software Engineering Manager,56160.0
1,Carabbio,Judith,30,Female,United States,White,2013.11.11,Software Engineering,Software Engineer,116480.0
2,Saada,Adell,31,Female,United States,White,2012.11.05,Software Engineering,Software Engineer,102440.0
3,Szabo,Andrew,34,Male,United States,White,2014.07.07,Software Engineering,Software Engineer,99840.0
4,Andreola,Colby,38,Female,United States,White,2014.11.10,Software Engineering,Software Engineer,99008.0
5,Daneault,Lynn,27,Female,United States,White,2014.05.05,Sales,Sales Manager,112320.0
6,Houlihan,Debra,51,Female,United States,White,2014.05.05,Sales,Director of Sales,124800.0
7,Onque,Jasmine,27,Female,United States,White,2013.09.30,Sales,Area Sales Manager,118560.0
8,Jeremy,Peter,43,Male,United States,White,2014.05.12,Sales,Area Sales Manager,116480.0
9,Gonzales,Ricardo,63,Male,United States,White,2014.05.12,Sales,Area Sales Manager,115440.0


## Hypotesis testing: Gender Bias
- H0 = μm - μf = 0
- H1 = μm - μf ≠ 0

In [4]:
# Create frequency table for male and female employees
def generate_frequency_table(column_name, dataframe):
    frequency_series = dataframe[column_name].value_counts()
    
    frequency_table = pd.DataFrame({"Frequency": frequency_series})
    
    return frequency_table


freq_table_gender = generate_frequency_table("Gender", convert_data)

freq_table_gender

Unnamed: 0,Frequency
Female,98
Male,76


In [5]:
# Add aggregated columns
mean_salary_by_gender = convert_data.groupby("Gender")["Salary"].mean().round(2)
variance_salary_by_gender = convert_data.groupby("Gender")["Salary"].var(ddof=1).round(2)

freq_table_gender["Salary mean [$]"] = mean_salary_by_gender
freq_table_gender["Sample variance [$^2]"] = variance_salary_by_gender


freq_table_gender

Unnamed: 0,Frequency,Salary mean [$],Sample variance [$^2]
Female,98,65736.91,1097618000.0
Male,76,72300.53,1241432000.0


In [6]:
# Calculate t-score and p-value (independent)
salaries_male = convert_data[convert_data["Gender"] == "Male"]["Salary"]
salaries_female = convert_data[convert_data["Gender"] == "Female"]["Salary"]

t_score, p_value = ttest_ind(salaries_male, salaries_female)

p_value_one_sided = p_value / 2

print("T-score:", t_score.round(3))
print("P-value:", p_value_one_sided.round(3))

T-score: 1.261
P-value: 0.105


## Conclusion of the hypothesis test: Gender Bias
Based on the T-score and the calculated P-value, there isn't enough evidence to reject the null hypothesis. Consequently, the analysis does not provide strong statistical support to claim a significant difference in mean salaries between males and females in the dataset.

## Extra: More detailed breakdown by age group
In order to get a more detailed picture of the salaries, the data will be extracted into two age groups: under 35 and over 34 years.

In [7]:
male_under35 = convert_data[(convert_data["Gender"] == "Male") & (convert_data["Age"] < 36)]
female_under35 = convert_data[(convert_data["Gender"] == "Female") & (convert_data["Age"] < 36)]

male_over34 = convert_data[(convert_data["Gender"] == "Male") & (convert_data["Age"] > 36)]
female_over34 = convert_data[(convert_data["Gender"] == "Female") & (convert_data["Age"] > 36)]

data_under35 = pd.concat([male_under35, female_under35])
data_over34 = pd.concat([male_over34, female_over34])

In [8]:
freq_table_gender_under35 = generate_frequency_table("Gender", data_under35)

mean_salary_by_gender_under35 = data_under35.groupby("Gender")["Salary"].mean().round(2)
variance_salary_by_gender_under35 = data_under35.groupby("Gender")["Salary"].var(ddof=1).round(2)

freq_table_gender_under35["Salary mean [$]"] = mean_salary_by_gender_under35
freq_table_gender_under35["Sample variance [$^2]"] = variance_salary_by_gender_under35

freq_table_gender_under35

Unnamed: 0,Frequency,Salary mean [$],Sample variance [$^2]
Female,46,66775.23,1063145000.0
Male,36,68622.67,1001780000.0


In [9]:
freq_table_gender_over34 = generate_frequency_table("Gender", data_over34)

mean_salary_by_gender_over34 = data_over34.groupby("Gender")["Salary"].mean().round(2)
variance_salary_by_gender_over34 = data_over34.groupby("Gender")["Salary"].var(ddof=1).round(2)

freq_table_gender_over34["Salary mean [$]"] = mean_salary_by_gender_over34
freq_table_gender_over34["Sample variance [$^2]"] = variance_salary_by_gender_over34

freq_table_gender_over34

Unnamed: 0,Frequency,Salary mean [$],Sample variance [$^2]
Female,47,62661.55,1173474000.0
Male,37,74770.38,1470190000.0


In [10]:
t_score_under35, p_value_under35 = ttest_ind(male_under35["Salary"], female_under35["Salary"])
t_score_over34, p_value_over34 = ttest_ind(male_over34["Salary"], female_over34["Salary"])

p_value_one_sided_under35 = p_value_under35 / 2
p_value_one_sided_over34 = p_value_over34 / 2

print("T-score under 35:", t_score_under35.round(3))
print("P-value under 35:", p_value_one_sided_under35.round(3))

print("T-score over 34:", t_score_over34.round(3))
print("P-value over 34:", p_value_one_sided_over34.round(3))

T-score under 35: 0.258
P-value under 35: 0.399
T-score over 34: 1.526
P-value over 34: 0.065


## Conclusion of the hypothesis test: Gender Bias under 35 and over 34 years
Based on the T-scores and the calculated P-values:
- Under 35 years: No wage gap on a gender basis.
- Over 34 years: At a significance level of 0.05 we can not reject the null-hypotesis, however because the p-value is very close to it, we cannot rule out that there is a basis for the claim that there might be a difference in mean salaries between genders over 34.