# Preppin' Data Challenge -
## 2023: Week 32 - HR Month - Reshaping Generations
## Created by: Ghafar Shah

### Challenge: https://preppindata.blogspot.com/2023/08/2023-week-32-hr-month-reshaping.html

Now that all of the missing IDs have been filled in from the Week 31 exercise, HR would like us to add several age groupings to the dataset for reporting, so that the employee’s exact date of birth can be removed from the dataset. Pew definitions were used for each generation.

### Import required packages for data analysis

In [None]:
import pandas as pd
import numpy as np

### Import generations table based on Pew definitions

In [None]:
# Read the generation definitions from generations.csv into a DataFrame
generations = pd.read_csv('generations.csv')

# Preview the DataFrame (shown as the notebook cell output)
generations

### Convert start_year and end_year columns in generations table to whole numbers

In [None]:
# A missing bound (NaN) is stored as 0 so that both year columns can be cast
# to plain integers; 0 therefore means "no start year" / "no end year".
for year_column in ('start_year', 'end_year'):
    generations[year_column] = generations[year_column].fillna(0).astype(int)

# Preview dataframe
generations

### Import the ee_monthly file (ee_monthly)

In [None]:
# Read the monthly employee roster and preview it
ee_monthly_roster = pd.read_csv('Final_Monthly_EE_Roster.csv')
ee_monthly_roster

### Import the employee demographics file (ee_dim)

In [None]:
# Read the employee demographics (dimension) table and preview it
ee_dim = pd.read_csv('Final_Employee_Dimensions_Roster.csv')
ee_dim

### Add a new column, generation_name, which includes the generation name (e.g., Generation X (1965-1980))
- If the generation doesn’t have a start year, the text should be “(born in or before End Year)”
- If the generation doesn’t have an end year, the text should be “(born in or after Start Year)”

In [None]:
"""

Using .apply() to iterate over each row of the DataFrame and build its label.
The helper below takes a single argument, called "row", which is a record from the DataFrame.

"""


def _generation_label(row):
    """Return the display label for one row of the generations table.

    A start_year or end_year of 0 marks a missing bound (NaNs in those
    columns were replaced with 0 before the integer conversion).

    - no start year -> "<generation> (born in or before <end_year>)"
    - no end year   -> "<generation> (born in or after <start_year>)"
    - both present  -> "<generation> (<start_year>-<end_year>)"
    """
    name = str(row['generation'])
    start_year = row['start_year']
    end_year = row['end_year']

    # Neither bound is known: there is no range to describe
    if start_year == 0 and end_year == 0:
        return name

    # Open-ended towards the past
    if start_year == 0:
        return name + ' (born in or before ' + str(end_year) + ')'

    # Open-ended towards the present
    if end_year == 0:
        return name + ' (born in or after ' + str(start_year) + ')'

    return name + ' (' + str(start_year) + '-' + str(end_year) + ')'


# Always create the label column in one pass. The previous version only created
# it when some start_year was 0, and its end_year branch selected rows with
# .isna(), which never matches once the NaNs have been replaced by 0.
generations['generations'] = generations.apply(_generation_label, axis=1)

In [None]:
# Preview the first rows of the generations DataFrame, including the new label column
generations.head()

### Calculate the employee’s birth year from the date_of_birth

In [None]:
# Keep the last four characters of date_of_birth, i.e. the part after the last "/"
# (assumes the text ends in a four-digit year, as in dd/mm/yyyy)
birth_year_text = ee_dim['date_of_birth'].str[-4:]
ee_dim = ee_dim.assign(Years=birth_year_text)

# Birth_Year as a whole number; values that cannot be parsed (including missing
# dates) are coerced to NaN and then stored as 0
birth_year_numeric = pd.to_numeric(ee_dim['Years'], errors='coerce')
ee_dim['Birth_Year'] = birth_year_numeric.fillna(0).astype(int)

ee_dim

### Check how many Birth Years have value of 0 
Remember, missing values (NaN) were replaced with zero earlier so that the column could be converted to whole numbers.

In [None]:
# Count the rows of ee_dim whose Birth_Year is the 0 placeholder
is_zero_year = ee_dim['Birth_Year'].eq(0)
num_zeros = is_zero_year.sum()
num_zeros

In [None]:
# Show the data type of every column in the ee_dim table
ee_dim.dtypes

In [None]:
# Show the data type of every column in the generations table
generations.dtypes

###  Finds the generation label based on Birth_Year in employee demographics table

In [None]:
# The find_generation function finds the generation label based on Birth_Year
def find_generation(birth_year, generations_table=None):
    """Return the generation label that contains ``birth_year``.

    Parameters
    ----------
    birth_year : int
        Year of birth. 0 (or any non-positive / NaN value) is the placeholder
        for a missing date of birth.
    generations_table : DataFrame, optional
        Table with ``start_year``, ``end_year`` and ``generations`` columns.
        Defaults to the module-level ``generations`` DataFrame.

    Returns
    -------
    The ``generations`` label of the first matching row, or 'Not Provided'
    when the birth year is missing or no row matches.
    """
    if generations_table is None:
        generations_table = generations

    # A missing birth year must not be matched against an open-ended range
    # (written so that NaN, for which every comparison is False, is caught too)
    if not birth_year > 0:
        return 'Not Provided'

    # Iterating through each row
    for _, row in generations_table.iterrows():

        # A bound of 0 means "no bound" on that side of the range
        after_start = row['start_year'] == 0 or row['start_year'] <= birth_year
        before_end = row['end_year'] == 0 or birth_year <= row['end_year']

        # If the birth year falls inside the range, return the generation label
        if after_start and before_end:
            return row['generations']

    # If no matching generation name is found, then return Not Provided
    return 'Not Provided'

In [None]:
# Call find_generation on every Birth_Year value and store the returned label
# in the new generation_name column
ee_dim['generation_name'] = ee_dim['Birth_Year'].apply(find_generation)

# Preview updated ee_dim DataFrame
ee_dim

In [None]:
# Count the non-null values in each column of ee_dim
# (DataFrame.count() reports one number per column, not a single row count)
ee_dim.count()

### Join the monthly roster to the employee demographics data (ee_dim) on employee_id

In [None]:
# Left join: keep every monthly roster row and attach the matching ee_dim
# columns by employee_id. Non-key columns present in both tables receive the
# default _x (roster) / _y (ee_dim) suffixes.
monthly_ee_df = pd.merge(ee_monthly_roster, ee_dim, how='left', on='employee_id')

In [None]:
# Preview the joined monthly_ee_df dataframe
monthly_ee_df

In [None]:
# Show the data type of every column in the monthly_ee_df dataframe
monthly_ee_df.dtypes

### Format the dates so that we can use it later to calculate an employee’s age (in full years)
We need to convert the date columns in the monthly_ee_df dataframe to the datetime datatype

In [None]:
# Parse the day/month/year text columns into new datetime columns
date_columns = (
    ('Date_Of_Birth', 'date_of_birth'),
    ('Month_End_Date', 'month_end_date'),
)
for parsed_column, text_column in date_columns:
    monthly_ee_df[parsed_column] = pd.to_datetime(monthly_ee_df[text_column], format='%d/%m/%Y')

In [None]:
# Preview the table with the parsed Date_Of_Birth and Month_End_Date columns
monthly_ee_df

### Calculate the employee’s age (in full years) as of the month_end_date
Now, we're calculating the employee's age (in full years) after converting the dates format to date datatype

In [None]:
# Calculate full-year age as of month_end_date
birth_date = monthly_ee_df['Date_Of_Birth']
as_of_date = monthly_ee_df['Month_End_Date']

# True where the birthday has already happened in the month_end_date year.
# Comparing calendar fields avoids the drift of "days / 365", which ignores
# leap days and can be off by one year around a birthday.
had_birthday = (
    (as_of_date.dt.month > birth_date.dt.month)
    | (
        (as_of_date.dt.month == birth_date.dt.month)
        & (as_of_date.dt.day >= birth_date.dt.day)
    )
)

# Year difference, minus one when the birthday is still to come
completed_years = as_of_date.dt.year - birth_date.dt.year - (~had_birthday).astype(int)

# Nullable integer dtype: a missing date stays missing (<NA>) instead of
# becoming age 0, so the age_range step can report it as not provided
monthly_ee_df['age'] = completed_years.astype('Int64')

In [None]:
# Preview the DataFrame with the new age column
monthly_ee_df

### Calculate the employee’s age range, in 5-year increments, and name that column age_range

### Criteria: 
- Employees under 20 should be grouped into “Under 20 years”
- Employees between 20 and 69 should be grouped into 5-year increments (“20-24 years”, “25-29 years”, etc.)
- Employees 70 and over should be grouped into “70+ years”
- If the employee’s birth date is missing, the age_range should be “Not provided”

In [None]:
# Lower bound of each 5-year band (20, 25, ..., 65) and the label of the band
# that starts at that bound ("20-24 years", ..., "65-69 years")
age_increments = list(range(20, 70, 5))
age_labels = [f'{band_start}-{band_start + 4} years' for band_start in age_increments]

In [None]:
def assign_age_group(row):
    """
    Return the age range label for one employee row.

    Parameters
    ----------
    row : mapping or Series
        Record that provides the employee's age (in full years) under 'age'.

    Returns
    -------
    str
        - 'Not Provided'   when the age is missing
        - 'Under 20 years' for ages below 20
        - '70+ years'      for ages of 70 and above
        - otherwise the 5-year band containing the age, e.g. '20-24 years'

    NOTES:
    The special groups are handled first, so the band arithmetic at the end
    only runs for ages that are greater than or equal to 20 and less than 70.
    """
    age = row['age']

    if pd.isnull(age):
        return 'Not Provided'

    if age < 20:
        return 'Under 20 years'

    if age >= 70:
        return '70+ years'

    # Round down to the start of the 5-year band. Deriving the label from the
    # age itself keeps 20-24 in "20-24 years" (not the next band) and covers
    # 65-69, which previously fell through without a label.
    band_start = int(age) // 5 * 5
    return f'{band_start}-{band_start + 4} years'

In [None]:
# Call assign_age_group on every row (axis=1) and store the returned label
# in the new 'age_range' column
monthly_ee_df['age_range'] = monthly_ee_df.apply(assign_age_group, axis=1)

In [None]:
# Preview monthly_ee_df with the new age_range column
monthly_ee_df

### Clean up the DataFrames

In [None]:
# Drop the "_x" suffix from the guid, leave_date and hire_date columns so they
# carry their plain names again in the main table
restored_names = {
    'guid_x': 'guid',
    'leave_date_x': 'leave_date',
    'hire_date_x': 'hire_date',
}
monthly_ee_df.rename(columns=restored_names, inplace=True)

In [None]:
# Columns kept in the final monthly roster output
monthly_output_columns = [
    'employee_id',
    'age_range',
    'guid',
    'dc_nbr',
    'month_end_date',
    'hire_date',
    'leave_date',
]
ee_monthly_v3 = monthly_ee_df[monthly_output_columns]

# Preview the final employee monthly roster dataframe
ee_monthly_v3.head()

In [None]:
# Columns kept in the final employee demographics output
dim_output_columns = [
    'employee_id',
    'guid',
    'first_name',
    'last_name',
    'generation_name',
    'nationality',
    'gender',
    'email',
    'hire_date',
    'leave_date',
]
ee_dim_v3 = ee_dim[dim_output_columns]

# Preview the final employee demographics dataframe
ee_dim_v3.head()

### Export dataframes to CSV
Uncomment code to export the data

In [None]:
# Export final dataframes to CSV

#ee_monthly_v3.to_csv('ee_monthly_v3.csv')
#ee_dim_v3.to_csv('ee_dim_v3.csv')

# Practicing Data Viz

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Keep only the columns needed for the chart
chart_columns = ['employee_id', 'age_range', 'month_end_date']
ee_monthly_v4 = monthly_ee_df[chart_columns]

# Preview the dataframe
ee_monthly_v4.head()

In [None]:
# Keep the last four characters of month_end_date, i.e. the part after the last "/"
# (assumes the text ends in a four-digit year, as in dd/mm/yyyy)
year_text = ee_monthly_v4['month_end_date'].str[-4:]
ee_monthly_v4 = ee_monthly_v4.assign(Years=year_text)

# Year of the month_end_date as a whole number; values that cannot be parsed
# are coerced to NaN and then stored as 0
year_numeric = pd.to_numeric(ee_monthly_v4['Years'], errors='coerce')
ee_monthly_v4['Year'] = year_numeric.fillna(0).astype(int)

ee_monthly_v4

In [None]:
# The text column Years is no longer needed now that the numeric Year exists
ee_monthly_v4 = ee_monthly_v4.drop('Years', axis=1)
ee_monthly_v4

In [None]:
# Keep only the first row for each employee_id (duplicates are judged on
# employee_id alone, not on all columns)
# NOTE(review): each employee therefore contributes to a single Year only, the
# one of their first row - confirm this is intended for a per-year headcount
# chart; subset=['employee_id', 'Year'] would keep one row per employee per year.
dataviz_monthly = ee_monthly_v4.drop_duplicates(subset=['employee_id'])
dataviz_monthly

### Set up the data and chart layout to build the visualization
Filtered to only four age ranges for plotting

In [None]:
# Restrict the data to the four age ranges shown in the chart
filter_age_ranges = ['Under 20 years', '20-24 years', '25-29 years', '30-34 years']
in_selected_range = dataviz_monthly['age_range'].isin(filter_age_ranges)
df_filtered = dataviz_monthly[in_selected_range]

# Count the rows for every (Year, age_range) pair, then pivot the age ranges
# into columns so that each column becomes one line of the chart
rows_per_group = df_filtered.groupby(['Year', 'age_range']).size()
custom_group = rows_per_group.unstack()

# Line chart figure: one line per age range, years on the x axis
fig, ax = plt.subplots(figsize=(10, 5))

for range_label, yearly_counts in custom_group.items():
    ax.plot(custom_group.index, yearly_counts, label=range_label, marker='o')

# Axis labels and title
ax.set_xlabel('Year')
ax.set_ylabel('Number of Employees')
ax.set_title('Number of Employees by Age Range Over Years')

# Legend anchored just outside the right edge of the axes
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

plt.tight_layout()

# Write the figure to an image file before displaying it
plt.savefig('age_range_line_chart.png', transparent =True)

plt.show()