# ARC Application - Data cleaning

All data cleaning is done in the function `Enrollments_cleaning`.

Notes:
- All values in the 'KY Region' field are 'SOAR'.

Questions:
- Why are some 'Actual Start Date' fields NA when there is an 'Actual End Date'?

In [37]:
import pandas as pd
import plotly.express as px

# Result:

In [38]:
def Enrollments_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw enrollments table.

    The 'Full Name' column is removed, and missing values in the four date
    columns, 'Outcome' and 'ATP Cohort' are replaced with the string 'NA'.
    The frame passed in is not modified.

    Args:
        df: Raw enrollments data.

    Returns:
        A new DataFrame without 'Full Name' and with the listed columns filled.

    Raises:
        KeyError: If 'Full Name' or any of the filled columns is absent.
    """
    placeholder = 'NA'
    fields_to_fill = (
        'Projected Start Date',
        'Actual Start Date',
        'Projected End Date',
        'Actual End Date',
        'Outcome',
        'ATP Cohort',
    )

    trimmed = df.drop(columns=['Full Name'])

    # Every listed column receives the same placeholder wherever it is missing.
    patched = {name: trimmed[name].fillna(placeholder) for name in fields_to_fill}
    return trimmed.assign(**patched)

In [39]:
def Test_enrollments_cleaning(clean_df: pd.DataFrame):
    """Assert that a cleaned enrollments frame contains no missing values.

    Args:
        clean_df: DataFrame to check. The caller passes an already-cleaned
            frame; this could be changed to run the data cleaner on a raw
            frame inside this function instead.

    Raises:
        AssertionError: If any cell of ``clean_df`` is NA.
    """
    # Use `not` rather than `~`: bitwise inversion only behaves as logical
    # negation on NumPy booleans. On a plain Python bool, `~True` is -2, which
    # is truthy and would let the assertion pass even with NA values present.
    has_na = bool(clean_df.isna().any().any())
    assert not has_na, 'The Dataframe has na values.'

# Exploratory Data Analysis

In [40]:
# Load the raw enrollments workbook. Forward slashes keep this relative path
# valid on Windows as well as on Linux/macOS; a backslash-separated path would
# only resolve on Windows.
enrollments = pd.read_excel('Data/Raw/ARC Enrollments.xlsx')
enrollments = Enrollments_cleaning(enrollments)
enrollments
# Sanity check: raises AssertionError if any NA value is left after cleaning.
Test_enrollments_cleaning(enrollments)


### Understanding the categories of data


In [41]:
# Service names treated as pathways: the numbered courses first, followed by
# modules M1-M4 of each track.
# NOTE(review): 'Data Analytics 2' is spelled differently from 'Data Analysis 1'
# - presumably it mirrors the raw 'Service' value; confirm against the data.
pathways = [
    'Software Development 1',
    'Software Development 2',
    'Web Development 1',
    'Web Development 2',
    'Data Analysis 1',
    'Data Analytics 2',
    *(
        f'{track} M{module}'
        for track in (
            'Web Development',
            'Data Analysis',
            'Software Development',
            'Quality Assurance',
            'User Experience',
        )
        for module in range(1, 5)
    ),
]

# Service names treated as workshops (matched exactly against 'Service').
workshops = [
    "JavaScript - React",
    "AWS",
    "Salesforce",
    "UofL Cyber Security Certificate",
    "Intro to Programming Core",
    "Artificial Intelligence M1",
    "Technical Project Management",
]

# Service names treated as student support (matched exactly against 'Service').
support_ways = [
    "Career Readiness Workshop",
    "One-on-one Job Readiness",
    "Interview Guidance and Practice",
    "Remote Jobs Workshop (EDA Grant)",
    "Employer Event (Code:You)",
    "Loaner Laptop",
    "Demo Day Participant",
    "Tech Communications Workshop",
    "Remote Jobs Workshop (non EDA)",
    "Referral to External Service",
    "Supportive Services Referral",
    "Resume Review and Optimization",
    "Revised Resume Review",
]

# This category is only used to find the pathway chosen by students:
# the first numbered course or the first module (M1) of each track.
Starter_pathways = [
    *(
        f'{track} 1'
        for track in ('Software Development', 'Web Development', 'Data Analysis')
    ),
    *(
        f'{track} M1'
        for track in (
            'Web Development',
            'Data Analysis',
            'Software Development',
            'Quality Assurance',
            'User Experience',
        )
    ),
]

In [42]:
# Tag every row with the kind of service it refers to. Rows whose 'Service'
# appears in none of the lists keep an empty category.
enrollments['Category'] = ''

# Applied in this order, so a later group would overwrite an earlier one if a
# service name were ever listed in more than one group.
for label, services in (
    ('Pathway', pathways),
    ('Workshop', workshops),
    ('Student Support', support_ways),
):
    enrollments.loc[enrollments['Service'].isin(services), 'Category'] = label

enrollments

Unnamed: 0,Auto Id,KY Region,Assessment ID,EnrollmentId,Enrollment Service Name,Service,Projected Start Date,Actual Start Date,Projected End Date,Actual End Date,Outcome,ATP Cohort,Category
0,202109-5224,SOAR,OA-003348,Enrollment-1386,ES-0011193,Career Readiness Workshop,2021-11-11 00:00:00,,,,,,Student Support
1,202109-5224,SOAR,OA-003348,Enrollment-1386,ES-0013492,Software Development 1,2022-01-05 00:00:00,2022-01-05 00:00:00,2022-04-06 00:00:00,2022-04-06 00:00:00,Successfully Completed,2022-01-01 00:00:00,Pathway
2,202109-5224,SOAR,OA-003348,Enrollment-1386,ES-0014187,Career Readiness Workshop,2022-03-07 00:00:00,,,,,,Student Support
3,202109-5224,SOAR,OA-003348,Enrollment-1386,ES-0015022,Software Development 2,2022-05-04 00:00:00,2022-05-04 00:00:00,2022-07-29 00:00:00,2022-07-29 00:00:00,Successfully Completed,2022-05-01 00:00:00,Pathway
4,202109-5224,SOAR,OA-003348,Enrollment-1386,ES-0015075,Web Development 1,2021-09-08 00:00:00,2021-09-08 00:00:00,2021-12-14 00:00:00,2021-12-14 00:00:00,Successfully Completed,2021-09-01 00:00:00,Pathway
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028,202504-21723,SOAR,OA-022760,Enrollment-14196,ES-0035149,Intro to Programming Core,2025-05-12 00:00:00,2025-05-12 00:00:00,2025-06-27 00:00:00,,,2025-05-01 00:00:00,Workshop
2029,202505-22788,SOAR,OA-023710,Enrollment-14213,ES-0035212,Intro to Programming Core,2025-05-14 00:00:00,2025-05-13 00:00:00,2025-06-17 00:00:00,,,2025-05-01 00:00:00,Workshop
2030,202408-16568,SOAR,OA-017961,Enrollment-14833,ES-0036429,Intro to Programming Core,2025-05-12 00:00:00,,,,Did Not Complete,2025-05-01 00:00:00,Workshop
2031,202408-16568,SOAR,OA-017961,Enrollment-14833,ES-0036430,Supportive Services Referral,,,,,,,Student Support


In [43]:
# Number of rows per category, flattened into a two-column frame for plotting.
pie_df = (
    enrollments
    .value_counts(subset='Category')
    .reset_index()
)

In [44]:
# Pie chart of how many rows fall into each service category.
fig = px.pie(
    data_frame=pie_df,
    names='Category',
    values='count',
    title='Data structure by type of Service',
)
fig.show()

### Understanding 'ATP Cohort'

In [45]:
# Outcome breakdown for the rows whose 'ATP Cohort' holds the 'NA' placeholder.
enrollments.loc[enrollments['ATP Cohort'].eq('NA')].value_counts(subset='Outcome')

Outcome
NA                        445
Successfully Completed     10
Did Not Complete            5
Name: count, dtype: int64