In [70]:
import pandas as pd
import numpy as np
import plotly as pt

## Cleaning Enrollments

In [71]:
# Read the raw ARC enrollment workbook into `df`.
enrollments_path = "data/ARC Enrollments.xlsx"
df = pd.read_excel(enrollments_path)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Auto Id,KY Region,Full Name,Assessment ID,EnrollmentId,Enrollment Service Name,Service,Projected Start Date,Actual Start Date,Projected End Date,Actual End Date,Outcome,ATP Cohort
0,202109-5224,SOAR,name name,OA-003348,Enrollment-1386,ES-0011193,Career Readiness Workshop,2021-11-11,NaT,NaT,NaT,,NaT
1,202109-5224,SOAR,name name,OA-003348,Enrollment-1386,ES-0013492,Software Development 1,2022-01-05,2022-01-05,2022-04-06,2022-04-06,Successfully Completed,2022-01-01
2,202109-5224,SOAR,name name,OA-003348,Enrollment-1386,ES-0014187,Career Readiness Workshop,2022-03-07,NaT,NaT,NaT,,NaT
3,202109-5224,SOAR,name name,OA-003348,Enrollment-1386,ES-0015022,Software Development 2,2022-05-04,2022-05-04,2022-07-29,2022-07-29,Successfully Completed,2022-05-01
4,202109-5224,SOAR,name name,OA-003348,Enrollment-1386,ES-0015075,Web Development 1,2021-09-08,2021-09-08,2021-12-14,2021-12-14,Successfully Completed,2021-09-01


## Removal, Renaming, and Redundancy

##### Column Removal:

In [72]:
# Columns to drop from the raw frame. 'Service' and 'Outcome' are intentionally
# NOT listed here: the cells further down filter on both of them.
columnsToRemove = [
    "Full Name",
    "Assessment ID",
    "EnrollmentId",
    "Enrollment Service Name",
    "Projected Start Date",
    "Actual Start Date",
    "Projected End Date",
    "Actual End Date",
    "ATP Cohort"
]

# drop() returns a new frame, so `df` is left untouched and this cell can be
# re-run safely. `columns=` is used instead of `axis=1` to match the later
# 'KY Region' drop.
cols_gone_df = df.drop(columns=columnsToRemove)

display(cols_gone_df)

Unnamed: 0,Auto Id,KY Region,Service,Outcome
0,202109-5224,SOAR,Career Readiness Workshop,
1,202109-5224,SOAR,Software Development 1,Successfully Completed
2,202109-5224,SOAR,Career Readiness Workshop,
3,202109-5224,SOAR,Software Development 2,Successfully Completed
4,202109-5224,SOAR,Web Development 1,Successfully Completed
...,...,...,...,...
2028,202504-21723,SOAR,Intro to Programming Core,
2029,202505-22788,SOAR,Intro to Programming Core,
2030,202408-16568,SOAR,Intro to Programming Core,Did Not Complete
2031,202408-16568,SOAR,Supportive Services Referral,


##### Post-Removal Notes

There are a **ton** of Auto IDs that appear on multiple rows, one for each service they are matched with.  Additionally, rows whose 'Outcome' is NaN should be removed, because this list is based entirely on the 'Outcome' column.

In [73]:
# Keep only the rows that have a recorded 'Outcome'; rows where it is NaN
# aren't relevant to the information we're looking for.
has_outcome = cols_gone_df["Outcome"].notna()
nan_drop_df = cols_gone_df[has_outcome]

display(nan_drop_df)

Unnamed: 0,Auto Id,KY Region,Service,Outcome
1,202109-5224,SOAR,Software Development 1,Successfully Completed
3,202109-5224,SOAR,Software Development 2,Successfully Completed
4,202109-5224,SOAR,Web Development 1,Successfully Completed
5,202109-5230,SOAR,Web Development 1,Did Not Complete
7,202109-5233,SOAR,Web Development 2,Successfully Completed
...,...,...,...,...
2008,202303-11035,SOAR,Intro to Programming Core,Partially Completed
2009,202503-21188,SOAR,Intro to Programming Core,Did Not Complete
2020,202501-19999,SOAR,Intro to Programming Core,Partially Completed
2024,202503-20923,SOAR,Intro to Programming Core,Did Not Complete


Dropping 'Software Development 1', 'Web Development 1', and 'Data Analysis 1' because 'times have changed'.

In [74]:
# Services whose rows are excluded from the analysis (treated as vestigial).
to_drop = [
    'Software Development 1',
    'Web Development 1',
    'Data Analysis 1'
]

# isin() compares exact strings, so a name that never occurs in 'Service'
# filters nothing and gives no error. Report such names so a misspelling is
# visible instead of silently leaving rows in.
# NOTE(review): the displayed data contains 'Data Analytics 2', so the
# 'Data Analysis 1' entry may not match the spelling in the data - confirm.
known_services = set(nan_drop_df['Service'].dropna())
unmatched = [name for name in to_drop if name not in known_services]
if unmatched:
    print(f"Warning: no rows matched these services in to_drop: {unmatched}")

# Keep every row whose 'Service' is not in `to_drop`.
ones_removed_df = nan_drop_df[~nan_drop_df['Service'].isin(to_drop)]

display(ones_removed_df)

Unnamed: 0,Auto Id,KY Region,Service,Outcome
3,202109-5224,SOAR,Software Development 2,Successfully Completed
7,202109-5233,SOAR,Web Development 2,Successfully Completed
14,202109-5237,SOAR,Data Analytics 2,Successfully Completed
22,202109-5243,SOAR,Software Development 2,Successfully Completed
25,202109-5243,SOAR,Data Analytics 2,Successfully Completed
...,...,...,...,...
2008,202303-11035,SOAR,Intro to Programming Core,Partially Completed
2009,202503-21188,SOAR,Intro to Programming Core,Did Not Complete
2020,202501-19999,SOAR,Intro to Programming Core,Partially Completed
2024,202503-20923,SOAR,Intro to Programming Core,Did Not Complete


##### Renaming the 'Auto Id' column

In [75]:
# Change the ID column header from 'Auto Id' to 'Auto ID'; rename() returns a
# new frame, so `ones_removed_df` keeps its original header.
column_renames = {'Auto Id': 'Auto ID'}
renamed_df = ones_removed_df.rename(columns=column_renames)

renamed_df

Unnamed: 0,Auto ID,KY Region,Service,Outcome
3,202109-5224,SOAR,Software Development 2,Successfully Completed
7,202109-5233,SOAR,Web Development 2,Successfully Completed
14,202109-5237,SOAR,Data Analytics 2,Successfully Completed
22,202109-5243,SOAR,Software Development 2,Successfully Completed
25,202109-5243,SOAR,Data Analytics 2,Successfully Completed
...,...,...,...,...
2008,202303-11035,SOAR,Intro to Programming Core,Partially Completed
2009,202503-21188,SOAR,Intro to Programming Core,Did Not Complete
2020,202501-19999,SOAR,Intro to Programming Core,Partially Completed
2024,202503-20923,SOAR,Intro to Programming Core,Did Not Complete


##### Dropping the 'KY Region' column because all rows are in the SOAR region

In [76]:
# Remove the region column; every displayed row shows 'SOAR'.
# NOTE(review): only the displayed rows are visible here - confirm the whole
# column holds a single value before relying on this.
region_column = "KY Region"
drop_region_df = renamed_df.drop(columns=[region_column])

drop_region_df

Unnamed: 0,Auto ID,Service,Outcome
3,202109-5224,Software Development 2,Successfully Completed
7,202109-5233,Web Development 2,Successfully Completed
14,202109-5237,Data Analytics 2,Successfully Completed
22,202109-5243,Software Development 2,Successfully Completed
25,202109-5243,Data Analytics 2,Successfully Completed
...,...,...,...
2008,202303-11035,Intro to Programming Core,Partially Completed
2009,202503-21188,Intro to Programming Core,Did Not Complete
2020,202501-19999,Intro to Programming Core,Partially Completed
2024,202503-20923,Intro to Programming Core,Did Not Complete


##### Figuring out what to do with multiple Auto IDs

* Could merge them

In [77]:
# TODO: CSV export of the cleaned enrollments is currently disabled.
# NOTE(review): `_df` is not defined in any visible cell - presumably a
# placeholder for the final cleaned frame; replace it before enabling this line.
# _df.to_csv('data/cleaned/cleaned enrollment.csv', index=False)