# Dashboard Historical student data

## 1. Most common pathways taken:


In [134]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output

### Data cleaning: (This can be extracted to an object, or to a function that is called once and outputs a clean file. While the process is not defined, I will just add the cleaning functions to this file.)

## Questions for Danny:
* What to do when I have an NA in a DateTime series?

In [135]:
# Cleaning and testing function
def Enrollments_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """
        Returns a cleaned version of the raw enrollments DataFrame.

        Steps, in order:
            1. Drop the 'Full Name' column.
            2. Replace missing values in the date, 'Outcome' and 'ATP Cohort' columns with the string 'NA'.
            3. Rename the service 'Data Analytics 2' to 'Data Analysis 2'.
            4. Remove the rows whose 'Service' is in the list of services not needed.
            5. Cast the listed columns to str.
            6. Run Test_enrollments_cleaning on the result.

        Args:
            df: pandas.DataFrame with the raw enrollments (it is not modified)

        Return:
            pandas.DataFrame

        Raises:
            KeyError: if one of the expected columns is missing from df
            AssertionError: raised by Test_enrollments_cleaning when NaN values remain
    """
    # Columns to clean
    COLUMNS_TO_DROP = ['Full Name']
    result = df.drop(columns=COLUMNS_TO_DROP)

    # TODO: fix 'ATP Cohorts'
        # Q: how do I handle 'NA' in a datetime series

    # Fix NaN values
    NAN_VALUE_SUBSTITUTE = 'NA'
    columns_to_fix = [
        'Projected Start Date', 'Actual Start Date', 'Projected End Date',
        'Actual End Date', 'Outcome', 'ATP Cohort',
    ]
    for column in columns_to_fix:
        result[column] = result[column].fillna(NAN_VALUE_SUBSTITUTE)

    # Rename 'Data Analytics 2' to 'Data Analysis 2' for consistency
    result.loc[result['Service'] == 'Data Analytics 2', 'Service'] = 'Data Analysis 2'

    # Delete values not needed
    values_not_needed = {
        'Service': [
            'Software Development 1', 'Software Development 2',
            'Web Development 1', 'Web Development 2',
            'Data Analysis 1', 'Data Analysis 2',
        ],
    }
    for column, unwanted_values in values_not_needed.items():
        result = result[~result[column].isin(unwanted_values)]

    # DataTypes
    # TODO: 'Projected Start Date', 'Actual Start Date', 'Projected End Date', 'Actual End Date'
    #       are all datetime types, but they hold the 'NA' placeholder, so they stay as str for now
    column_datatype = {
        'Auto Id': str, 'KY Region': str, 'Assessment ID': str, 'EnrollmentId': str,
        'Enrollment Service Name': str, 'Service': str, 'Projected Start Date': str,
        'Actual Start Date': str, 'Projected End Date': str, 'Actual End Date': str,
        'Outcome': str, 'ATP Cohort': str,
    }
    # astype with a mapping returns a new DataFrame, so nothing is assigned into the
    # filtered slice created above (which would trigger SettingWithCopyWarning)
    result = result.astype(column_datatype)

    # The tests run inside the cleaning function because they cannot live in a separate
    # folder structure until the testing methods are defined
    Test_enrollments_cleaning(result)

    return result

def Test_enrollments_cleaning(clean_df: pd.DataFrame):
    """
        Checks that a cleaned enrollments DataFrame has no missing values.

        Args:
            clean_df: pandas.DataFrame to validate

        Return:
            None

        Raises:
            AssertionError: if any cell of clean_df is missing (NaN / None / NaT)
    """
    # Parameter can be changed to an in-function call of the data cleaner with the DF

    # bool() + `if` instead of `~`: on a plain Python bool `~` gives -1 / -2, which are
    # both truthy, so the check would silently pass whenever pandas returns a builtin bool
    has_missing_values = bool(clean_df.isna().any().any())

    # Explicit raise (same exception type as before) so the check also runs under `python -O`
    if has_missing_values:
        raise AssertionError('The Dataframe has na values.')

In [None]:
# Load the raw export and clean it.
# Forward slashes work on Windows and POSIX alike; the previous backslash-separated
# path could only be resolved on Windows.
enrollments = pd.read_excel('Data/Raw/ARC Enrollments.xlsx')
enrollments = Enrollments_cleaning(enrollments)
enrollments

Index(['Auto Id', 'KY Region', 'Assessment ID', 'EnrollmentId',
       'Enrollment Service Name', 'Service', 'Projected Start Date',
       'Actual Start Date', 'Projected End Date', 'Actual End Date', 'Outcome',
       'ATP Cohort'],
      dtype='object')

### 1.1 Most common path by period
- Periods are going to be defined by ATP Cohort

In [137]:
# Starter pathways (the 'M1' entry of each pathway) are the only services that have to be
# taken into consideration when looking at which pathway students choose in each period
# TODO: maybe make them generate automatically
Starter_pathways = [
    f'{pathway_name} M1'
    for pathway_name in (
        'Web Development',
        'Data Analysis',
        'Software Development',
        'Quality Assurance',
        'User Experience',
    )
]

In [138]:
# Which services make up the rows whose 'ATP Cohort' is 'NA'?
enrollments.loc[enrollments['ATP Cohort'].eq('NA')].value_counts('Service')

# Looks like pathways are not represented in the ATP Cohort 'NA' values (probably has more to do with support entries)
# only 6 of the 460 ATP cohort 'NA' values are pathways

Service
Career Readiness Workshop           224
One-on-one Job Readiness             87
Remote Jobs Workshop (EDA Grant)     30
Referral to External Service         20
Supportive Services Referral         20
Tech Communications Workshop         20
Employer Event (Code:You)            14
Demo Day Participant                 12
Remote Jobs Workshop (non EDA)        8
Resume Review and Optimization        6
Technical Project Management          6
Interview Guidance and Practice       5
Data Analysis M4                      2
Revised Resume Review                 2
Web Development M4                    2
Software Development M4               1
Name: count, dtype: int64

In [139]:
def Get_starting_pathways(df: pd.DataFrame): 
    """
        Keeps only the rows whose 'Service' is one of the beginning (M1) pathways.

        Args: 
            df: pandas.DataFrame with a 'Service' column

        Return:
            pandas.DataFrame with the matching rows of df
    """
    first_modules = [
        f'{pathway_name} M1'
        for pathway_name in (
            'Web Development',
            'Data Analysis',
            'Software Development',
            'Quality Assurance',
            'User Experience',
        )
    ]
    is_starter = df['Service'].isin(first_modules)
    return df[is_starter]

def Get_cohorts_list(df: pd.DataFrame):
    """
        Lists the selectable cohorts: 'All cohorts' followed by every distinct
        'ATP Cohort' date in ascending order. Rows whose cohort is 'NA' are ignored.

        Args:
            df: pandas.DataFrame with an 'ATP Cohort' column

        Return:
            list
    """
    cohort_column = df['ATP Cohort']
    known_cohorts = cohort_column[cohort_column != 'NA']
    ordered_dates = pd.to_datetime(known_cohorts).sort_values(ascending=True)
    return ['All cohorts', *ordered_dates.unique()]

def Get_data_by_cohort(df: pd.DataFrame, cohort: str = 'All cohorts') -> pd.DataFrame:
    """
        Counts the enrollments per 'Service', optionally restricted to a single cohort.

        Args:
            df: pandas.DataFrame with 'Service' and 'ATP Cohort' columns
            cohort: 'All cohorts' (default) to use every row, otherwise a cohort date
                    that pandas.Timestamp can parse

        Return:
            pandas.DataFrame with one row per service and its number of rows
            (the result of value_counts('Service').reset_index())
    """
    if isinstance(cohort, str) and cohort == 'All cohorts':
        selected = df
    else:
        # Enrollments_cleaning casts 'ATP Cohort' to str (with 'NA' placeholders), and a
        # string never compares equal to a Timestamp, so the column is parsed to datetimes
        # first; 'NA' and anything unparsable become NaT and match no cohort.
        cohort_column = df['ATP Cohort']
        cohort_dates = pd.to_datetime(cohort_column.where(cohort_column != 'NA'), errors='coerce')
        selected = df[cohort_dates == pd.Timestamp(cohort)]

    return selected.value_counts('Service').reset_index()

In [140]:
# TODO: Finish tests
def Test_Get_starting_pathways():
    """
        Builds the mock enrollments used for testing Get_starting_pathways.
        No assertions are made yet; the mock DataFrame is only returned.

        Return:
            pandas.DataFrame with five rows, one of which is a starter pathway ('Web Development M1')
    """
    column_names = [
        'Auto Id', 'KY Region', 'Assessment ID', 'EnrollmentId', 'Enrollment Service Name', 'Service',
        'Projected Start Date', 'Actual Start Date', 'Projected End Date', 'Actual End Date',
        'Outcome', 'ATP Cohort',
    ]
    # One tuple per enrollment, in the same order as column_names
    rows = [
        ('202303-11274', 'SOAR', 'OA-010946', 'Enrollment-7415', 'ES-0021827', 'Web Development M1',
         '2024-03-11 00:00:00', '2024-03-11 00:00:00', '2024-04-12 00:00:00', '2024-04-12 00:00:00',
         'Successfully Completed', '2024-01-01 00:00:00'),
        ('202206-8668', 'SOAR', 'OA-016863', 'Enrollment-9631', 'ES-0024437', 'Intro to Programming Core',
         '2024-08-15 00:00:00', 'NA', 'NA', '2024-09-29 00:00:00',
         'Successfully Completed', '2024-08-01 00:00:00'),
        ('202110-5639', 'SOAR', 'OA-004316', 'Enrollment-5320', 'ES-0022632', 'Intro to Programming Core',
         '2024-05-06 00:00:00', '2024-05-06 00:00:00', 'NA', '2024-06-28 00:00:00',
         'Successfully Completed', '2024-05-01 00:00:00'),
        ('202410-17704', 'SOAR', 'OA-019195', 'Enrollment-11703', 'ES-0029379', 'Intro to Programming Core',
         '2025-01-07 00:00:00', '2025-01-13 00:00:00', '2025-02-18 00:00:00', '2025-02-04 00:00:00',
         'Did Not Complete', '2025-01-01 00:00:00'),
        ('202208-9220', 'SOAR', 'OA-008218', 'Enrollment-6405', 'ES-0016248', 'Salesforce',
         '2022-09-07 00:00:00', 'NA', 'NA', '2022-11-18 00:00:00',
         'Successfully Completed', '2022-09-01 00:00:00'),
    ]
    return pd.DataFrame(rows, columns=column_names)
Test_Get_starting_pathways()

Unnamed: 0,Auto Id,KY Region,Assessment ID,EnrollmentId,Enrollment Service Name,Service,Projected Start Date,Actual Start Date,Projected End Date,Actual End Date,Outcome,ATP Cohort
0,202303-11274,SOAR,OA-010946,Enrollment-7415,ES-0021827,Web Development M1,2024-03-11 00:00:00,2024-03-11 00:00:00,2024-04-12 00:00:00,2024-04-12 00:00:00,Successfully Completed,2024-01-01 00:00:00
1,202206-8668,SOAR,OA-016863,Enrollment-9631,ES-0024437,Intro to Programming Core,2024-08-15 00:00:00,,,2024-09-29 00:00:00,Successfully Completed,2024-08-01 00:00:00
2,202110-5639,SOAR,OA-004316,Enrollment-5320,ES-0022632,Intro to Programming Core,2024-05-06 00:00:00,2024-05-06 00:00:00,,2024-06-28 00:00:00,Successfully Completed,2024-05-01 00:00:00
3,202410-17704,SOAR,OA-019195,Enrollment-11703,ES-0029379,Intro to Programming Core,2025-01-07 00:00:00,2025-01-13 00:00:00,2025-02-18 00:00:00,2025-02-04 00:00:00,Did Not Complete,2025-01-01 00:00:00
4,202208-9220,SOAR,OA-008218,Enrollment-6405,ES-0016248,Salesforce,2022-09-07 00:00:00,,,2022-11-18 00:00:00,Successfully Completed,2022-09-01 00:00:00


In [141]:
def Dash_most_selected_path_by_cohort(data: pd.DataFrame) -> Dash: # Need to pass the dataframe argument because of how the Data is structure
    """
        Builds the Dash app that shows, as a pie chart, how the starter-pathway
        enrollments are split between services for the cohort picked in a dropdown.

        Args:
            data: pandas.DataFrame with the enrollments; it is narrowed down with
                  Get_starting_pathways before anything is displayed

        Return:
            dash.Dash app with layout and callback registered (the server is not started here)
    """
    # TODO: Add number of students per each cohort 
    # TODO: Fix the options on the selection 
    # TODO: make colors better
    app = Dash(__name__)

    # Const
    # This function should be able to communicate with the data without an argument
    starter_enrollments = Get_starting_pathways(data)
    cohort_choices = Get_cohorts_list(starter_enrollments)
    service_colors = {
        'Web Development M1': 'blue',
        'Data Analysis M1': 'red', 
        'Software Development M1': 'green',
        'Quality Assurance M1': 'yellow', 
        'User Experience M1': 'purple'
    }

    # Display
    title = html.H2('Cohorts', style={'text-align': "center"})
    prompt = html.P('Select Cohort:')
    cohort_dropdown = dcc.Dropdown(
        id="dropdown",
        options=cohort_choices,
        value=cohort_choices[0],
        clearable=False,
    )
    pie_chart = dcc.Graph(id="graph")
    app.layout = html.Div(
        [title, prompt, cohort_dropdown, pie_chart],
        style={'backgroundColor':'white'},
    )

    # Graph: redrawn every time the dropdown value changes
    @app.callback(Output("graph", "figure"), Input("dropdown", "value"))
    def render_cohort_pie(selected_cohort):
        service_counts = Get_data_by_cohort(starter_enrollments, selected_cohort)
        return px.pie(
            service_counts,
            names='Service',
            values='count',
            color='Service',
            color_discrete_map=service_colors,
        )

    return app

# Build the cohort dashboard, then serve it in debug mode on port 8052
cohort_dashboard = Dash_most_selected_path_by_cohort(enrollments)
cohort_dashboard.run(debug=True, port=8052)

### 2. Completion rates

### 2.1 completion rate by path

In [142]:
# Every module (M1 to M4) of every pathway, grouped by pathway
pathways = [
    f'{pathway_name} M{module_number}'
    for pathway_name in (
        'Web Development',
        'Data Analysis',
        'Software Development',
        'Quality Assurance',
        'User Experience',
    )
    for module_number in range(1, 5)
]

In [143]:
def Get_completion_percentages(df: pd.DataFrame, cohort: str = 'All cohorts') -> pd.DataFrame:
    """
        Computes, for every module listed in `pathways`, the proportion of its
        enrollments that ended in each 'Outcome'.

        Args:
            df: pandas.DataFrame with 'Service', 'Outcome' and 'ATP Cohort' columns
            cohort: 'All cohorts' (default) to use every row, otherwise a cohort date
                    that pandas.Timestamp can parse

        Return:
            pandas.DataFrame with one row per module: a 'Module' column, one column per
            outcome found in the data (proportions, 0 when the outcome never occurs for
            that module) and a 'Pathway' column with the module name minus its last word
    """
    if isinstance(cohort, str) and cohort == 'All cohorts':
        data = df
    else:
        # Enrollments_cleaning casts 'ATP Cohort' to str (with 'NA' placeholders), and a
        # string never compares equal to a Timestamp, so the column is parsed to datetimes
        # first; 'NA' and anything unparsable become NaT and match no cohort.
        cohort_column = df['ATP Cohort']
        cohort_dates = pd.to_datetime(cohort_column.where(cohort_column != 'NA'), errors='coerce')
        data = df[cohort_dates == pd.Timestamp(cohort)]

    # {module: {outcome: proportion}}; to_dict() avoids relying on the column names that
    # value_counts().reset_index() produces, which differ between pandas versions
    completion_dictionary = {
        path: data.loc[data['Service'] == path, 'Outcome'].value_counts(normalize=True).to_dict()
        for path in pathways
    }

    result_df = pd.DataFrame(completion_dictionary).transpose().fillna(0).rename_axis('Module').reset_index()

    # Drop the trailing module tag ('Web Development M1' -> 'Web Development');
    # intended to be able to sort by pathway
    result_df['Pathway'] = result_df['Module'].str.rsplit(' ', n=1).str[0]
    return result_df
# TODO: Add test

def Get_pathways_name(df: pd.DataFrame) -> list:
    """
        Lists the distinct values of the 'Pathway' column, in order of first appearance.

        Args:
            df: pandas.DataFrame with a 'Pathway' column

        Return:
            list
    """
    distinct_pathways = df['Pathway'].drop_duplicates()
    return distinct_pathways.tolist()



In [144]:
# Preview the completion table computed over all cohorts
Get_completion_percentages(enrollments, cohort='All cohorts')

Unnamed: 0,Module,Successfully Completed,Did Not Complete,Partially Completed,NA,Pathway
0,Web Development M1,0.791667,0.15625,0.052083,0.0,Web Development
1,Web Development M2,0.807692,0.153846,0.038462,0.0,Web Development
2,Web Development M3,0.59375,0.09375,0.0,0.3125,Web Development
3,Web Development M4,0.675,0.325,0.0,0.0,Web Development
4,Data Analysis M1,0.777778,0.157407,0.064815,0.0,Data Analysis
5,Data Analysis M2,0.604651,0.27907,0.093023,0.023256,Data Analysis
6,Data Analysis M3,0.634615,0.076923,0.038462,0.25,Data Analysis
7,Data Analysis M4,0.575758,0.393939,0.030303,0.0,Data Analysis
8,Software Development M1,0.8,0.181818,0.018182,0.0,Software Development
9,Software Development M2,0.744186,0.209302,0.046512,0.0,Software Development


In [145]:
def Dash_completion_rates_by_path(df: pd.DataFrame) -> Dash: # TODO: fix data structure so visualization doesn't use df
    """
        Builds the Dash app that shows, as a bar chart, the 'Successfully Completed'
        proportion of each module of the pathway picked in a dropdown.

        Args:
            df: pandas.DataFrame with the enrollments, passed to Get_completion_percentages

        Return:
            dash.Dash app with layout and callback registered (the server is not started here)
    """
    app2 = Dash(__name__)
    # Const
    completion_df = Get_completion_percentages(df)
    options = Get_pathways_name(completion_df)

    pathway_color = {
        'Software Development': 'green', 
        'Web Development': 'blue', 
        'Data Analysis': 'red',
        'Quality Assurance': 'yellow', 
        'User Experience': 'purple'
    }

    # Display
    app2.layout = html.Div([
        html.H2('Pathways Completion', style={'text-align': "center"}),
        html.P('Select pathway:'),
        dcc.Dropdown(
            id="dropdown",
            options=options,
            value=options[0],
            clearable=False,
        ),
        dcc.Graph(id="graph")
        
    ], style={'backgroundColor':'white'})

    # Graph
    # TODO: Need to add an extra selection box with the cohorts
    @app2.callback(
        Output("graph", "figure"),
        Input("dropdown", "value"))
    def Display_pathway_completion(p):
        # Named pathway_df so it does not shadow the `df` argument of the outer function
        pathway_df = completion_df[completion_df['Pathway'] == p]
        # pathway_color was defined but never applied; colouring by 'Pathway' puts it to use
        fig = px.bar(
            pathway_df,
            x='Module',
            y='Successfully Completed',
            color='Pathway',
            color_discrete_map=pathway_color,
        )
        return fig

    return app2

# Build the completion-rate dashboard, then serve it in debug mode on port 8053
completion_dashboard = Dash_completion_rates_by_path(enrollments)
completion_dashboard.run(debug=True, port=8053)

In [146]:
# Just testing: static bar chart of the 'Successfully Completed' proportion of every module
completion = Get_completion_percentages(enrollments)
px.bar(data_frame=completion, x='Module', y='Successfully Completed').show()