### **Table of Contents**
  * [read in data](#read-in-data)
  * [Update cleaning code](#update-cleaning-code)
  * [Generate report](#generate-report)
  * [Plots](#plots)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import dash
import os
import sys

## read in data
Pseudocode:
- read in all the files in the data folder 
  - accounting for them being in xlsx or csv 
- dataframe variable name should end up being file name minus extension

- This allows us to just drop in any export with any name and it should run. 

In [None]:
# for data in sys.path:
#   if data.endswith('.xlsx') or data.endswith('.csv'):
#     df = pd.read_excel(data) if data.endswith('.xlsx') else pd.read_csv(data)
#     print(f"Data loaded from: {data}")
#     break

## Update cleaning code 
Look at the cleaning code we already have. 
We should start making changes to it to account for this. 
We need to make it so the program doesn't crash when something fails 
  - [Try Except logic updates](https://www.w3schools.com/python/python_try_except.asp)
  - make the error messages meaningful
Ideally we will not drop anything from our data 

In [None]:
class DemographicsCleaning:
    """
    A class for cleaning and preprocessing demographic data.

    Provides methods to:
    - Remove unused or mostly null columns
    - Normalize gender values
    - Split the 'Race' column into multiple race columns
    - Drop duplicate rows

    No step modifies the dataframe passed in. A step whose required column
    is missing (or unusable) emits a warning that names the step and the
    reason, then returns the data as-is instead of raising, so one
    unexpected export layout does not abort the whole cleaning run.
    """

    @staticmethod
    def _warn_skipped(step: str, reason: str) -> None:
        """
        Emit a warning explaining why a cleaning step was skipped.

        Args:
            step (str): Name of the cleaning step being skipped.
            reason (str): Human-readable explanation of what was wrong.
        """
        import warnings  # local import keeps the class self-contained

        # stacklevel=3 attributes the warning to the code that called the
        # cleaning step rather than to this helper.
        warnings.warn(f"{step} skipped: {reason}", stacklevel=3)

    @staticmethod
    def remove_unused_columns(df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove columns with mostly null values or unnecessary information.

        Columns from the drop list that are not present are ignored.

        Args:
            df (pd.DataFrame): Input dataframe containing demographic data.

        Returns:
            pd.DataFrame: Dataframe with specified columns removed.
        """
        columns_to_drop = [
            'First Name', 'Last Name', 'Ethnicity Hispanic/Latino',
            'Single Parent', 'Ex-Offender', 'Program: Program Name', 'Outcome'
        ]
        return df.drop(columns=columns_to_drop, errors='ignore')

    @staticmethod
    def normalize_gender(df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize gender values by combining 'Transgender male to female'
        and 'Transgender female to male' into a single 'Transgender' category.

        Args:
            df (pd.DataFrame): Input dataframe containing a 'Gender' column.

        Returns:
            pd.DataFrame: New dataframe with normalized gender values. If
            there is no 'Gender' column, a warning is emitted and the input
            is returned unchanged.
        """
        if 'Gender' not in df.columns:
            DemographicsCleaning._warn_skipped(
                "normalize_gender", "no 'Gender' column found in the data"
            )
            return df

        normalized = df['Gender'].replace({
            'Transgender male to female': 'Transgender',
            'Transgender female to male': 'Transgender'
        })
        # assign() builds a new dataframe, so the caller's data is untouched.
        return df.assign(Gender=normalized)

    @staticmethod
    def split_race_column(df: pd.DataFrame) -> pd.DataFrame:
        """
        Split the 'Race' column into multiple columns
        if multiple races are selected.

        Values are split on ';'. Rows with fewer races than the widest row
        get missing values in the extra columns.

        Args:
            df (pd.DataFrame): Input dataframe containing a 'Race' column.

        Returns:
            pd.DataFrame: Dataframe with new columns Race_1, Race_2, etc.
            in place of 'Race'. If 'Race' is missing or does not hold text,
            a warning is emitted and the input is returned unchanged.
        """
        if 'Race' not in df.columns:
            DemographicsCleaning._warn_skipped(
                "split_race_column", "no 'Race' column found in the data"
            )
            return df

        try:
            race_parts = df['Race'].str.split(';', expand=True)
        except AttributeError:
            # The .str accessor only exists for text columns; a column that
            # was read as numbers (e.g. entirely empty) cannot be split.
            DemographicsCleaning._warn_skipped(
                "split_race_column", "the 'Race' column does not contain text"
            )
            return df

        race_parts.columns = [
            f'Race_{i + 1}' for i in range(race_parts.shape[1])
        ]
        return pd.concat([df.drop(columns=['Race']), race_parts], axis=1)

    @staticmethod
    def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove duplicate rows from the dataframe.

        Args:
            df (pd.DataFrame): Input dataframe.

        Returns:
            pd.DataFrame: Dataframe without duplicate rows.
        """
        return df.drop_duplicates()

    @classmethod
    def clean(cls, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform the full data cleaning process on demographics data.

        Steps include:
        - Removing unused or mostly null columns
        - Normalizing gender values
        - Splitting the 'Race' column into multiple race columns
        - Dropping duplicate rows

        Args:
            df (pd.DataFrame): Raw demographics dataframe.

        Returns:
            pd.DataFrame: Cleaned dataframe ready for analysis.
        """
        df = cls.remove_unused_columns(df)
        df = cls.normalize_gender(df)
        df = cls.split_race_column(df)
        df = cls.drop_duplicates(df)
        return df


class WorceCleaning:
    """
    A placeholder for a class that can be used to clean Worce data.
    This class can be extended in the future to include specific cleaning methods.
    """
    @staticmethod
    def clean(df: pd.DataFrame) -> pd.DataFrame:
        """
        Placeholder method for cleaning Worce data.
        Currently does nothing but can be extended in the future.

        Args:
            df (pd.DataFrame): Input dataframe containing Worce data.

        Returns:
            pd.DataFrame: Unchanged dataframe.
        """
        # Hand the input straight back so callers can already chain on the
        # result; previously the method fell through and returned None.
        return df



## Generate report 

- Overall completion of program only accounting for the new style of classes m1-m4
- completion by year 
- overall completion by pathway 
- completion by year by pathway 
- Feel free to get creative here adding gender etc to get us a better understanding 
- education level and the above... 
- export this as a txt file 

## Plots 
- Look at the various plots 
- make a consistent color scheme
- pick the plots that go with the report above 
- make missing plots 


In [None]:
def plot_salary_by_gender(data):
    """
    Show a box plot of 'Salary' for each value of 'Gender'.

    Drawn with matplotlib directly: seaborn (`sns`) is never imported in
    this notebook, so the earlier seaborn call failed with a NameError.

    Args:
        data (pd.DataFrame): Dataframe with 'Gender' and 'Salary' columns.
    """
    labels = []
    samples = []
    # sort=False keeps the genders in order of first appearance.
    for gender, salaries in data.groupby('Gender', sort=False)['Salary']:
        salaries = salaries.dropna()
        if salaries.empty:
            continue  # nothing to draw for a group with no salary values
        labels.append(str(gender))
        samples.append(salaries.to_numpy())

    plt.figure(figsize=(8, 5))
    if samples:
        plt.boxplot(samples)
        # Boxes are drawn at positions 1..N, so label those ticks.
        plt.xticks(range(1, len(labels) + 1), labels)
    plt.xlabel("Gender")
    plt.ylabel("Salary")
    plt.title("Salary Distribution by Gender")
    plt.show()


def plot_avg_salary_by_city(data):
    """
    Show a horizontal bar chart of the mean 'Salary' per 'Mailing City'.

    Bars are ordered by ascending mean salary.

    Args:
        data (pd.DataFrame): Dataframe with 'Mailing City' and 'Salary'
            columns.
    """
    mean_by_city = data.groupby('Mailing City')['Salary'].mean()
    ordered = mean_by_city.sort_values()
    ordered.plot.barh(figsize=(8, 5), title="Average Salary by KY Region")
    plt.xlabel("Average Salary")
    plt.show()


def plot_placements_over_time(data):
    """
    Show a line chart of how many rows fall into each month of 'Start Date'.

    Args:
        data (pd.DataFrame): Dataframe with a 'Start Date' column.
            NOTE(review): resample needs datetime-like values here; a CSV
            export may load them as text - confirm before relying on this.
    """
    by_start_date = data.set_index('Start Date')
    # NOTE(review): newer pandas releases prefer the 'ME' alias over 'M' -
    # confirm against the pandas version this project pins.
    monthly_counts = by_start_date.resample('M').size()
    monthly_counts.plot.line(marker='o', figsize=(10, 4))
    plt.title("Number of Placements Over Time")
    plt.ylabel("Placements")
    plt.show()


def plot_placement_type_by_program(data):
    """
    Show grouped bars counting rows per 'ATP Placement Type', with one bar
    per 'Program: Program Name' inside each group.

    Drawn with pandas/matplotlib: seaborn (`sns`) is never imported in this
    notebook, so the earlier seaborn call failed with a NameError.

    Args:
        data (pd.DataFrame): Dataframe with 'ATP Placement Type' and
            'Program: Program Name' columns.
    """
    # Rows = placement types, columns = programs, values = row counts.
    counts = pd.crosstab(
        data['ATP Placement Type'], data['Program: Program Name']
    )

    _, ax = plt.subplots(figsize=(10, 6))
    if not counts.empty:  # pandas refuses to plot an empty table
        counts.plot(kind='bar', ax=ax)
    ax.set_xlabel('ATP Placement Type')
    ax.set_ylabel('count')
    plt.xticks(rotation=45)
    plt.title("Placement Type by Program")
    plt.show()


def plot_top_cities(data):
    """
    Show a bar chart of the ten most frequent 'Mailing City' values.

    Args:
        data (pd.DataFrame): Dataframe with a 'Mailing City' column.
    """
    # value_counts() is already ordered most-frequent first.
    counts_per_city = data['Mailing City'].value_counts()
    top_ten = counts_per_city.iloc[:10]
    top_ten.plot.bar(title='Top Cities by Participant Count', figsize=(8, 4))
    plt.ylabel("Count")
    plt.show()

In [1]:
# %% [markdown]
# # Visualization examples
# 
# Visualization was not turned into a class because the project will use Google Looker for dashboard creation; this notebook only showcases how to use the Data Manipulation classes.

# %% [markdown]
# ## Imports

# %%
import pandas as pd
import plotly.express as px
import os
import sys
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)
from dash import Dash, dcc, html, Input, Output
from most_common_pathways_taken_data import Most_common_pathways_taken_data
from completion_rate_data import Completion_rate_data
from cleaning_enrollments_data import EnrollmentsCleaning

# %% [markdown]
# ## Cleaning data
# 
# This step should be done before the use of any of the Data classes

# %%
# Read the enrollments workbook and hand it to the project's EnrollmentsCleaning wrapper.
# The path is relative, so it resolves against the current working directory.
cleaner = EnrollmentsCleaning(pd.read_excel('../../data/ARC_Enrollments.xlsx'))


# %% [markdown]
# ## Most common pathway taken:

# %%
def Dash_most_selected_path_by_cohort() -> Dash:
    """
    Build a Dash app with a cohort dropdown and a pie chart of pathways.

    The data comes from the module-level `cleaner`; its cleaned data is
    wrapped in `Most_common_pathways_taken_data`. Choosing a cohort redraws
    the pie from that cohort's rows, using 'Service' for the slice names
    and 'count' for the slice sizes.

    Returns:
        Dash: The configured app; the caller is responsible for running it.
    """
    # TODO: Add number of students per each cohort
    # TODO: Fix the options on the selection
    # TODO: make colors better
    app = Dash(__name__)

    pathways_data = Most_common_pathways_taken_data(cleaner.Get_clean_data())
    cohort_choices = pathways_data.Get_cohorts_list()

    # Fixed colour per pathway so slices keep their colour across cohorts.
    colors_by_pathway = {
        'Web Development M1': 'blue',
        'Data Analysis M1': 'red',
        'Software Development M1': 'green',
        'Quality Assurance M1': 'yellow',
        'User Experience M1': 'purple',
    }

    cohort_picker = dcc.Dropdown(
        id="dropdown",
        options=cohort_choices,
        value=cohort_choices[0],  # start on the first cohort in the list
        clearable=False,
    )
    app.layout = html.Div(
        [
            html.H2('Cohorts', style={'text-align': "center"}),
            html.P('Select Cohort:'),
            cohort_picker,
            dcc.Graph(id="graph"),
        ],
        style={'backgroundColor': 'white'},
    )

    @app.callback(Output("graph", "figure"), Input("dropdown", "value"))
    def render_cohort_pie(selected_cohort):
        # Redraw the pie for whichever cohort the dropdown currently shows.
        cohort_df = pathways_data.Get_data_by_cohort(selected_cohort)
        return px.pie(
            cohort_df,
            names='Service',
            values='count',
            color='Service',
            color_discrete_map=colors_by_pathway,
        )

    return app

# Build the cohort/pathway app and start serving it on port 8052 with debug mode on.
# NOTE(review): outside a notebook, .run() presumably blocks until the server stops,
# so anything below would not execute until then - confirm.
Dash_most_selected_path_by_cohort().run(debug=True, port=8052)

# %% [markdown]
# ## Completion rates:

# %%
def Dash_completion_rates_by_path() -> Dash:
    """
    Build a Dash app with a pathway dropdown and a completion bar chart.

    The data comes from the module-level `cleaner`; its cleaned data is
    wrapped in `Completion_rate_data`, and the completion percentages are
    rounded to two decimals. Choosing a pathway redraws the bars from the
    rows whose 'Pathway' matches, with 'Module' on the x axis and
    'Successfully Completed' on the y axis.

    Returns:
        Dash: The configured app; the caller is responsible for running it.
    """
    # TODO: fix data structure so visualization doesn't use df
    # TODO: Need to add an extra selection box with the cohorts
    app2 = Dash(__name__)

    completion_data = Completion_rate_data(cleaner.Get_clean_data())
    completion_df = completion_data.Get_completion_percentages().round(2)
    pathway_choices = completion_data.Get_pathways_name(completion_df)

    pathway_picker = dcc.Dropdown(
        id="dropdown",
        options=pathway_choices,
        value=pathway_choices[0],  # start on the first pathway in the list
        clearable=False,
    )
    app2.layout = html.Div(
        [
            html.H2('Pathways Completion', style={'text-align': "center"}),
            html.P('Select pathway:'),
            pathway_picker,
            dcc.Graph(id="graph"),
        ],
        style={'backgroundColor': 'white'},
    )

    @app2.callback(Output("graph", "figure"), Input("dropdown", "value"))
    def render_pathway_bars(selected_pathway):
        # Keep only the rows belonging to the pathway picked in the dropdown.
        is_selected = completion_df['Pathway'] == selected_pathway
        pathway_rows = completion_df[is_selected]
        return px.bar(pathway_rows, x='Module', y='Successfully Completed')

    return app2

# Build the pathway-completion app and start serving it on port 8053 with debug mode on
# (a different port from the cohort app, which uses 8052).
Dash_completion_rates_by_path().run(debug=True, port=8053)




ModuleNotFoundError: No module named 'most_common_pathways_taken_data'

TOC generator 

In [4]:
import json
import os


def generate_toc_from_notebook(notebook_path):
    """
    Parse a local .ipynb file and print Markdown for a Table of Contents.

    Every markdown heading line (a line starting with '#') becomes one
    bullet, indented two spaces per heading level below the first and
    linked to an anchor derived from the lower-cased title.

    Heading level is taken from the leading '#' characters only, so a '#'
    inside the title (e.g. "C#") does not change the indentation. Lines
    inside fenced code blocks are ignored, and a cell whose 'source' is a
    single string rather than a list of lines is handled as well.

    Args:
        notebook_path (str): Path to the notebook file.

    Returns:
        None: The Markdown is printed, not returned. If the file does not
        exist an error message is printed instead.
    """
    if not os.path.isfile(notebook_path):
        print(f"❌ Error: File not found at '{notebook_path}'")
        return

    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    toc_lines = ["### **Table of Contents**\n"]
    for cell in notebook.get('cells', []):
        if cell.get('cell_type') != 'markdown':
            continue

        source = cell.get('source', [])
        if isinstance(source, str):
            # The notebook format allows 'source' to be one multi-line string.
            source = source.splitlines()

        in_fence = False
        for raw_line in source:
            line = raw_line.strip()
            if line.startswith(('```', '~~~')):
                in_fence = not in_fence
                continue
            if in_fence or not line.startswith('#'):
                continue

            title = line.lstrip('#')
            level = len(line) - len(title)  # number of leading '#' only
            title = title.strip()

            # Drop an optional closing run of '#' ("## Title ##").
            head, _, tail = title.rpartition(' ')
            if head and tail and not tail.strip('#'):
                title = head.rstrip()
            if not title:
                continue  # a bare '#' line has nothing to link to

            link = title.lower().replace(' ', '-').strip('-.()')
            indent = '  ' * (level - 1)
            toc_lines.append(f"{indent}* [{title}](#{link})\n")

    toc_markdown = "".join(toc_lines)

    print("\n--- ✅ Copy the Markdown below and paste it "
          "into a new markdown cell ---\n")
    print(toc_markdown)


# Notebook to scan for headings; a relative path, so it resolves against the
# current working directory.
notebook_path = 'ideal.ipynb'
generate_toc_from_notebook(notebook_path)



--- ✅ Copy the Markdown below and paste it into a new markdown cell ---

### **Table of Contents**
  * [read in data](#read-in-data)
  * [Update cleaning code](#update-cleaning-code)
  * [Generate report](#generate-report)
  * [Plots](#plots)

