# Headline goes here

By [Iris Lee](https://www.latimes.com/people/iris-lee)

Summary of the findings goes here with a link to the story

## Configure

In [1]:
%load_ext nb_black 

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import altair as alt
import altair_latimes as lat

<IPython.core.display.Javascript object>

In [3]:
pd.set_option('display.max_columns', None)

<IPython.core.display.Javascript object>

In [4]:
# Register the 'latimes' theme provided by altair_latimes and make it the
# active theme for the charts rendered in this notebook.
# NOTE(review): recent Altair releases appear to deprecate `alt.themes` in
# favor of `alt.theme` — confirm against the Altair version in use.
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

<IPython.core.display.Javascript object>

## Import

Read in data manually parsed from PDFs provided by the Los Angeles Unified School District.

In [5]:
# Load the all-grades extract.
grades_df = pd.read_csv("input/all-grades-complete.csv")

<IPython.core.display.Javascript object>

In [6]:
# Load the grades 9-12 extract; later cells use it as the high school group breakdown.
groups_high_df = pd.read_csv("input/grades9-12-complete.csv")

<IPython.core.display.Javascript object>

In [7]:
# Load the grades 6-8 extract; later cells use it as the middle school group breakdown.
groups_middle_df = pd.read_csv("input/grades6-8-complete.csv")

<IPython.core.display.Javascript object>

## Transform

Clean up column headers

In [8]:
# Maps raw CSV headers to the snake_case column names used by the rest of the
# notebook. The same crosswalk is applied to all three input frames.
columns_crosswalk = {
    'type': 'type',
    'groupname': 'group_name',
    'Grades': "grades",
    'Semester School Year': "semester_name",
    # Two spellings of the A-count header are accepted; both map to a_count.
    '# of As': "a_count",
    "# of A's": "a_count",
    "# of B's": "b_count",
    "# of C's": "c_count",
    "# of D's": "d_count",
    "# of F's": "f_count",
    "# of P's": "p_count",
    "# of N's": "n_count",
    "# of I's": "i_count",
    'Total Marks': "marks_count",
    # Percentage columns as supplied in the source files.
    '% A': "a_percent",
    '% B': "b_percent",
    '% C': "c_percent",
    'A-C %': "a_to_c_percent",
    '% D': "d_percent",
    '% F': "f_percent",
    '% P': "p_percent",
    '% N': "n_percent",
    '% I': "i_percent"
}

<IPython.core.display.Javascript object>

In [9]:
# Apply the header crosswalk. Reassigning the result instead of using
# inplace=True keeps the step an explicit new-from-old transformation.
grades_df = grades_df.rename(columns=columns_crosswalk)

<IPython.core.display.Javascript object>

In [10]:
# Apply the header crosswalk. Reassigning the result instead of using
# inplace=True keeps the step an explicit new-from-old transformation.
groups_high_df = groups_high_df.rename(columns=columns_crosswalk)

<IPython.core.display.Javascript object>

In [11]:
# Apply the header crosswalk. Reassigning the result instead of using
# inplace=True keeps the step an explicit new-from-old transformation.
groups_middle_df = groups_middle_df.rename(columns=columns_crosswalk)

<IPython.core.display.Javascript object>

Annotate with semester identifier

In [12]:
def parse_semester_id(x):
    """
    Build a semester identifier from a raw semester label.

    The label is split on whitespace; the last token and the first token
    are then joined with a hyphen, in that order.
    """
    tokens = x.split()
    first_token, last_token = tokens[0], tokens[-1]
    return "{}-{}".format(last_token, first_token)

<IPython.core.display.Javascript object>

In [13]:
# Tag each row with the semester identifier derived from its semester_name.
grades_df["semester_id"] = grades_df["semester_name"].apply(parse_semester_id)

<IPython.core.display.Javascript object>

In [14]:
# Tag each row with the semester identifier derived from its semester_name.
groups_high_df["semester_id"] = groups_high_df["semester_name"].apply(parse_semester_id)

<IPython.core.display.Javascript object>

In [15]:
# Tag each row with the semester identifier derived from its semester_name.
groups_middle_df["semester_id"] = groups_middle_df["semester_name"].apply(parse_semester_id)

<IPython.core.display.Javascript object>

Clean up data values

In [16]:
def safestr(val):
    """
    Normalize a raw cell value into a compact, lowercase string.

    The value is always passed through str() first, so non-string inputs
    come back as their text form rather than untouched. Commas, percent
    signs, a non-ASCII hyphen look-alike and spaces are then removed,
    surrounding whitespace is trimmed and the result is lowercased.
    An ASCII "-" is left in place.
    """
    text = str(val)
    # Characters treated as cruft in the raw data.
    for junk in (",", "%", "‐", " "):
        text = text.replace(junk, "")
    return text.strip().lower()

<IPython.core.display.Javascript object>

In [17]:
def clean_df(df, str_columns=("grades", "semester_name", "semester_id", "type", "group_name")):
    """
    Return a cleaned copy of the provided dataframe.

    Every cell is normalized with safestr(). Every column not listed in
    ``str_columns`` is then converted with pd.to_numeric, and values that
    cannot be parsed are coerced to NaN.

    Args:
        df: The dataframe to clean. The caller's frame is not modified.
        str_columns: Names of the columns that should stay as strings.

    Returns:
        A new dataframe with the same columns as the input.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favor of
    # DataFrame.map. Check the class rather than the instance so that a
    # column named "map" on older pandas cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        df = df.map(safestr)
    else:
        df = df.applymap(safestr)
    # Convert all the numbers
    num_columns = [c for c in df.columns if c not in str_columns]
    df[num_columns] = df[num_columns].apply(pd.to_numeric, errors="coerce")
    # Pass it back
    return df

<IPython.core.display.Javascript object>

In [18]:
# Normalize the string cells and coerce the remaining columns to numbers.
grades_df = clean_df(grades_df)

<IPython.core.display.Javascript object>

In [19]:
# Normalize the string cells and coerce the remaining columns to numbers.
groups_high_df = clean_df(groups_high_df)

<IPython.core.display.Javascript object>

In [20]:
# Normalize the string cells and coerce the remaining columns to numbers.
groups_middle_df = clean_df(groups_middle_df)

<IPython.core.display.Javascript object>

## Filter

We only want to analyze the fall semester. _Explain why here_

In [21]:
def filter_to_fall(df):
    """
    Keep only the rows whose semester_id contains "fall".

    The surviving rows are returned sorted by semester_id.
    """
    is_fall = df["semester_id"].str.contains("fall")
    fall_rows = df[is_fall]
    return fall_rows.sort_values("semester_id")

<IPython.core.display.Javascript object>

In [22]:
# Fall-semester rows of the all-grades frame, ordered by semester_id.
fall_grades_df = filter_to_fall(grades_df)

<IPython.core.display.Javascript object>

In [23]:
# Fall-semester rows of the grades 9-12 group frame, ordered by semester_id.
fall_groups_high_df = filter_to_fall(groups_high_df)

<IPython.core.display.Javascript object>

In [24]:
# Fall-semester rows of the grades 6-8 group frame, ordered by semester_id.
fall_groups_middle_df = filter_to_fall(groups_middle_df)

<IPython.core.display.Javascript object>

## Aggregate

Add up the totals for each semester.

In [25]:
# The grouping key plus the five letter-grade tallies summed per semester.
agg_columns = [
    "semester_id",
    "a_count",
    "b_count",
    "c_count",
    "d_count",
    "f_count",
]

<IPython.core.display.Javascript object>

In [26]:
# Sum the A-F tallies within each fall semester. P, N and I marks are not in
# agg_columns, so they are left out of these totals.
# NOTE(review): f_count prints as a float in the saved output while the other
# counts are ints, which suggests at least one f_count value was coerced to
# NaN during cleaning — worth checking the source rows.
semester_counts = (
    fall_grades_df.loc[:, agg_columns]
    .groupby(by="semester_id")
    .sum()
)

<IPython.core.display.Javascript object>

In [27]:
semester_counts

Unnamed: 0_level_0,a_count,b_count,c_count,d_count,f_count
semester_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-2019-fall,523006,383747,319368,157246,156838.0
2019-2020-fall,527677,380665,313048,153262,146598.0
2020-2021-fall,517985,270268,257781,236095,154518.0


<IPython.core.display.Javascript object>

Add up group totals

In [28]:
# Identifying columns plus the five letter-grade tallies kept for group totals.
concat_columns = [
    "semester_id",
    "type",
    "group_name",
    "a_count",
    "b_count",
    "c_count",
    "d_count",
    "f_count",
]

<IPython.core.display.Javascript object>

In [29]:
# Stack the high school and middle school group rows, keeping only the
# columns needed for the totals.
fall_groups_concat_df = pd.concat(
    objs=[fall_groups_high_df, fall_groups_middle_df]
).loc[:, concat_columns]

<IPython.core.display.Javascript object>

Recode Filipino as Asian

In [30]:
# Fold the "filipino" group into "asian" so their counts are summed together
# by the groupby that follows. Series.replace does the exact-match recode
# without a per-row lambda, and bracket assignment avoids relying on
# attribute-style column access.
fall_groups_concat_df["group_name"] = fall_groups_concat_df["group_name"].replace(
    {"filipino": "asian"}
)

<IPython.core.display.Javascript object>

In [31]:
# Total the A-F tallies for every (type, group, semester) combination.
group_counts = (
    fall_groups_concat_df
    .groupby(by=["type", "group_name", "semester_id"])
    .sum()
)

<IPython.core.display.Javascript object>

In [32]:
group_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a_count,b_count,c_count,d_count,f_count
type,group_name,semester_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ethnicity,americanindian/alaskanative,2018-2019-fall,913,744,574,301,284.0
ethnicity,americanindian/alaskanative,2019-2020-fall,903,678,516,244,274.0
ethnicity,americanindian/alaskanative,2020-2021-fall,663,392,358,319,198.0
ethnicity,asian,2018-2019-fall,53527,22027,11002,4002,2880.0
ethnicity,asian,2019-2020-fall,52766,20988,10430,3673,2438.0


<IPython.core.display.Javascript object>

Calculate percentages

In [33]:
def calculate_percentages(df):
    """
    Convert each row of counts into fractions of that row's total.

    Every value is divided by the sum of its row, and "_count" in each
    column name is replaced with "_percent". A new dataframe is returned;
    the one passed in keeps its values and column names.
    """
    row_totals = df.sum(axis=1)
    shares = df.div(row_totals, axis=0)
    shares.columns = shares.columns.str.replace("_count", "_percent")
    return shares

<IPython.core.display.Javascript object>

In [34]:
# Each grade's share of the A-F marks within a semester (fractions, not 0-100).
semester_percentages = calculate_percentages(semester_counts)

<IPython.core.display.Javascript object>

In [35]:
semester_percentages

Unnamed: 0_level_0,a_percent,b_percent,c_percent,d_percent,f_percent
semester_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-2019-fall,0.339569,0.249153,0.207354,0.102094,0.101829
2019-2020-fall,0.346871,0.250232,0.205783,0.100747,0.096367
2020-2021-fall,0.360551,0.188124,0.179432,0.164338,0.107555


<IPython.core.display.Javascript object>

In [36]:
# Each grade's share of the A-F marks within a (type, group, semester) row.
group_percentages = calculate_percentages(group_counts)

<IPython.core.display.Javascript object>

In [37]:
group_percentages.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a_percent,b_percent,c_percent,d_percent,f_percent
type,group_name,semester_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ethnicity,americanindian/alaskanative,2018-2019-fall,0.324219,0.264205,0.203835,0.106889,0.100852
ethnicity,americanindian/alaskanative,2019-2020-fall,0.345315,0.259273,0.197323,0.093308,0.10478
ethnicity,americanindian/alaskanative,2020-2021-fall,0.343523,0.203109,0.185492,0.165285,0.102591
ethnicity,asian,2018-2019-fall,0.572861,0.235739,0.117747,0.042831,0.030823
ethnicity,asian,2019-2020-fall,0.584373,0.232438,0.11551,0.040678,0.027


<IPython.core.display.Javascript object>

## Visualize

In [38]:
# Reshape to long form, one row per semester and grade column, for charting.
semester_chart_df = (
    semester_percentages
    .reset_index()
    .melt(id_vars=["semester_id"], value_vars=semester_percentages.columns)
)

<IPython.core.display.Javascript object>

In [39]:
# Bar chart of the grade mix in each fall semester: one bar position per
# semester, colored by the melted grade column ("variable"), with the y axis
# formatted as a percentage.
alt.Chart(semester_chart_df, title='Fall semester grades').mark_bar().encode(
    x=alt.X('semester_id:N', title="Semester"),
    y=alt.Y('value:Q', title="Percentage", axis=alt.Axis(format='%')),
    color=alt.Color('variable:N', title="Grade"),
)

<IPython.core.display.Javascript object>

In [40]:
# Share of A-F marks that were a C or better. This adds the column to
# group_percentages itself rather than to a copy.
group_percentages["c_or_higher_percent"] = (
    group_percentages.loc[:, ["a_percent", "b_percent", "c_percent"]].sum(axis=1)
)

<IPython.core.display.Javascript object>

In [41]:
# Flatten the index back into columns and keep only what the chart needs.
group_chart_df = group_percentages.reset_index().loc[
    :, ["semester_id", "type", "group_name", "c_or_higher_percent"]
]

<IPython.core.display.Javascript object>

In [42]:
# Keep the ethnicity rows for the two fall semesters being compared.
group_chart_df = group_chart_df.loc[
    (group_chart_df["type"] == "ethnicity")
    & group_chart_df["semester_id"].isin(["2019-2020-fall", "2020-2021-fall"])
]

<IPython.core.display.Javascript object>

In [43]:
# Point chart of each group's C-or-higher share, one point per semester,
# colored by semester_id.
# NOTE(review): the legend is titled "Year" although the encoded field is
# semester_id — confirm the wording.
alt.Chart(group_chart_df, title="Change by group").transform_filter(
    # group_chart_df is already limited to type == 'ethnicity' by the cell
    # that builds it, so this filter acts as a second guard.
    alt.datum.type == 'ethnicity'
).mark_point().encode(
    alt.X(
        'c_or_higher_percent:Q',
        title="C or higher",
        # Do not force the axis to start at zero.
        scale=alt.Scale(zero=False),
        axis=alt.Axis(grid=False, format='%')
    ),
    alt.Y(
        'group_name:N',
        title="",
        # Order the groups by their x value, descending.
        sort='-x',
        axis=alt.Axis(grid=True)
    ),
    color=alt.Color('semester_id:N', legend=alt.Legend(title="Year")),
).properties(
    # Fixed 20px step per group row.
    height=alt.Step(20)
).configure_view(stroke="transparent")

<IPython.core.display.Javascript object>

## Export

In [44]:
# Put each semester's counts and percentages side by side, matched on semester_id.
semester_export = semester_counts.merge(
    right=semester_percentages,
    how="inner",
    on="semester_id",
)

<IPython.core.display.Javascript object>

In [45]:
# Write the per-semester counts and percentages; the index is written as the first column.
semester_export.to_csv("output/combined.csv")

<IPython.core.display.Javascript object>

In [46]:
# Write the table behind the group chart, without the index.
group_chart_df.to_csv("output/combined-detail.csv", index=False)

<IPython.core.display.Javascript object>