In [1]:
# Seed Python's built-in `random` module for reproducibility.
# Note: only the stdlib generator is seeded here; NumPy's is not.
import random
random.seed(493)

# for manipulating dataframes
import pandas as pd
import numpy as np

# for statistical testing
from scipy import stats

# for working with timestamps
from datetime import datetime
from dateutil.parser import parse

# for visualizations
%matplotlib inline
import matplotlib.pyplot as plt

# Show the value of every top-level expression in a cell, not only the
# last one (several cells below rely on this to display multiple results).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits: None means no cap on columns, rows or cell width.
# Caution: with no row cap, displaying a large frame directly renders every
# row, so preview big frames with .head() instead.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Executive Summary

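> **TODO — replace with findings.** This notebook loads the district STAAR results, zero-pads district IDs to six characters, keeps the `all` grade / `all_subjects` / `all_students` rows for each proficiency level (approaches, meets, masters), computes a rate (`numerator / denominator * 100`), and writes one CSV per level.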

### PLANNING

- [X] Planning
- [X] Acquire data
- [ ] Prepare the data
- [ ] Explore the data
- [ ] Modeling

In [2]:
def show_missing(df):
    """
    Summarise missing-looking values for every column of ``df``.

    Returns a dataframe indexed by column name holding, per column, the
    count and the percentage of rows that are null (``isnull``), empty
    (exactly ``''`` or ``' '``), or the literal text ``'nan'`` / ``'NaN'``.
    """
    row_total = df.shape[0]

    def as_percent(counts):
        # Share of all rows on a 0-100 scale.
        return (counts / row_total) * 100

    nulls = df.isnull().sum()
    blanks = pd.Series(((df == ' ') | (df == '')).sum())
    nan_strings = pd.Series(((df == 'nan') | (df == 'NaN')).sum())

    summary = pd.DataFrame({
        'num_missing': nulls,
        'missing_percentage': as_percent(nulls),
        'num_empty': blanks,
        'empty_percentage': as_percent(blanks),
        'nan_count': nan_strings,
        'nan_percentage': as_percent(nan_strings),
    })
    return summary

In [3]:
def get_values(df, columns):
    """
    Print the value counts of each requested column of ``df``.

    For every column this prints the column name, a separator line, the
    result of ``value_counts(dropna=False)`` (so missing values get their
    own row) and a blank line. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : str or iterable of column labels
        Column(s) to report. A bare string is treated as one column name
        rather than being iterated character by character.
    """
    # A lone string would otherwise be looped over one character at a time
    # and fail with a KeyError on the first character.
    if isinstance(columns, str):
        columns = [columns]

    for column in columns:
        print(column)
        print('=====================================')
        print(df[column].value_counts(dropna=False))
        print('\n')

def show_values(df, param):
    """
    Print value counts for all columns of ``df`` or for a chosen subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    param : 'all', str, or iterable of column labels
        The string ``'all'`` selects every column of ``df``; any other
        string is taken as a single column name; anything else is passed
        on unchanged as the collection of columns to report.
    """
    # Test for the 'all' sentinel only on strings: comparing a pandas Index
    # or NumPy array with == gives an element-wise result whose truth value
    # is ambiguous, so a plain `if param == 'all'` raises ValueError there.
    if isinstance(param, str):
        columns = df.columns if param == 'all' else [param]
    else:
        columns = param
    get_values(df, columns)

In [4]:
def convert_datatype(df, column, totype):
    """
    Cast one column of ``df`` to ``totype`` and hand the frame back.

    The column is replaced on ``df`` itself (the frame passed in is
    modified) and that same frame object is returned.
    """
    recast = df[column].astype(totype)
    df[column] = recast
    return df

### ACQUIRE DATA

In [5]:
# Read the district STAAR csv file from the intermediate data folder
staar_df = pd.read_csv('../data/inter/01 - district_staar_df.csv')

In [6]:
# Preview the first rows to check the file loaded with the expected columns
staar_df.head()

Unnamed: 0,release_year,test_year,district,grade_level,subject,proficiency,demog,numerator,denominator
0,2013,2012,1902,3,mathematics,approaches,all_students,33,42
1,2013,2012,1902,3,mathematics,approaches,at_risk,13,18
2,2013,2012,1902,3,mathematics,approaches,economic_disadvant,15,19
3,2013,2012,1902,3,mathematics,approaches,female,17,20
4,2013,2012,1902,3,mathematics,approaches,male,16,22


### PREPARATION

In [7]:
# Number of rows and columns in the loaded frame
staar_df.shape

(3676553, 9)

In [8]:
# Column dtypes and memory footprint as loaded
staar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3676553 entries, 0 to 3676552
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   release_year  int64 
 1   test_year     int64 
 2   district      int64 
 3   grade_level   object
 4   subject       object
 5   proficiency   object
 6   demog         object
 7   numerator     int64 
 8   denominator   int64 
dtypes: int64(5), object(4)
memory usage: 252.4+ MB


In [9]:
# district is read as an integer; cast it to string so it can be zero-padded
staar_df = convert_datatype(staar_df, 'district', 'str')

In [10]:
# Confirm district is now stored as object (string) rather than int64
staar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3676553 entries, 0 to 3676552
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   release_year  int64 
 1   test_year     int64 
 2   district      object
 3   grade_level   object
 4   subject       object
 5   proficiency   object
 6   demog         object
 7   numerator     int64 
 8   denominator   int64 
dtypes: int64(4), object(5)
memory usage: 252.4+ MB


In [11]:
# Look at the district values before padding
staar_df.head()

Unnamed: 0,release_year,test_year,district,grade_level,subject,proficiency,demog,numerator,denominator
0,2013,2012,1902,3,mathematics,approaches,all_students,33,42
1,2013,2012,1902,3,mathematics,approaches,at_risk,13,18
2,2013,2012,1902,3,mathematics,approaches,economic_disadvant,15,19
3,2013,2012,1902,3,mathematics,approaches,female,17,20
4,2013,2012,1902,3,mathematics,approaches,male,16,22


In [12]:
# Left-pad district codes with zeros to a fixed width of six characters
staar_df['district'] = staar_df['district'].str.zfill(6)

In [13]:
# Check the district codes after padding
staar_df.head()

Unnamed: 0,release_year,test_year,district,grade_level,subject,proficiency,demog,numerator,denominator
0,2013,2012,1902,3,mathematics,approaches,all_students,33,42
1,2013,2012,1902,3,mathematics,approaches,at_risk,13,18
2,2013,2012,1902,3,mathematics,approaches,economic_disadvant,15,19
3,2013,2012,1902,3,mathematics,approaches,female,17,20
4,2013,2012,1902,3,mathematics,approaches,male,16,22


In [14]:
# Inspect the values present in each descriptive column.
# NOTE(review): the subject counts list both roman- and arabic-numeral labels
# (algebra_i/algebra_1, english_i/english_1, english_ii/english_2) — presumably
# the same subjects under two spellings; confirm and normalise before any
# subject-level analysis.
show_values(staar_df, ['grade_level', 'subject', 'proficiency', 'demog'])

grade_level
all    1217199
eoc     517288
8       445198
5       356597
4       341308
7       337260
3       233670
6       228033
Name: grade_level, dtype: int64


subject
reading           958668
mathematics       876118
science           420222
writing           386499
social_studies    287094
all_subjects      230664
biology           109623
us_history        103351
algebra_i          81727
english_i          72967
english_ii         69679
algebra_1          31953
english_1          24430
english_2          23558
Name: subject, dtype: int64


proficiency
approaches    1860720
meets          974609
masters        841224
Name: proficiency, dtype: int64


demog
all_students          435924
female                399268
male                  398798
economic_disadvant    387014
white                 337655
at_risk               327547
hispanic              303941
special_ed            196428
contin_enrollee       177447
mobile                164811
african_american      132293
el       

### EXPLORATION

In [15]:
# Reminder of the frame's layout before filtering
staar_df.head()

Unnamed: 0,release_year,test_year,district,grade_level,subject,proficiency,demog,numerator,denominator
0,2013,2012,1902,3,mathematics,approaches,all_students,33,42
1,2013,2012,1902,3,mathematics,approaches,at_risk,13,18
2,2013,2012,1902,3,mathematics,approaches,economic_disadvant,15,19
3,2013,2012,1902,3,mathematics,approaches,female,17,20
4,2013,2012,1902,3,mathematics,approaches,male,16,22


In [16]:
# Category counts again, as a reference for choosing the filter values below
# (same call as in the preparation section)
show_values(staar_df, ['grade_level', 'subject', 'proficiency', 'demog'])

grade_level
all    1217199
eoc     517288
8       445198
5       356597
4       341308
7       337260
3       233670
6       228033
Name: grade_level, dtype: int64


subject
reading           958668
mathematics       876118
science           420222
writing           386499
social_studies    287094
all_subjects      230664
biology           109623
us_history        103351
algebra_i          81727
english_i          72967
english_ii         69679
algebra_1          31953
english_1          24430
english_2          23558
Name: subject, dtype: int64


proficiency
approaches    1860720
meets          974609
masters        841224
Name: proficiency, dtype: int64


demog
all_students          435924
female                399268
male                  398798
economic_disadvant    387014
white                 337655
at_risk               327547
hispanic              303941
special_ed            196428
contin_enrollee       177447
mobile                164811
african_american      132293
el       

In [17]:
# Rows carrying the overall totals: grade_level 'all', subject 'all_subjects'
# and demog 'all_students'. Built once and shared by the three filters below
# instead of repeating the same three comparisons for every proficiency level.
summary_rows = (
    (staar_df['grade_level'] == 'all')
    & (staar_df['subject'] == 'all_subjects')
    & (staar_df['demog'] == 'all_students')
)

# One frame per proficiency level, each restricted to the summary rows.
approaches_df = staar_df.loc[summary_rows & (staar_df['proficiency'] == 'approaches')]
meets_df = staar_df.loc[summary_rows & (staar_df['proficiency'] == 'meets')]
masters_df = staar_df.loc[summary_rows & (staar_df['proficiency'] == 'masters')]


In [18]:
# Each of these four columns holds a single value per frame after the
# filtering step, so remove them from all three frames.
approaches_df, meets_df, masters_df = (
    level_df.drop(columns=['grade_level', 'subject', 'proficiency', 'demog'])
    for level_df in (approaches_df, meets_df, masters_df)
)

In [19]:
# Row/column counts of the three filtered frames (all three are displayed)
approaches_df.shape
meets_df.shape
masters_df.shape

(9641, 5)

(4780, 5)

(7126, 5)

In [20]:
# Rate as a percentage: numerator divided by denominator, times 100.
approaches_df['rate'] = approaches_df['numerator'].div(approaches_df['denominator']).mul(100)
meets_df['rate'] = meets_df['numerator'].div(meets_df['denominator']).mul(100)
masters_df['rate'] = masters_df['numerator'].div(masters_df['denominator']).mul(100)

# Preview each frame with its new column (all three are displayed).
approaches_df.head()
meets_df.head()
masters_df.head()

Unnamed: 0,release_year,test_year,district,numerator,denominator,rate
99,2013,2012,1902,1030,1219,84.495488
372,2013,2012,1903,1796,2279,78.806494
649,2013,2012,1904,987,1243,79.404666
887,2013,2012,1906,649,762,85.170604
1186,2013,2012,1907,3817,5396,70.737583


Unnamed: 0,release_year,test_year,district,numerator,denominator,rate
120,2013,2012,1902,471,1219,38.638228
394,2013,2012,1903,697,2279,30.583589
669,2013,2012,1904,365,1243,29.364441
908,2013,2012,1906,322,762,42.257218
1208,2013,2012,1907,1212,5396,22.461082


Unnamed: 0,release_year,test_year,district,numerator,denominator,rate
110,2013,2012,1902,144,1219,11.812961
383,2013,2012,1903,203,2279,8.907416
660,2013,2012,1904,119,1243,9.573612
898,2013,2012,1906,105,762,13.779528
1197,2013,2012,1907,341,5396,6.319496


In [21]:
# Write each proficiency-level frame to its own CSV under ../data/inter/;
# index=False leaves the row index out of the file.
approaches_df.to_csv('../data/inter/staar_performance_districts_approaches.csv', index=False)
meets_df.to_csv('../data/inter/staar_performance_districts_meets.csv', index=False)
masters_df.to_csv('../data/inter/staar_performance_districts_masters.csv', index=False)