# Stage 2 - Preparing Visualization Dataset

## Setup

In [1]:
#Setup (Importing Libraries) 
import csv #to read in csv files
import pandas as pd #for data wrangling
import numpy as np  #for mathematical operations
import altair as alt #for data visualization

## CIP Course Data

In [2]:
#Importing and Saving Student Results for CIP Courses
#Note: this is where you will want to change the file name for the new CIP Student Results Dataset
cip_courses  = pd.read_csv("data/labelled_data/CIP_Data/cip_course_statistics_2017.csv")

#Reshaping (Spreading) - lettergrades are becoming columns
cip_courses = pd.pivot_table(cip_courses, index = ['DistrictCode','DistrictName','SchoolCode','SchoolName','term','cipcode','courseTitle', 'cs_course'], columns = 'letterGrade', values = 'count')

#Fill NA for Letter Grades to 0
cip_courses = cip_courses.fillna(0)

#Strip the extra space at the start and end of column names
cip_courses.columns = cip_courses.columns.str.rstrip()

#List of Column Names
cols = ['A', 'A-', 'B', 'B+', 'B-', 'C','C+', 'C-', 'CR', 'D', 'D+', 'E', 'F', 'N', 'NC', 'P', 'S', 'U', 'W']

#Convert Columns in List to Integer DataType
cip_courses[cols] = cip_courses[cols].applymap(np.int64)

#Adding Column Stating Total Students in Course
cip_courses['total_students'] = cip_courses['A'] + cip_courses['A-'] + cip_courses['B'] + cip_courses['B+']+ cip_courses['B-']+ cip_courses['C']+ cip_courses['C-']+ cip_courses['C+']+ cip_courses['CR']+ cip_courses['D']+ cip_courses['D+']+ cip_courses['E']+ cip_courses['F']+ cip_courses['N']+ cip_courses['NC']+ cip_courses['P']+ cip_courses['S']+ cip_courses['U']+ cip_courses['W']                

#Resetting Index of Dataframe
cip_courses = cip_courses.reset_index()

#Showing the head of the dataframe
cip_courses.head()

letterGrade,DistrictCode,DistrictName,SchoolCode,SchoolName,term,cipcode,courseTitle,cs_course,A,A-,...,D+,E,F,N,NC,P,S,U,W,total_students
0,1147,Othello School District,3015,Othello High School,SEM2,110801,DIGITAL DESIGN,no,27,5,...,0,0,3,0,0,0,0,0,0,60
1,1158,Lind School District,2903,Lind-Ritzville High School,SEM1,110103,TECHNOLOGY 1A,no,4,2,...,0,0,0,0,0,0,0,0,0,12
2,1158,Lind School District,2903,Lind-Ritzville High School,SEM1,110801,PHOTOGRAPHY,no,2,1,...,0,0,2,0,0,0,0,0,0,7
3,1158,Lind School District,2903,Lind-Ritzville High School,SEM2,110103,TECHNOLOGY 1B,no,3,0,...,0,0,3,0,0,0,0,0,0,10
4,1158,Lind School District,2903,Lind-Ritzville High School,SEM2,110801,PHOTOGRAPHY,no,1,1,...,0,0,0,0,0,0,0,0,0,5


# State Course Code Data

In [3]:
#Importing and Saving Student Results for State Courses
#Note: this is where you will want to change the file name for the new SCC Student Results Dataset
scc_courses = pd.read_csv("data/labelled_data/State_Course_Code_Data/state_course_code_statistics_2017.csv")

#Reshaping (Spreading) - lettergrades are becoming columns
scc_courses = pd.pivot_table(scc_courses, index = ['DistrictCode','DistrictName','SchoolCode','SchoolName','term','stateCourseCodeId','courseTitle', 'cs_course'], columns = 'letterGrade', values = 'count')

#Fill NA for Letter Grades to 0
scc_courses = scc_courses.fillna(0)

#Strip the extra space at the start and end of column names
scc_courses.columns = scc_courses.columns.str.rstrip()

#List of Column Names
cols = ['A', 'A-', 'B', 'B+', 'B-', 'C','C+', 'C-', 'CR', 'D', 'D+', 'E', 'F', 'N', 'NC', 'P', 'S', 'U', 'W']

#Convert Columns in List to Integer DataType
scc_courses[cols] = scc_courses[cols].applymap(np.int64)

#Adding Column Stating Total Students in Course
scc_courses['total_students'] = scc_courses['A'] + scc_courses['A-'] + scc_courses['B'] + scc_courses['B+']+ scc_courses['B-']+ scc_courses['C']+ scc_courses['C-']+ scc_courses['C+']+ scc_courses['CR']+ scc_courses['D']+ scc_courses['D+']+ scc_courses['E']+ scc_courses['F']+ scc_courses['N']+ scc_courses['NC']+ scc_courses['P']+ scc_courses['S']+ scc_courses['U']+ scc_courses['W']                

#Resetting Index of Dataframe
scc_courses = scc_courses.reset_index()

#Showing the head of the dataframe
scc_courses.head()

letterGrade,DistrictCode,DistrictName,SchoolCode,SchoolName,term,stateCourseCodeId,courseTitle,cs_course,A,A-,...,D+,E,F,N,NC,P,S,U,W,total_students
0,1109,Washtucna School District,3075,Washtucna Elementary/High School,SEM1,2309,10 ENGLISH,no,1,0,...,0,0,0,0,0,0,0,0,0,3
1,1109,Washtucna School District,3075,Washtucna Elementary/High School,SEM2,2309,10 ENGLISH,no,3,0,...,0,0,0,0,0,0,0,0,0,5
2,1147,Othello School District,3015,Othello High School,SEM1,2696,DIGITOOLS,no,4,1,...,2,0,3,0,0,0,0,0,0,28
3,1147,Othello School District,3015,Othello High School,SEM1,2696,DIGITOOLS C/D,no,11,1,...,1,0,1,0,0,0,0,0,2,25
4,1147,Othello School District,3015,Othello High School,SEM2,2696,DIGITOOLS,no,11,4,...,2,0,4,0,0,0,0,0,2,53


## Combining CIP Course Data and State Course Data

In [4]:
#Combining the State Course and CIP Course Data
all_courses = pd.concat([cip_courses, scc_courses], sort=False)

#Printing the head of the table
all_courses.describe()

Unnamed: 0,DistrictCode,SchoolCode,cipcode,A,A-,B,B+,B-,C,C+,...,E,F,N,NC,P,S,U,W,total_students,stateCourseCodeId
count,3874.0,3874.0,1269.0,3874.0,3874.0,3874.0,3874.0,3874.0,3874.0,3874.0,...,3874.0,3874.0,3874.0,3874.0,3874.0,3874.0,3874.0,3874.0,3874.0,2605.0
mean,21253.530718,3380.86603,127621.256107,10.060145,2.066598,2.647393,1.496128,1.310532,1.618998,0.926174,...,0.215023,1.660299,0.01652,0.251678,0.228962,0.035364,0.011874,0.564275,25.853123,2350.615739
std,10263.742969,1034.285193,75876.074537,14.525946,3.76528,4.210886,2.884042,2.515402,2.994859,1.864842,...,1.704771,4.491175,0.254544,3.201053,1.236119,0.554342,0.221171,1.591717,34.093754,761.470826
min,1109.0,1500.0,110103.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,795.0
25%,17001.0,2488.0,110201.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2494.0
50%,18402.0,3247.0,110701.0,6.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,2700.0
75%,31006.0,4128.0,110801.0,12.0,3.0,3.0,2.0,2.0,2.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,29.75,2713.0
max,39209.0,5961.0,470104.0,175.0,70.0,62.0,43.0,50.0,32.0,31.0,...,41.0,69.0,8.0,88.0,25.0,26.0,11.0,30.0,485.0,3055.0


In [5]:
all_courses_lite = all_courses.copy()
all_courses_lite = all_courses_lite[["DistrictCode","SchoolCode", "SchoolName", "courseTitle", "cs_course", "total_students"]]
all_courses_lite.head()

Unnamed: 0,DistrictCode,SchoolCode,SchoolName,courseTitle,cs_course,total_students
0,1147,3015,Othello High School,DIGITAL DESIGN,no,60
1,1158,2903,Lind-Ritzville High School,TECHNOLOGY 1A,no,12
2,1158,2903,Lind-Ritzville High School,PHOTOGRAPHY,no,7
3,1158,2903,Lind-Ritzville High School,TECHNOLOGY 1B,no,10
4,1158,2903,Lind-Ritzville High School,PHOTOGRAPHY,no,5


## Listing Schools on which we have course data

In [6]:
known_schools = all_courses_lite.copy()
known_schools = known_schools[["SchoolName", "DistrictCode", "SchoolCode"]]
known_schools = known_schools.drop_duplicates(['SchoolCode'])
known_schools = known_schools.reset_index()
known_schools = known_schools[["SchoolName", "DistrictCode", "SchoolCode"]]
known_schools.head()

Unnamed: 0,SchoolName,DistrictCode,SchoolCode
0,Othello High School,1147,3015
1,Lind-Ritzville High School,1158,2903
2,Ritzville High School,1160,2132
3,Asotin Jr Sr High,2420,2434
4,Mid-Columbia Parent Partnership,3017,1941


## Listing Schools which teach computer science

In [7]:
# Filtering to keep only Computer Science courses
cs_results = all_courses.loc[all_courses["cs_course"] == "yes"]

# Grouping by high school and summarizing for the count of computer science classes taught
cs_schools = cs_results.groupby(['SchoolCode','SchoolName']).agg({'cs_course': 'count', 'total_students': 'sum'})

# Adding Column to say School Teaches Computer Science
cs_schools["school_teaches_cs"] = "Teaches Computer Science"

# Resetting the Index after grouping by
cs_schools = cs_schools.reset_index()

# Renaming column to state total computer science courses taught in that year
cs_schools = cs_schools.rename(columns = {'cs_course': 'total_cs_courses', 'total_students': 'yearly_enrolled_in_cs'})

# Printing head of schools which teach computer science dataframe
cs_schools.head()

Unnamed: 0,SchoolCode,SchoolName,total_cs_courses,yearly_enrolled_in_cs,school_teaches_cs
0,1519,Edmonds eLearning Academy,1,1,Teaches Computer Science
1,1547,Middle College High School,4,58,Teaches Computer Science
2,1627,Yelm Extension School,2,3,Teaches Computer Science
3,1628,Dishman Hills High School,6,158,Teaches Computer Science
4,1640,Puyallup Online Academy/POA,2,6,Teaches Computer Science


## Listing Schoools which don't teach computer science

In [8]:
non_cs_schools = known_schools[~known_schools.SchoolCode.isin(cs_schools.SchoolCode)]

#Adding column to say it teaches 0 cs courses
non_cs_schools["total_cs_courses"] = 0

#Adding column to say it has 0 students enrolled in CS
non_cs_schools["yearly_enrolled_in_cs"] = 0

#Adding column to say that School does not teache CS
non_cs_schools["school_teaches_cs"] = "Doesn't Teach Computer Science"

#Resetting Index
non_cs_schools = non_cs_schools.reset_index()

#Selecting columns to keep
non_cs_schools = non_cs_schools[['SchoolName','SchoolCode', 'total_cs_courses', 'yearly_enrolled_in_cs', "school_teaches_cs"]]

#Printing schools which do not teach CS
non_cs_schools.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,SchoolName,SchoolCode,total_cs_courses,yearly_enrolled_in_cs,school_teaches_cs
0,Othello High School,3015,0,0,Doesn't Teach Computer Science
1,Lind-Ritzville High School,2903,0,0,Doesn't Teach Computer Science
2,Prosser High School,2508,0,0,Doesn't Teach Computer Science
3,Richland High School,3511,0,0,Doesn't Teach Computer Science
4,Entiat Middle and High School,3317,0,0,Doesn't Teach Computer Science


## Listing Statistics of Known Schools

In [9]:
#Adding CS and Non CS Schools Data Frames
known_schools_stats = pd.concat([cs_schools, non_cs_schools]) 

#Resetting Index
known_schools_stats = known_schools_stats.reset_index()

#Selecting columns to keep
known_schools_stats = known_schools_stats[['SchoolName','SchoolCode', 'total_cs_courses', 'yearly_enrolled_in_cs', "school_teaches_cs"]]

known_schools_stats.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


Unnamed: 0,SchoolName,SchoolCode,total_cs_courses,yearly_enrolled_in_cs,school_teaches_cs
0,Edmonds eLearning Academy,1519,1,1,Teaches Computer Science
1,Middle College High School,1547,4,58,Teaches Computer Science
2,Yelm Extension School,1627,2,3,Teaches Computer Science
3,Dishman Hills High School,1628,6,158,Teaches Computer Science
4,Puyallup Online Academy/POA,1640,2,6,Teaches Computer Science


## List of All High Schools in Washington State

In [10]:
high_schools = pd.read_csv("data/labelled_data/School_Data/High_Schools_WA_Information.csv")
high_schools.head()

Unnamed: 0,LEACode,LEAName,SchoolCode,SchoolName,LowestGrade,HighestGrade,PrincipalName,Email,Phone,OrgCategoryList,GradeCategory,City
0,38300,Colfax School District,3366,Colfax High School,7,12,David Gibb,david.gibb@csd300.com,509.830.2347,"Public School, Regular School",High School,Colfax
1,38301,Palouse School District,2634,Palouse High School,9,12,Mike Jones,mjones@garpal.net,509.878.1921,"Public School, Regular School",High School,Palouse
2,38306,Colton School District,2588,Colton School,PK,12,Tim Casey,tcasey@colton.k12.wa.us,509.229.3386,"Public School, Regular School",PK-12,Colton
3,38320,Rosalia School District,3204,Rosalia Elementary & Secondary School,PK,12,Matthew McLain,mmclain@rosaliaschools.org,509.523.3061,"Public School, Regular School",PK-12,Rosalia
4,38322,St. John School District,3068,St John/Endicott High,9,12,Mark Purvine,mpurvine@stjohn.wednet.edu,509.648.3336,"Public School, Regular School",High School,Saint John


## Listing Statistics of Known HIGH Schools

In [11]:
known_high_school_stats = known_schools_stats.copy()
known_high_school_stats = known_high_school_stats[known_high_school_stats.SchoolCode.isin(high_schools.SchoolCode)]
known_high_school_stats = known_high_school_stats.reset_index()
known_high_school_stats = known_high_school_stats[['SchoolName', 'SchoolCode', 'total_cs_courses',
       'yearly_enrolled_in_cs', 'school_teaches_cs']]
known_high_school_stats.head()

Unnamed: 0,SchoolName,SchoolCode,total_cs_courses,yearly_enrolled_in_cs,school_teaches_cs
0,Edmonds eLearning Academy,1519,1,1,Teaches Computer Science
1,Middle College High School,1547,4,58,Teaches Computer Science
2,Yelm Extension School,1627,2,3,Teaches Computer Science
3,Dishman Hills High School,1628,6,158,Teaches Computer Science
4,Puyallup Online Academy/POA,1640,2,6,Teaches Computer Science


## Listing High Schools we do not have statistics on

In [12]:
unknown_high_schools = high_schools.copy()
unknown_high_schools = unknown_high_schools[~unknown_high_schools.SchoolCode.isin(known_high_school_stats.SchoolCode)]
unknown_high_schools = unknown_high_schools[["SchoolCode", "SchoolName"]]
unknown_high_schools["total_cs_courses"] = np.nan
unknown_high_schools["yearly_enrolled_in_cs"] = np.nan
unknown_high_schools["school_teaches_cs"] = "No Information Available"
unknown_high_schools = unknown_high_schools.reset_index()
unknown_high_schools = unknown_high_schools[['SchoolCode', 'SchoolName', 'total_cs_courses','yearly_enrolled_in_cs', 'school_teaches_cs']]
unknown_high_schools

Unnamed: 0,SchoolCode,SchoolName,total_cs_courses,yearly_enrolled_in_cs,school_teaches_cs
0,3204,Rosalia Elementary & Secondary School,,,No Information Available
1,4040,West Valley Jr High,,,No Information Available
2,1910,Marysville SD Special,,,No Information Available
3,1904,Parent Partnership,,,No Information Available
4,1932,Columbia Virtual Academy,,,No Information Available
5,1934,Loowit High School,,,No Information Available
6,1926,Home Choice Academy,,,No Information Available
7,1925,Trojan Alternative School,,,No Information Available
8,3507,Structural Alt Confinement School,,,No Information Available
9,1900,Bridgeport Aurora High School,,,No Information Available


## Combining High Schools we have statistics on with High Schools we do not have statistics on

In [13]:
all_high_school_stats = pd.concat([known_high_school_stats, unknown_high_schools], sort=False)
all_high_school_stats = all_high_school_stats.reset_index()
all_high_school_stats = all_high_school_stats[['SchoolCode', 'SchoolName', 'total_cs_courses','yearly_enrolled_in_cs', 'school_teaches_cs']]
all_high_school_stats.head()

Unnamed: 0,SchoolCode,SchoolName,total_cs_courses,yearly_enrolled_in_cs,school_teaches_cs
0,1519,Edmonds eLearning Academy,1.0,1.0,Teaches Computer Science
1,1547,Middle College High School,4.0,58.0,Teaches Computer Science
2,1627,Yelm Extension School,2.0,3.0,Teaches Computer Science
3,1628,Dishman Hills High School,6.0,158.0,Teaches Computer Science
4,1640,Puyallup Online Academy/POA,2.0,6.0,Teaches Computer Science


## Combining High School Statistics with School Information

In [20]:
all_high_school_stats_and_info = pd.merge(all_high_school_stats, high_schools, how = 'outer', on = 'SchoolCode')

all_high_school_stats_and_info["SchoolName_x"] = all_high_school_stats_and_info["SchoolName_x"].str.title()
all_high_school_stats_and_info["SchoolName_y"] = all_high_school_stats_and_info["SchoolName_y"].str.title()
all_high_school_stats_and_info["PrincipalName"] = all_high_school_stats_and_info["PrincipalName"].str.title()
all_high_school_stats_and_info["Email"] = all_high_school_stats_and_info["Email"].str.lower()
all_high_school_stats_and_info["City"] = all_high_school_stats_and_info["City"].str.title()

all_high_school_stats_and_info.head()

all_high_school_stats_and_info.to_csv("All_High_School_Statistics_And_Information.csv")

## Listing Schools in Washington for which we have Geo-Data

In [28]:
wa_school_geo_data = pd.read_csv("data/labelled_data/School_Data/WA_K12_Schools_Geo_Data.csv")
wa_school_geo_data.head()

Unnamed: 0,X,Y,FID,SchoolCode,Latitude,Longitude,ESDCode,ESDName,LEACode,LEAName,...,City,State,ZipCode,PrincipalN,Email,Phone,OrgCategor,AYPCode,GradeCateg,OrgCateg_1
0,-119.195783,46.224367,2001,4007,46.224373,-119.195797,11801,Educational Service District 123,3017,Kennewick School District,...,KENNEWICK,Washington,99336-1300,Dennis Boatman,dennis.boatman@ksd.org,509.222.6522,Detention Center,J,Other,Public
1,-122.354845,47.211844,2002,5549,47.21185,-122.35486,OSPI,Office of Superintendent of Public Instruction,27901,Chief Leschi Tribal Compact,...,Puyallup,Washington,98371,Bruce Leonardy,bruce.leonardy@leschischools.org,253.445.6000,"Not Affiliated With District, Tribal School",Q,K-12,Tribal
2,-122.460763,45.593231,2003,5534,45.593237,-122.460777,06801,Educational Service District 112,6117,Camas School District,...,Camas,Washington,98607,Aaron J Smith,aaronj.smith@camas.wednet.edu,360-833-5780,"Affiliated With District, Public School",P,Middle School,Public
3,-117.558706,47.808964,2004,5417,47.80897,-117.55872,32801,Educational Service District 101,32325,Nine Mile Falls School District,...,Nine Mile Falls,Washington,99026,Willard B Osborn,bosborn@9mile.org,509.340.4200,"Public School, Re-Engagement School",R,High School,Public
4,-122.917265,46.994554,2005,5305,46.99456,-122.91728,OSPI,Office of Superintendent of Public Instruction,34801,Capital Region ESD 113,...,Tumwater,Washington,98512,Gerald Grubbs,ggrubbs@esd113.org,360.927.6232,"Public School, Re-Engagement School",R,High School,Public


## Merging GeoData to List of School Statistics and Information

In [31]:
wa_high_school_stats_info_with_geo = pd.merge(all_high_school_stats_and_info,wa_school_geo_data, how = 'left', on = 'SchoolCode')
#wa_high_school_stats_info_with_geo = wa_high_school_stats_info_with_geo.drop(columns=['SchoolName_x'])

wa_high_school_stats_info_with_geo.columns

Index(['SchoolCode', 'SchoolName_x', 'total_cs_courses',
       'yearly_enrolled_in_cs', 'school_teaches_cs', 'LEACode_x', 'LEAName_x',
       'SchoolName_y', 'LowestGrade', 'HighestGrade', 'PrincipalName',
       'Email_x', 'Phone_x', 'OrgCategoryList', 'GradeCategory', 'City_x', 'X',
       'Y', 'FID', 'Latitude', 'Longitude', 'ESDCode', 'ESDName', 'LEACode_y',
       'LEAName_y', 'SchoolName', 'LowestGrad', 'HighestGra', 'AddressLin',
       'AddressL_1', 'City_y', 'State', 'ZipCode', 'PrincipalN', 'Email_y',
       'Phone_y', 'OrgCategor', 'AYPCode', 'GradeCateg', 'OrgCateg_1'],
      dtype='object')

In [None]:
#wa_high_school_stats_info_with_geo = wa_high_school_stats_info_with_geo.rename(columns = {'SchoolCode': 'School_Code',
#                                                                                         'total_cs_courses':'Total_CS_Courses',
 #                                                                                        'yearly_enrolled_in_cs': 'Yearly_Enrolled_In_CS',
  #                                                                                       'school_teaches_cs': 'School_Teaches_CS'})


In [19]:
wa_high_school_stats_info_with_geo.to_csv("all_data_latest.csv")