Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NCES public schools data 2020-21 upload #45941

Merged
merged 5 commits into from
Apr 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
126 changes: 126 additions & 0 deletions bin/oneoff/nces_data/import_school_stats_by_years_2020_2021_ccd
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env ruby

require_relative '../../../dashboard/config/environment'

# Route CDO log output to standard output for this run.
CDO.log = Logger.new(STDOUT)

# School year covered by this import. It is interpolated into the S3 object
# paths and written as school_year on every imported row.
SURVEY_YEAR = '2020-2021'.freeze

# Passed as dry_run: to SchoolStatsByYear.merge_from_csv for each import in this script.
# NOTE(review): presumably nothing is persisted while this is true -- confirm against merge_from_csv.
DRY_RUN = true

# Translates the virtual-school status labels found in the CSV into the
# 'Yes'/'No' strings used for virtual_status. The '–' and '†' placeholder
# symbols map to nil.
VIRTUAL_SCHOOL_MAP = {
'Full Virtual' => 'Yes',
'Not Virtual' => 'No',
'Supplemental Virtual' => 'No',
'Virtual with face to face options' => 'Yes',
'–' => nil,
'†' => nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need these characters in the hashes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They're in the raw data as their own values. Do you have an example file, Clare?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! Here are the files in drive which explain the symbols. (I removed the extra rows at the top and bottom before uploading them to S3)

}.freeze

# Title I status label from the CSV => single-digit status code (the label's
# leading number). The '–' and '†' placeholder symbols map to nil.
TITLE_I_MAP = {
  '1-Title I targeted assistance eligible school-No program'          => '1',
  '2-Title I targeted assistance school'                              => '2',
  '3-Title I schoolwide eligible-Title I targeted assistance program' => '3',
  '4-Title I schoolwide eligible school-No program'                   => '4',
  '5-Title I schoolwide school'                                       => '5',
  '6-Not a Title I school'                                            => '6',
  '–'                                                                 => nil,
  '†'                                                                 => nil
}.freeze

# Two-digit LOCALE code from the locale CSV => community_type identifier.
# Unlisted codes look up as nil.
COMMUNITY_TYPE_MAP = {
  # 1x: city
  '11' => 'city_large',
  '12' => 'city_midsize',
  '13' => 'city_small',
  # 2x: suburban
  '21' => 'suburban_large',
  '22' => 'suburban_midsize',
  '23' => 'suburban_small',
  # 3x: town
  '31' => 'town_fringe',
  '32' => 'town_distant',
  '33' => 'town_remote',
  # 4x: rural
  '41' => 'rural_fringe',
  '42' => 'rural_distant',
  '43' => 'rural_remote'
}.freeze

# Grade label from the CSV => two-character grade code used for
# grades_offered_lo / grades_offered_hi.
# NOTE(review): the '–' and '†' placeholder symbols map to 'M' and 'N' here
# (presumably "missing" and "not applicable" -- confirm against the NCES docs),
# unlike the other maps in this script, which map them to nil.
# Frozen like the other lookup tables so it cannot be mutated by accident.
GRADES_MAP = {
  'Prekindergarten' => 'PK',
  'Kindergarten' => 'KG',
  '1st Grade' => '01',
  '2nd Grade' => '02',
  '3rd Grade' => '03',
  '4th Grade' => '04',
  '5th Grade' => '05',
  '6th Grade' => '06',
  '7th Grade' => '07',
  '8th Grade' => '08',
  '9th Grade' => '09',
  '10th Grade' => '10',
  '11th Grade' => '11',
  '12th Grade' => '12',
  '13th Grade' => '13',
  'Adult Education' => 'AE',
  'Ungraded' => 'UG',
  '–' => 'M',
  '†' => 'N'
}.freeze

# Removes every equals sign and double quotation mark from a raw CSV value,
# e.g. turning ="12345" into 12345.
#
# @param unsanitized [String, nil] the unsanitized string
# @return [String, nil] a copy of the string without '=' and '"' characters,
#   or nil when the input is nil
def sanitize_string_for_db(unsanitized)
  return nil if unsanitized.nil?

  unsanitized.delete('="')
end

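# Note: '–'.to_i and '†'.to_i both return 0, so placeholder cells become 0 in the
# .to_i count fields below (students_total maps 0 to nil instead).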

# Merge the per-school statistics from schools_public.csv for SURVEY_YEAR
# into SchoolStatsByYear. Each CSV row is turned into one attribute hash.
AWS::S3.process_file('cdo-nces', "#{SURVEY_YEAR}/ccd/schools_public.csv") do |filename|
  csv_options = {col_sep: ",", headers: true, quote_char: "\x00", encoding: 'UTF-8'}

  SchoolStatsByYear.merge_from_csv(filename, csv_options, dry_run: DRY_RUN) do |csv_row|
    # remove quote and eq sign from ="12345"
    row = csv_row.to_h.transform_values {|value| sanitize_string_for_db(value)}

    # true only when the cell is exactly '1-Yes'
    offered = ->(header) {row[header] == '1-Yes'}
    # integer value of the cell (String#to_i / nil.to_i semantics)
    count = ->(header) {row[header].to_i}

    # A blank or zero total is recorded as nil rather than 0.
    total_students = row['Total Students All Grades (Excludes AE) [Public School] 2020-21'].presence.try {|v| v.to_i}
    total_students = nil if total_students == 0

    {
      school_id: row['School ID - NCES Assigned [Public School] Latest available year'].to_i.to_s,
      school_year: SURVEY_YEAR,
      grades_offered_lo: GRADES_MAP[row['Lowest Grade Offered [Public School] 2020-21']],
      grades_offered_hi: GRADES_MAP[row['Highest Grade Offered [Public School] 2020-21']],
      grade_pk_offered: offered.call('Prekindergarten offered [Public School] 2020-21'),
      grade_kg_offered: offered.call('Kindergarten offered [Public School] 2020-21'),
      grade_01_offered: offered.call('Grade 1 offered [Public School] 2020-21'),
      grade_02_offered: offered.call('Grade 2 offered [Public School] 2020-21'),
      grade_03_offered: offered.call('Grade 3 offered [Public School] 2020-21'),
      grade_04_offered: offered.call('Grade 4 offered [Public School] 2020-21'),
      grade_05_offered: offered.call('Grade 5 offered [Public School] 2020-21'),
      grade_06_offered: offered.call('Grade 6 offered [Public School] 2020-21'),
      grade_07_offered: offered.call('Grade 7 offered [Public School] 2020-21'),
      grade_08_offered: offered.call('Grade 8 offered [Public School] 2020-21'),
      grade_09_offered: offered.call('Grade 9 offered [Public School] 2020-21'),
      grade_10_offered: offered.call('Grade 10 offered [Public School] 2020-21'),
      grade_11_offered: offered.call('Grade 11 offered [Public School] 2020-21'),
      grade_12_offered: offered.call('Grade 12 offered [Public School] 2020-21'),
      grade_13_offered: offered.call('Grade 13 offered [Public School] 2020-21'),

      virtual_status: VIRTUAL_SCHOOL_MAP[row['Virtual School Status (SY 2016-17 onward) [Public School] 2020-21']],
      students_total: total_students,
      student_am_count: count.call('American Indian/Alaska Native Students [Public School] 2020-21'),
      student_as_count: count.call('Asian or Asian/Pacific Islander Students [Public School] 2020-21'),
      student_hi_count: count.call('Hispanic Students [Public School] 2020-21'),
      student_bl_count: count.call('Black or African American Students [Public School] 2020-21'),
      student_wh_count: count.call('White Students [Public School] 2020-21'),
      student_hp_count: count.call('Nat. Hawaiian or Other Pacific Isl. Students [Public School] 2020-21'),
      student_tr_count: count.call('Two or More Races Students [Public School] 2020-21'),
      title_i_status: TITLE_I_MAP[row['Title I School Status [Public School] 2020-21']],
      frl_eligible_total: count.call('Free and Reduced Lunch Students [Public School] 2020-21')
    }
  end
end

# Merge the community type (derived from the LOCALE code) from
# locale_public.csv for SURVEY_YEAR into SchoolStatsByYear.
AWS::S3.process_file('cdo-nces', "#{SURVEY_YEAR}/ccd/locale_public.csv") do |filename|
  csv_options = {col_sep: ",", headers: true, quote_char: "\x00", encoding: 'UTF-8'}

  SchoolStatsByYear.merge_from_csv(filename, csv_options, dry_run: DRY_RUN) do |row|
    # to_i.to_s drops any leading zeros from the NCES school id
    nces_school_id = row['NCESSCH'].to_i.to_s
    locale_code = row['LOCALE']

    {
      school_id: nces_school_id,
      school_year: SURVEY_YEAR,
      community_type: COMMUNITY_TYPE_MAP[locale_code]
    }
  end
end
41 changes: 33 additions & 8 deletions dashboard/app/models/school.rb
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ def self.normalize_school_id(raw_school_id)

# School statuses representing currently open schools in 2018-2019 import.
# Non-open statuses are 'Closed', 'Future', 'Inactive'
OPEN_SCHOOL_STATUSES = ['Open', 'New', 'Reopened', 'Changed Boundary/Agency', 'Added']
OPEN_SCHOOL_STATUSES_2018_2019 = ['Open', 'New', 'Reopened', 'Changed Boundary/Agency', 'Added']

# School statuses representing currently open schools in 2019-2020 import.
# School statuses representing currently open schools in 2019-2020 and 2020-2021 import.
# Non-open statuses are '2-Closed', '7-Future', '6-Inactive'
OPEN_SCHOOL_STATUSES_2019_2020 = ['1-Open', '3-New', '8-Reopened', '5-Changed Boundary/Agency', '4-Added']
OPEN_SCHOOL_STATUSES = ['1-Open', '3-New', '8-Reopened', '5-Changed Boundary/Agency', '4-Added']

# School categories need to be mapped to existing values for 2019-2020 import.
SCHOOL_CATEGORY_MAP = {
Expand Down Expand Up @@ -161,10 +161,10 @@ def self.seed_all(options = {})

def self.seed_from_s3
# NCES school data has been built up in the DB over time by pulling in different
# data files. This seeding recreates the order in which they we incorporated.
# data files. This seeding recreates the order in which they were incorporated.
# NOTE: we are intentionally not populating the state_school_id based on the
# 2014-2015 preliminary or 2013-2014 public/charter data sets. Those files
# containt duplicate entries where some schools appear to be listed more than
# contain duplicate entries where some schools appear to be listed more than
# once but with different NCES ids. Since state_school_id needs to be unique
# the seeding would fail if we tried to set the state ids from those files.
# The 2014-2015 public/charter data does not have this issue so we do load the
Expand Down Expand Up @@ -340,7 +340,7 @@ def self.seed_from_s3
# New addition for this iteration -- a "school category",
# which is Regular, Special Education, Alternative, or Career and Technical
school_category: row['SCH_TYPE_TEXT'],
last_known_school_year_open: OPEN_SCHOOL_STATUSES.include?(row['UPDATED_STATUS_TEXT']) ? '2018-2019' : nil
last_known_school_year_open: OPEN_SCHOOL_STATUSES_2018_2019.include?(row['UPDATED_STATUS_TEXT']) ? '2018-2019' : nil
}
end
end
Expand Down Expand Up @@ -378,7 +378,32 @@ def self.seed_from_s3
school_district_id: row['Agency ID - NCES Assigned [Public School] Latest available year'].to_i,
state_school_id: row['State School ID [Public School] 2019-20'],
school_category: SCHOOL_CATEGORY_MAP[row['School Type [Public School] 2019-20']].presence,
last_known_school_year_open: OPEN_SCHOOL_STATUSES_2019_2020.include?(row['Updated Status [Public School] 2019-20']) ? '2019-2020' : nil
last_known_school_year_open: OPEN_SCHOOL_STATUSES.include?(row['Updated Status [Public School] 2019-20']) ? '2019-2020' : nil
}
end
end

# Some of this data has #- appended to the front, so we strip that off with .to_s.slice(2) (it's always a single digit)
# NOTE(review): no .to_s.slice(2) call is visible in this block; the prefixed status values
# appear to be matched as-is against OPEN_SCHOOL_STATUSES (e.g. '1-Open') -- confirm the
# comment above still applies here.
CDO.log.info "Seeding 2020-2021 public school data."
AWS::S3.seed_from_file('cdo-nces', "2020-2021/ccd/schools_public.csv") do |filename|
# NOTE(review): is_dry_run is hard-coded to true -- presumably no rows are written until
# this is changed; verify against merge_from_csv.
merge_from_csv(filename, {headers: true, quote_char: "\x00"}, true, is_dry_run: true, ignore_attributes: ['last_known_school_year_open']) do |row|
# Run every cell through sanitize_string_for_db before reading any column.
row = row.to_h.map {|k, v| [k, sanitize_string_for_db(v)]}.to_h
{
id: row['School ID - NCES Assigned [Public School] Latest available year'].to_i.to_s,
name: row['School Name'].upcase,
# Address lines 1 and 2 are truncated to 50 and 30 characters; blank values become nil via .presence.
address_line1: row['Location Address 1 [Public School] 2020-21'].to_s.upcase.truncate(50).presence,
address_line2: row['Location Address 2 [Public School] 2020-21'].to_s.upcase.truncate(30).presence,
address_line3: row['Location Address 3 [Public School] 2020-21'].to_s.upcase.presence,
city: row['Location City [Public School] 2020-21'].to_s.upcase.presence,
state: row['Location State Abbr [Public School] 2020-21'].to_s.strip.upcase.presence,
zip: row['Location ZIP [Public School] 2020-21'],
latitude: row['Latitude [Public School] 2020-21'].to_f,
longitude: row['Longitude [Public School] 2020-21'].to_f,
# Falls back to 'public' when the charter value has no entry in CHARTER_SCHOOL_MAP.
school_type: CHARTER_SCHOOL_MAP[row['Charter School [Public School] 2020-21'].to_s] || 'public',
school_district_id: row['Agency ID - NCES Assigned [Public School] Latest available year'].to_i,
state_school_id: row['State School ID [Public School] 2020-21'],
school_category: SCHOOL_CATEGORY_MAP[row['School Type [Public School] 2020-21']].presence,
# '2020-2021' only when the updated status is one of OPEN_SCHOOL_STATUSES; otherwise nil.
last_known_school_year_open: OPEN_SCHOOL_STATUSES.include?(row['Updated Status [Public School] 2020-21']) ? '2020-2021' : nil
}
end
end
Expand Down Expand Up @@ -524,7 +549,7 @@ def self.merge_from_csv(filename, options = CSV_IMPORT_OPTIONS, update_existing
if duplicate_schools.any?
summary_message <<
"Duplicate schools#{future_tense_dry_run} skipped:\n"\
"#{pretty_print_schools_list(duplicate_schools)}"
"#{pretty_print_school_list(duplicate_schools)}"
end
end

Expand Down
15 changes: 15 additions & 0 deletions dashboard/app/models/school_district.rb
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,21 @@ def self.seed_from_s3
}
end
end

# Seed the 2020-2021 school district data from the NCES CCD district file.
# NOTE(review): is_dry_run is hard-coded to true -- presumably no rows are written until
# this is changed; verify against merge_from_csv.
CDO.log.info "Seeding 2020-2021 school district data"
import_options_2021 = {col_sep: ",", headers: true, quote_char: "\x00"}
AWS::S3.seed_from_file('cdo-nces', "2020-2021/ccd/district.csv") do |filename|
  SchoolDistrict.merge_from_csv(filename, import_options_2021, true, is_dry_run: true, ignore_attributes: ['last_known_school_year_open']) do |row|
    {
      # Strip the double quotes and equals signs that wrap the raw value before converting to an integer.
      id: row['Agency ID - NCES Assigned [District] Latest available year'].tr('"=', '').to_i,
      name: row['Agency Name'].upcase,
      city: row['Location City [District] 2020-21'].to_s.upcase.presence,
      # to_s must run before strip: a missing (nil) state then becomes nil via
      # .presence instead of raising NoMethodError on nil.strip.
      state: row['Location State Abbr [District] 2020-21'].to_s.strip.upcase.presence,
      zip: row['Location ZIP [District] 2020-21'].tr('"=', ''),
      # '2020-2021' only when the updated status is one of OPEN_SCHOOL_STATUSES; otherwise nil.
      last_known_school_year_open: OPEN_SCHOOL_STATUSES.include?(row['Updated Status [District] 2020-21']) ? '2020-2021' : nil
    }
  end
end
end
end

Expand Down