In [1]:
#set up
import requests
import re
import csv
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Use Beautiful Soup to extract all state directories listed on the
# 2020 Census - School District Reference Map website.
census_state_url = 'https://www2.census.gov/geo/maps/DC2020/PL20/'
census_state_response = requests.get(census_state_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
census_state_response.raise_for_status()
census_state_html_text = census_state_response.text
soup = BeautifulSoup(census_state_html_text, 'html.parser')

# State directory links look like "st<digits>_<two characters>/".
# Raw string so "\d" is a regex token rather than an invalid Python string
# escape; compiled once instead of on every loop iteration.
pattern = re.compile(r"^st\d+_.{2}/$")
states_code = [
    href[:-1]  # drop the trailing "/"
    for href in (link.get('href') for link in soup.find_all('a'))
    if href is not None and pattern.search(href)
]

In [4]:
# Build the list of upper-cased state abbreviations: for each of the first 51
# scraped directory names, take the part after the underscore.
states_list = [state_dir.split('_')[1].upper() for state_dir in states_code[:51]]

In [6]:
# Import every state's school-district rating CSV (niche/<STATE>.csv) into one
# combined frame called df_grading_original, tagged with the state abbreviation.
grading_columns = ['State', 'MuiTypography-root 2', 'MuiTypography-root 4', 'niche__grade']
state_frames = []
for each_state in states_list:
    print("working on " + each_state)
    df = pd.read_csv('niche/' + each_state + '.csv')
    # assign() builds a new frame, so no column is written onto a slice of the
    # CSV frame; the final selection fixes the column order.
    df = df[grading_columns[1:]].assign(State=each_state)[grading_columns]
    state_frames.append(df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously loaded rows on every iteration.
if state_frames:
    df_grading_original = pd.concat(state_frames, ignore_index=True)
else:
    # No states to load: keep the same empty schema the loop would have produced.
    df_grading_original = pd.DataFrame(columns=grading_columns)

working on AL
working on AK
working on AZ
working on AR
working on CA
working on CO
working on CT
working on DE
working on DC
working on FL
working on GA
working on HI
working on ID
working on IL
working on IN
working on IA
working on KS
working on KY
working on LA
working on ME
working on MD
working on MA
working on MI
working on MN
working on MS
working on MO
working on MT
working on NE
working on NV
working on NH
working on NJ
working on NM
working on NY
working on NC
working on ND
working on OH
working on OK
working on OR
working on PA
working on RI
working on SC
working on SD
working on TN
working on TX
working on UT
working on VT
working on VA
working on WA
working on WV
working on WI
working on WY


In [7]:
# Clean-up: keep only rows typed as 'School District' whose grade is not
# 'unavailable', keep the three needed columns, and give them readable names.
is_school_district = df_grading_original['MuiTypography-root 4'] == 'School District'
has_grade = df_grading_original['niche__grade'] != 'unavailable'
df_grading_original = (
    df_grading_original
    .loc[is_school_district & has_grade, ['State', 'MuiTypography-root 2', 'niche__grade']]
    .rename(columns={"MuiTypography-root 2": "School_district", "niche__grade": "Niche_grade"})
)
df_grading_original

Unnamed: 0,State,School_district,Niche_grade
0,AL,Madison City School District,A+
1,AL,Homewood City School District,A+
2,AL,Auburn City School District,A+
4,AL,Vestavia Hills City School District,A+
5,AL,Hoover City School District,A+
...,...,...,...
13808,WY,Sweetwater County School District 1,C
13809,WY,Niobrara County School District 1,C minus
13810,WY,Fremont County School District 21,C minus
13811,WY,Fremont County School District 38,C minus


In [8]:
# Convert letter grade to score, step 1/3: collect the distinct grade labels
# that actually occur so the mapping in the next step can cover all of them.
grade_list = df_grading_original.Niche_grade.values.tolist()
grade_set = set(grade_list)
grade_set

{'A',
 'A minus',
 'A+',
 'B',
 'B minus',
 'B+',
 'C',
 'C minus',
 'C+',
 'D',
 'D minus',
 'D+'}

In [9]:
# Convert letter grade to score, step 2/3: each grade maps to the midpoint of
# its (low, high) score band; 'D minus' is pinned to a single value.
# NOTE(review): the 'D' band (65-66) and 'D minus' (65) are narrower than the
# other bands - confirm this is the intended scale.
grade_bounds = {
    'A+': (97, 100),
    'A': (93, 96),
    'A minus': (90, 92),
    'B+': (87, 89),
    'B': (83, 86),
    'B minus': (80, 82),
    'C+': (77, 79),
    'C': (73, 76),
    'C minus': (70, 72),
    'D+': (67, 69),
    'D': (65, 66),
}
mapping = {grade: (low + high) / 2 for grade, (low, high) in grade_bounds.items()}
mapping['D minus'] = 65

In [10]:
# Convert letter grade to score, step 3/3: add the numeric score column.
# Series.map looks every grade up in `mapping` and produces a float column
# directly. A grade missing from `mapping` becomes NaN instead of being left
# behind as a string in the score column, and the result does not depend on
# DataFrame.replace downcasting an object column to float.
df_grading_original = df_grading_original.assign(
    Niche_grade_coverted_to_score=df_grading_original.Niche_grade.map(mapping)
)
df_grading_original

Unnamed: 0,State,School_district,Niche_grade,Niche_grade_coverted_to_score
0,AL,Madison City School District,A+,98.5
1,AL,Homewood City School District,A+,98.5
2,AL,Auburn City School District,A+,98.5
4,AL,Vestavia Hills City School District,A+,98.5
5,AL,Hoover City School District,A+,98.5
...,...,...,...,...
13808,WY,Sweetwater County School District 1,C,74.5
13809,WY,Niobrara County School District 1,C minus,71.0
13810,WY,Fremont County School District 21,C minus,71.0
13811,WY,Fremont County School District 38,C minus,71.0


In [11]:
# Take a working copy of df_grading_original as df_grading (so the cleaning
# below leaves the original frame untouched), and load the school-district
# table from final_school_district.csv as df_state_county.
df_grading = df_grading_original.copy()
df_state_county = pd.read_csv('final_school_district.csv')

In [12]:
# Clean the two frames for all states.
# df_state_county: drop a trailing "<digits>-<digit>" token, then turn every
# hyphen into a space.
df_state_county['School_district'] = (
    df_state_county['School_district']
    .str.replace(r'\s+\d+-\d$', '', regex=True)
    .str.replace('-', ' ')
)

# df_grading: only the hyphen-to-space step is applied.
df_grading['School_district'] = df_grading['School_district'].str.replace('-', ' ')

In [13]:
# Clean data for KS: strip the same descriptive suffixes from the district
# names in both frames so the two sources can be matched on the bare name.
# Order matters: longer suffixes come before their own prefixes. In particular
# ' Public Schools' must be removed before ' Public School'; the reverse order
# turns "X Public Schools" into "Xs".
ks_suffixes = (
    ' Schools Unified School District',
    ' School Unified School District',
    ' Unified School District',
    ' Public Schools',
    ' Public School',
    ' Public',
    ' Schools',
    ' School',
)

ks_county_rows = df_state_county.State == 'KS'
# df_state_county names additionally lose a trailing run of digits first.
ks_county_names = df_state_county.loc[ks_county_rows, 'School_district'].str.replace(r'\s+\d+$', '', regex=True)

ks_grading_rows = df_grading.State == 'KS'
ks_grading_names = df_grading.loc[ks_grading_rows, 'School_district']

for suffix in ks_suffixes:
    # regex=False: the suffixes are literal text, independent of pandas' default.
    ks_county_names = ks_county_names.str.replace(suffix, '', regex=False)
    ks_grading_names = ks_grading_names.str.replace(suffix, '', regex=False)

df_state_county.loc[ks_county_rows, 'School_district'] = ks_county_names
df_grading.loc[ks_grading_rows, 'School_district'] = ks_grading_names

In [14]:
# Clean data for DC: every DC row in df_grading gets the same fixed
# district name.
dc_grading_rows = df_grading.State == 'DC'
df_grading.loc[dc_grading_rows, 'School_district'] = 'District of Columbia Public Schools'

In [15]:
# Clean data for GA: reduce district names in both frames to a bare name by
# applying literal (old, new) substitutions in order.
ga_county_rows = df_state_county.State == 'GA'
ga_county_names = df_state_county.loc[ga_county_rows, 'School_district']
for old, new in (
    (' County School District', ''),
    (' School District', ''),
    (' County Schools', ''),
    (' County', ''),
    (' Schools', ''),
    ('Atlanta City', 'Atlanta'),
    ('Decatur City', 'City of Decatur'),
    ('DeKalb', 'Dekalb'),
):
    ga_county_names = ga_county_names.str.replace(old, new, regex=False)
df_state_county.loc[ga_county_rows, 'School_district'] = ga_county_names

ga_grading_rows = df_grading.State == 'GA'
ga_grading_names = df_grading.loc[ga_grading_rows, 'School_district']
for suffix in (
    ' Public Schools',
    ' County School System',
    # Was misspelled ' County School Districtt', which could never match.
    ' County School District',
    ' County Public Schools',
    ' School District',
    ' County Schools',
    ' County',
    ' Schools',
    ' Public',
    ' School System',
):
    ga_grading_names = ga_grading_names.str.replace(suffix, '', regex=False)
df_grading.loc[ga_grading_rows, 'School_district'] = ga_grading_names

In [16]:
# Clean data for KY: apply the substitutions below, in order, to the KY rows
# of each frame.
ky_county_rows = df_state_county.State == 'KY'
ky_county_names = df_state_county.loc[ky_county_rows, 'School_district']
for suffix in (' School District', ' Schools District'):
    ky_county_names = ky_county_names.str.replace(suffix, '')
df_state_county.loc[ky_county_rows, 'School_district'] = ky_county_names

ky_grading_rows = df_grading.State == 'KY'
ky_grading_names = df_grading.loc[ky_grading_rows, 'School_district']
# NOTE(review): ' in' is matched as a substring anywhere in the name, not only
# as a trailing word - confirm that is the intended behaviour.
for old, new in ((' Public Schools', ''), (' Schools', ''), (' in', ' Independent')):
    ky_grading_names = ky_grading_names.str.replace(old, new)
df_grading.loc[ky_grading_rows, 'School_district'] = ky_grading_names

In [17]:
# Clean data for LA: drop the district suffixes and align a few spellings,
# applying the (old, new) pairs in order to the LA rows of each frame.
la_county_rows = df_state_county.State == 'LA'
la_county_names = df_state_county.loc[la_county_rows, 'School_district']
for old, new in (
    (' Schools District', ''),
    (' School District', ''),
    ('De Soto', 'DeSoto'),
    ('La Salle', 'Lasalle'),
):
    la_county_names = la_county_names.str.replace(old, new)
df_state_county.loc[la_county_rows, 'School_district'] = la_county_names

la_grading_rows = df_grading.State == 'LA'
la_grading_names = df_grading.loc[la_grading_rows, 'School_district']
for old, new in (
    (' Public Schools', ''),
    (' Public School', ''),
    ('City of Baker School District', 'Baker City'),
):
    la_grading_names = la_grading_names.str.replace(old, new)
df_grading.loc[la_grading_rows, 'School_district'] = la_grading_names

In [18]:
# Clean data for WV: strip the district suffixes from both frames and append
# "County" to Monongalia in df_state_county.
wv_county_rows = df_state_county.State == 'WV'
wv_county_names = df_state_county.loc[wv_county_rows, 'School_district']
for suffix in (' Schools District', ' School District'):
    wv_county_names = wv_county_names.str.replace(suffix, '', regex=False)
# The negative lookahead skips names that already read "Monongalia County", so
# re-running this cell does not produce "Monongalia County County".
wv_county_names = wv_county_names.str.replace(r'Monongalia(?! County)', 'Monongalia County', regex=True)
df_state_county.loc[wv_county_rows, 'School_district'] = wv_county_names

wv_grading_rows = df_grading.State == 'WV'
wv_grading_names = df_grading.loc[wv_grading_rows, 'School_district']
for suffix in (' Schools', ' School'):
    wv_grading_names = wv_grading_names.str.replace(suffix, '', regex=False)
df_grading.loc[wv_grading_rows, 'School_district'] = wv_grading_names

In [19]:
# Clean data for ID: df_state_county names lose a trailing run of digits and
# the word ' Joint'; df_grading names lose ' Joint' only.
id_county_rows = df_state_county.State == 'ID'
id_county_names = df_state_county.loc[id_county_rows, 'School_district']
id_county_names = id_county_names.str.replace(r'\s+\d+$', '', regex=True)
df_state_county.loc[id_county_rows, 'School_district'] = id_county_names.str.replace(' Joint', '')

id_grading_rows = df_grading.State == 'ID'
id_grading_names = df_grading.loc[id_grading_rows, 'School_district']
df_grading.loc[id_grading_rows, 'School_district'] = id_grading_names.str.replace(' Joint', '')

In [20]:
# Clean data for ND: both frames first lose a trailing run of digits, then the
# literal substitutions below are applied in order.
nd_county_rows = df_state_county.State == 'ND'
nd_county_names = df_state_county.loc[nd_county_rows, 'School_district'].str.replace(r'\s+\d+$', '', regex=True)
for suffix in (' School District', ' Public Schools', ' Public School', ' Public', ' AFB'):
    nd_county_names = nd_county_names.str.replace(suffix, '')
df_state_county.loc[nd_county_rows, 'School_district'] = nd_county_names

nd_grading_rows = df_grading.State == 'ND'
nd_grading_names = df_grading.loc[nd_grading_rows, 'School_district'].str.replace(r'\s+\d+$', '', regex=True)
for old, new in (
    ('/', ' '),
    (' Public Schools', ''),
    (' Public School', ''),
    (' Public', ''),
    (' School District', ''),
    (' District', ''),
):
    nd_grading_names = nd_grading_names.str.replace(old, new)
df_grading.loc[nd_grading_rows, 'School_district'] = nd_grading_names


In [21]:
# Clean data for OR: strip trailing district codes from both frames and
# rename individual df_state_county districts to the spelling used in df_grading.
or_county_rows = df_state_county.State == 'OR'
or_county_names = df_state_county.loc[or_county_rows, 'School_district']

# Trailing codes: first "<digits><word chars>", then plain "<digits>".
for trailing_code in (r'\s+\d+\w+$', r'\s+\d+$'):
    or_county_names = or_county_names.str.replace(trailing_code, '', regex=True)

or_county_names = or_county_names.str.replace('Bend La Pine Administrative', 'Bend LaPine', regex=False)

# These renames append text to the name they match. The negative lookahead
# skips names that already carry the appended text, so re-running the cell does
# not append it a second time (e.g. "Sherman County County").
for stem_pattern, full_name in (
    (r'West Linn(?! Wilsonville)', 'West Linn Wilsonville'),
    (r'Three Rivers(?!/Josephine County)', 'Three Rivers/Josephine County'),
    (r'North Wasco(?! County)', 'North Wasco County'),
    (r'Milton Freewater(?! Unified)', 'Milton Freewater Unified'),
    (r'Sherman(?! County)', 'Sherman County'),
):
    or_county_names = or_county_names.str.replace(stem_pattern, full_name, regex=True)

# Plain one-to-one renames (literal text).
for old, new in (
    ('Klamath Falls City School Districts', 'Klamath Falls City School District'),
    ('Crow Applegate Lorane Sd', 'Crow Applegate Lorane School District'),
    ('Greater Albany School District', 'Greater Albany Public School District'),
    ('Portland School District', 'Portland Public Schools'),
):
    or_county_names = or_county_names.str.replace(old, new, regex=False)

df_state_county.loc[or_county_rows, 'School_district'] = or_county_names

# df_grading only loses a trailing run of digits.
or_grading_rows = df_grading.State == 'OR'
df_grading.loc[or_grading_rows, 'School_district'] = df_grading.loc[or_grading_rows, 'School_district'].str.replace(r'\s+\d+$', '', regex=True)


In [22]:
# Clean data for MA: remove the listed words from the MA rows of each frame,
# in the order given.
ma_county_rows = df_state_county.State == 'MA'
ma_county_names = df_state_county.loc[ma_county_rows, 'School_district']
for fragment in (' Schools District', ' School District', ' Regional'):
    ma_county_names = ma_county_names.str.replace(fragment, '')
df_state_county.loc[ma_county_rows, 'School_district'] = ma_county_names

ma_grading_rows = df_grading.State == 'MA'
ma_grading_names = df_grading.loc[ma_grading_rows, 'School_district']
for fragment in (
    ' Public Schools',
    ' Public School',
    ' Schools District',
    ' School District',
    ' Regional',
    'Public Schools of ',
):
    ma_grading_names = ma_grading_names.str.replace(fragment, '')
df_grading.loc[ma_grading_rows, 'School_district'] = ma_grading_names

In [23]:
# Clean data for MT: remove the listed fragments from the MT rows of each
# frame, in order. For df_state_county the double-space "K 12" variants are
# handled before the single-space ones.
mt_county_rows = df_state_county.State == 'MT'
mt_county_names = df_state_county.loc[mt_county_rows, 'School_district']
for fragment in ('  K 12 Schools', '  K 12', ' K 12 Schools', ' K 12', ' School'):
    mt_county_names = mt_county_names.str.replace(fragment, '')
df_state_county.loc[mt_county_rows, 'School_district'] = mt_county_names

mt_grading_rows = df_grading.State == 'MT'
mt_grading_names = df_grading.loc[mt_grading_rows, 'School_district']
for fragment in (' K 12', ' Public Schools', ' High School District', ' School District', ' Schools'):
    mt_grading_names = mt_grading_names.str.replace(fragment, '')
df_grading.loc[mt_grading_rows, 'School_district'] = mt_grading_names


In [24]:
# Clean data for AZ: strip district suffixes from both frames and align a few
# df_state_county spellings.
az_county_rows = df_state_county.State == 'AZ'
az_county_names = df_state_county.loc[az_county_rows, 'School_district']
for old, new in (
    (' Unified District', ''),
    (' Unified School District', ''),
    ('Fort Huachuca Accommodation District', 'Fort Huachuca Accommodation School District'),
    ('J. O. Combs', 'J.O. Combs'),
    ('Sedona Oak Creek Joint', 'Sedona Oak Creek'),
):
    # regex=False: these are literal names. Without it, pandas versions whose
    # default is regex=True read the dots in 'J. O. Combs' as wildcards and
    # emit a FutureWarning.
    az_county_names = az_county_names.str.replace(old, new, regex=False)
df_state_county.loc[az_county_rows, 'School_district'] = az_county_names

az_grading_rows = df_grading.State == 'AZ'
# df_grading names lose a trailing run of digits before the suffixes go.
az_grading_names = df_grading.loc[az_grading_rows, 'School_district'].str.replace(r'\s+\d+$', '', regex=True)
for suffix in (' Unified School District', ' Public Schools'):
    az_grading_names = az_grading_names.str.replace(suffix, '', regex=False)
df_grading.loc[az_grading_rows, 'School_district'] = az_grading_names


  df_state_county.loc[df_state_county.State == 'AZ', 'School_district'] = df_state_county.loc[df_state_county.State == 'AZ', 'School_district'].str.replace('J. O. Combs', 'J.O. Combs')


In [25]:
# Clean data for VT: strip organisational words from the district names in
# both frames, then rename Caledonia in df_grading.
vt_county_rows = df_state_county.State == 'VT'
vt_county_names = df_state_county.loc[vt_county_rows, 'School_district']
for fragment in (
    ' Educational Community Unified School District',
    ' Supervisory Union',
    ' Unified School District',
    ' Union Elementary',
    ' Union',
    ' Supervisory District',
    ' Valley',
):
    vt_county_names = vt_county_names.str.replace(fragment, '', regex=False)
df_state_county.loc[vt_county_rows, 'School_district'] = vt_county_names

vt_grading_rows = df_grading.State == 'VT'
vt_grading_names = df_grading.loc[vt_grading_rows, 'School_district']
# Each step is (pattern, replacement, is_regex); order is significant.
for old, new, is_regex in (
    (' Educational Community Unified School District', '', False),
    (' Supervisory Union', '', False),
    (' Unified School District', '', False),
    (' School District', '', False),
    # Trailing token made of digits followed by at least one more word character.
    (r'\s+\d+\w+$', '', True),
    (' Union Elementary', '', False),
    (' Union', '', False),
    (' Incorporated', '', False),
    (' Public', '', False),
    (' Senior High', '', False),
    (' Cooperative', '', False),
    # The negative lookahead skips names already reading "Caledonia Central",
    # so re-running the cell does not yield "Caledonia Central Central".
    (r'Caledonia(?! Central)', 'Caledonia Central', True),
):
    vt_grading_names = vt_grading_names.str.replace(old, new, regex=is_regex)
df_grading.loc[vt_grading_rows, 'School_district'] = vt_grading_names


In [26]:
# Clean data for ME: drop the "Public School(s)" suffix and spell regional
# school units as 'School Administrative District' in both frames; df_grading
# also gets the 'RSU' abbreviation expanded and one district renamed.
me_common_steps = (
    (' Public Schools', ''),
    (' Public School', ''),
    ('Regional School Unit', 'School Administrative District'),
)

me_county_rows = df_state_county.State == 'ME'
me_county_names = df_state_county.loc[me_county_rows, 'School_district']
for old, new in me_common_steps:
    me_county_names = me_county_names.str.replace(old, new)
df_state_county.loc[me_county_rows, 'School_district'] = me_county_names

me_grading_rows = df_grading.State == 'ME'
me_grading_names = df_grading.loc[me_grading_rows, 'School_district']
for old, new in me_common_steps + (
    ('RSU', 'School Administrative District'),
    ('Boothbay Boothbay Harbor Central School District', 'Boothbay Boothbay Harbor Community School District'),
):
    me_grading_names = me_grading_names.str.replace(old, new)
df_grading.loc[me_grading_rows, 'School_district'] = me_grading_names

In [27]:
# Clean data for NJ: apply the (old, new) substitutions in order to the NJ
# rows of each frame. 'Township' is first shortened to 'Town' so the following
# ' Town' step removes both spellings.
nj_county_rows = df_state_county.State == 'NJ'
nj_county_names = df_state_county.loc[nj_county_rows, 'School_district']
for old, new in (
    (' Public Schools', ''),
    (' Public School', ''),
    (' School District', ''),
    (' Borough', ''),
    ('Township', 'Town'),
    (' Town', ''),
    (' City', ''),
):
    nj_county_names = nj_county_names.str.replace(old, new)
df_state_county.loc[nj_county_rows, 'School_district'] = nj_county_names

nj_grading_rows = df_grading.State == 'NJ'
nj_grading_names = df_grading.loc[nj_grading_rows, 'School_district']
for old, new in (
    (' Public Schools', ''),
    (' Public School', ''),
    (' School District', ''),
    (' District', ''),
    (' Borough', ''),
    ('Township', 'Town'),
    (' Town', ''),
    (' City', ''),
    (' Board of Education', ''),
):
    nj_grading_names = nj_grading_names.str.replace(old, new)
df_grading.loc[nj_grading_rows, 'School_district'] = nj_grading_names

In [28]:
#clean data for RI
# Suffixes removed from both frames; plural variants precede singular ones so
# the shorter fragment never leaves a stray 's' behind.
ri_shared_suffixes = [' Public Schools', ' Public School', ' Schools District', ' School District']
# Extra suffixes that are only removed from the school rating frame.
ri_grading_only_suffixes = [' School Department', ' School System']

for ri_frame, ri_suffixes in [
    (df_state_county, ri_shared_suffixes),
    (df_grading, ri_shared_suffixes + ri_grading_only_suffixes),
]:
    ri_rows = ri_frame.State == 'RI'
    ri_names = ri_frame.loc[ri_rows, 'School_district']
    for ri_suffix in ri_suffixes:
        ri_names = ri_names.str.replace(ri_suffix, '')
    ri_frame.loc[ri_rows, 'School_district'] = ri_names

In [29]:
#clean data for CO
# Normalises Colorado district names in both frames so they can serve as a
# merge key: first regex rules for numbered-district suffixes, then literal
# fragment rules, applied strictly in the order listed.

# ' School District', optionally followed by a letter code, then whitespace, a
# number and up to two trailing word characters, anchored at the end of the name.
co_numbered_suffix_patterns = [
    ' School District' + letter_code + r'\s+\d+\s?\w?\s?\w?$'
    for letter_code in ('', ' RE', ' R', ' RJ', ' J', ' C')
]

# Literal (fragment, replacement) rules for the county-level frame.
co_county_literal_rules = [
    (' School District RE1', ''),
    (' RE 2', ''),
    (' School District 10 JT R', ''),
    (' School District', ''),
    (' Public Schools', ''),
    (' Public School', ''),
    (' Public', ''),
    ('School District 49', 'Falcon'),
    (' Reorganized', ''),
    (' Schools', ''),
]

# Literal (fragment, replacement) rules for the school rating frame.
co_grading_literal_rules = [
    (' School District', ''),
    (' Public Schools', ''),
    (' Public School', ''),
    (' School Online', ''),
    (' RE 2', ''),
    (' Schools', ''),
    (' Joint', ''),
    (' R4 J', ''),
    ('Sangre De Cristo', 'Sangre de Cristo'),
    ('St Vrain Valley', 'St. Vrain Valley'),
    (' Re. 1J', ''),
    (' Re 12', ''),   # must precede ' Re 1', which is a prefix of it
    (' Re 1', ''),
    (' RJ1', ''),
    (' RE3', ''),
    ('Canon City Fremont RE 1', 'Cañon City'),
    (' Public', ''),
    (' Consolidated', ''),
    (' Reorganized', ''),
    ('School District Re. 4 Buffalo', 'Buffalo'),
    (' R1', ''),
    ('Platte Canyon District', 'Platte Canyon'),
]

# (frame, literal rules, strip surrounding whitespace afterwards?)
# Only the school rating frame is stripped.
co_cleanup_plan = [
    (df_state_county, co_county_literal_rules, False),
    (df_grading, co_grading_literal_rules, True),
]

for co_frame, co_literal_rules, co_strip in co_cleanup_plan:
    co_rows = co_frame.State == 'CO'
    co_names = co_frame.loc[co_rows, 'School_district']
    for co_pattern in co_numbered_suffix_patterns:
        co_names = co_names.str.replace(co_pattern, '', regex=True)
    for co_fragment, co_replacement in co_literal_rules:
        # regex=False: these are literal fragments (two of them contain '.'),
        # and the default of `regex` differs between pandas 1.x and 2.x.
        co_names = co_names.str.replace(co_fragment, co_replacement, regex=False)
    if co_strip:
        co_names = co_names.str.strip()
    co_frame.loc[co_rows, 'School_district'] = co_names


In [30]:
#clean data for FL
# Fragments removed from both frames so the remaining name can serve as the
# merge key. Plural variants precede singular ones.
fl_shared_fragments = [
    ' Public Schools',
    ' Public School',
    ' Schools District',
    ' School District',
    # No trailing space: a name ending in ' District Schools' has to match here,
    # otherwise the next rule removes ' District School' and leaves a dangling 's'.
    ' District Schools',
    ' District School',
    ' County',
]

for fl_frame, fl_fragments in [
    (df_state_county, fl_shared_fragments),
    # the school rating frame additionally loses a remaining ' Schools'
    (df_grading, fl_shared_fragments + [' Schools']),
]:
    fl_rows = fl_frame.State == 'FL'
    fl_names = fl_frame.loc[fl_rows, 'School_district']
    for fl_fragment in fl_fragments:
        fl_names = fl_names.str.replace(fl_fragment, '')
    fl_frame.loc[fl_rows, 'School_district'] = fl_names


In [31]:
#clean data for SC
# (fragment, replacement) rules applied to both frames, in this order.
sc_shared_rules = [
    (' Public Schools', ''),
    (' Public School', ''),
    (' Schools District', ''),
    (' School District', ''),
    (' Schools', ''),
    (' School', ''),
    (' County', ''),
]
# Rules that follow the shared ones in the county-level frame.
sc_county_tail_rules = [
    (' Consolidated', ''),
]
# Rules that follow the shared ones in the school rating frame; the first two
# rewrite individual district names.
sc_grading_tail_rules = [
    ('Lexington Richland 5', 'Lexington 5'),
    ('Spartanburg Seven', 'Spartanburg 7'),
    (' Consolidated', ''),
]

for sc_frame, sc_rules in [
    (df_state_county, sc_shared_rules + sc_county_tail_rules),
    (df_grading, sc_shared_rules + sc_grading_tail_rules),
]:
    sc_rows = sc_frame.State == 'SC'
    sc_names = sc_frame.loc[sc_rows, 'School_district']
    for sc_fragment, sc_replacement in sc_rules:
        sc_names = sc_names.str.replace(sc_fragment, sc_replacement)
    sc_frame.loc[sc_rows, 'School_district'] = sc_names



In [32]:
#clean data for AR
# Literal (fragment, replacement) rules applied to both frames, in this order.
ar_shared_rules = [
    (' Schools District', ''),
    (' School District', ''),
    (' Public Schools', ''),
    (' Public School', ''),
    (' Consolidated School District', ''),
    (' Consolidated Schools', ''),
    (' Consolidated', ''),
    (' Cty', ' County'),
    (' Schools', ''),
    (' School', ''),
]

# (pattern, replacement, is_regex) fixes for individual names in the
# county-level frame, applied after the shared rules.
ar_county_name_fixes = [
    (' Technical', ' Tech', False),
    ('Malvern Special', 'Malvern', False),
    ('Emerson Taylor Bradley', 'Emerson Taylor', False),
    ('Smackover Norphlet', 'Smackover', False),
    # Negative lookahead keeps this expansion idempotent: a name that already
    # reads 'Marvell Elaine' (e.g. after re-running the cell) is left alone
    # instead of becoming 'Marvell Elaine Elaine'.
    (r'Marvell(?! Elaine)', 'Marvell Elaine', True),
    ('Mount Vernon Enola', 'Mt. Vernon/Enola', False),
    ('South Side Bee Branch', 'South Side', False),
    ('Deer/Mount Judea', 'Deer/Mt. Judea', False),
    ('De Queen', 'DeQueen', False),
    # Same idempotency guard as for 'Marvell'.
    (r'Strong(?! Huttig)', 'Strong Huttig', True),
]

ar_county_rows = df_state_county.State == 'AR'
ar_county_names = df_state_county.loc[ar_county_rows, 'School_district']
for ar_fragment, ar_replacement in ar_shared_rules:
    # regex=False pins literal matching regardless of the pandas version's default
    ar_county_names = ar_county_names.str.replace(ar_fragment, ar_replacement, regex=False)
for ar_pattern, ar_replacement, ar_is_regex in ar_county_name_fixes:
    ar_county_names = ar_county_names.str.replace(ar_pattern, ar_replacement, regex=ar_is_regex)
df_state_county.loc[ar_county_rows, 'School_district'] = ar_county_names

ar_grading_rows = df_grading.State == 'AR'
# the capitalisation fix runs before the shared rules in the school rating frame
ar_grading_names = df_grading.loc[ar_grading_rows, 'School_district'].str.replace('Dewitt', 'DeWitt', regex=False)
for ar_fragment, ar_replacement in ar_shared_rules:
    ar_grading_names = ar_grading_names.str.replace(ar_fragment, ar_replacement, regex=False)
df_grading.loc[ar_grading_rows, 'School_district'] = ar_grading_names



In [33]:
#clean data for UT
# Both frames get the same treatment: ' County' is removed first, then ' City'.
for ut_frame in (df_state_county, df_grading):
    ut_rows = ut_frame.State == 'UT'
    ut_names = ut_frame.loc[ut_rows, 'School_district']
    for ut_word in (' County', ' City'):
        ut_names = ut_names.str.replace(ut_word, '')
    ut_frame.loc[ut_rows, 'School_district'] = ut_names



In [34]:
#clean data for TN
# Literal suffixes removed from both frames, in this order (plural/longer
# variants before their shorter counterparts).
tn_shared_suffixes = [
    ' Schools District',
    ' School District',
    ' Schools System',
    ' School System',
    ' Schools',
    ' School',
    ' Municipal',
    ' District',
]

# (pattern, replacement, is_regex) fixes for individual names in the
# county-level frame, applied after the shared suffixes.
tn_county_name_fixes = [
    ('Metropolitan', 'Metro', False),
    ('Oak Ridge City', 'Oak Ridge', False),
    # Negative lookahead keeps this expansion idempotent: a name that already
    # reads 'Gibson County Special' (e.g. after re-running the cell) is left
    # alone instead of becoming 'Gibson County Special Special'.
    (r'Gibson County(?! Special)', 'Gibson County Special', True),
    ('Jackson Madison County', 'Jackson County', False),
]

tn_county_rows = df_state_county.State == 'TN'
tn_county_names = df_state_county.loc[tn_county_rows, 'School_district']
for tn_suffix in tn_shared_suffixes:
    # regex=False pins literal matching regardless of the pandas version's default
    tn_county_names = tn_county_names.str.replace(tn_suffix, '', regex=False)
for tn_pattern, tn_replacement, tn_is_regex in tn_county_name_fixes:
    tn_county_names = tn_county_names.str.replace(tn_pattern, tn_replacement, regex=tn_is_regex)
df_state_county.loc[tn_county_rows, 'School_district'] = tn_county_names

tn_grading_rows = df_grading.State == 'TN'
tn_grading_names = df_grading.loc[tn_grading_rows, 'School_district']
for tn_suffix in tn_shared_suffixes:
    tn_grading_names = tn_grading_names.str.replace(tn_suffix, '', regex=False)
# capitalisation fix that only applies to the school rating frame
tn_grading_names = tn_grading_names.str.replace('Dekalb', 'DeKalb', regex=False)
df_grading.loc[tn_grading_rows, 'School_district'] = tn_grading_names


In [35]:
#clean data for NY
# County-level frame: remove a parenthesised tag at the very end of the name
# (one or more '(' followed by word characters, at most one whitespace, more
# word characters and ')'), then trim the whitespace left behind.
ny_county_rows = df_state_county.State == 'NY'
ny_county_names = df_state_county.loc[ny_county_rows, 'School_district']
ny_county_names = ny_county_names.str.replace(r'\(+\w+\s?\w+\)$', '', regex=True)
df_state_county.loc[ny_county_rows, 'School_district'] = ny_county_names.str.strip()

# School rating frame: collapse every numbered NYC geographic district into one
# name. The '.' after 'No' is escaped so it only matches a literal dot.
ny_grading_rows = df_grading.State == 'NY'
ny_grading_names = df_grading.loc[ny_grading_rows, 'School_district']
df_grading.loc[ny_grading_rows, 'School_district'] = ny_grading_names.str.replace(
    r'New York City Geographic District No\. \d+$',
    'New York City Department Of Education',
    regex=True,
)


In [38]:
# Left-join the school rating df onto the county level school district df:
# every county-level row is kept, and rating columns are NaN where no
# (State, School_district) pair matches.
merge_keys = ['State', 'School_district']
result = df_state_county.merge(df_grading, how='left', on=merge_keys)

In [48]:
# Keep only the necessary columns and drop rows whose score is missing
# (presumably districts the left merge could not match to a rating).
score_column = 'Niche_grade_coverted_to_score'
result = result[['State', 'County_name', 'County_code', score_column]].dropna(subset=[score_column])

In [50]:
# Average the school rating over all school districts that share a county;
# the three grouping keys become the index of the result.
county_keys = ['State', 'County_name', 'County_code']
result = result.groupby(by=county_keys).mean()

In [51]:
# Display the final county-level table (a bare last expression uses the rich DataFrame display).
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Niche_grade_coverted_to_score
State,County_name,County_code,Unnamed: 3_level_1
AK,aleutians east,c02013,88.0
AK,aleutians west,c02016,85.5
AK,anchorage,c02020,91.0
AK,bethel,c02050,72.1
AK,bristol bay,c02060,84.5
...,...,...,...
WY,sweetwater,c56037,79.0
WY,teton,c56039,91.0
WY,uinta,c56041,80.0
WY,washakie,c56043,86.0


In [52]:
# Write the county-level result to a CSV file; the index is written as well
# (to_csv default).
# NOTE(review): the file name reads 'sates' - presumably 'states' was meant;
# left unchanged so anything that already reads this file keeps working.
output_csv = 'sates_county_level_school_district_rating.csv'
result.to_csv(output_csv)