In [95]:
import pandas as pd
import numpy as np
import config
import json
import requests

import fiona
import geopandas as gpd

import warnings
warnings.filterwarnings("ignore")

In [2]:
from uszipcode import SearchEngine

### We only want to keep target states
FL    12<br>
VA    51<br>
NC    37<br>
GA    13<br>
WV    54<br>
MD    24<br>
SC    45<br>
DC    11<br>

## (1) School Districts geoJSON to grab list of GEOIDS & School Districts

In [3]:
#upload geojson school boundaries file
with open('../nicole/clean_data/school_districts_simplified.json') as f2:
    schools_geodata = json.load(f2)
#schools_geodata['features']

In [92]:
#make a df of GEOIDs and school districts
dict_schools = {}
for each in schools_geodata['features']:
    dict_schools[each['properties']['GEOID']] = each['properties']['NAME']

geoid_schools = pd.DataFrame.from_dict(dict_schools, orient='index').reset_index()
geoid_schools.columns = ['GEOID','school_district']

## (2) GreatSchools.org Scrape Demographics

In [47]:
#upload school district info from greatschools.org scrape
df = pd.read_json('../jack/General/data/great_schools.json') 
df['pctLowIncome'] = df['pctLowIncome'].str.rstrip('%').replace('?', -1).astype(float)
df['pctLowIncome'] = df['pctLowIncome'].replace(-1, np.nan)
#historical = historical[df["LOT SIZE"].notna()]

In [71]:
#grab just the relevant columns we want from greatschools df
greatschools_agg = df.groupby('districtName').agg(avg_rating =('overallRating','mean'), total_enrollment= ('enrollment', 'sum'),
                                                  num_schools=('enrollment','nunique'), avg_pct_low_income=('pctLowIncome','mean'),
                                                  state = ('state', lambda x: x.value_counts().index[0])).round(1).reset_index()
greatschools_agg = greatschools_agg.rename(columns={'districtName':'school_district'})
greatschools_agg = greatschools_agg[greatschools_agg['total_enrollment'] > 15]

# df1 = (df.groupby(['road','dirn'], as_index=False)
#          .agg({'length':'sum','lane':lambda x: x.value_counts().index[0]}))

#df2['school_district'] = df2['school_district'].str.replace(r'\School\b','School District')
#df2['school_district'] = df2['school_district'].str.replace(r'\Schools\b','School District')
#df3 = df2[df2['school_district'].str.contains(r'\School\b')]
#df2.loc[df2['school_district'] == 'Lee County Schools']

greatschools_agg

Unnamed: 0,school_district,avg_rating,total_enrollment,num_schools,avg_pct_low_income,state
1,A.C.E. Academy,5.0,409,1,56.0,NC
2,Abbeville 60,6.7,3012,9,68.5,SC
3,Accomack Co Public Schools,4.6,5115,12,64.5,VA
4,Achievement Preparatory Academy PCS,3.0,825,2,76.0,DC
5,Ahfachkee Day School,,177,1,100.0,FL
...,...,...,...,...,...,...
1077,York 04,7.9,16081,17,16.0,SC
1078,York Co Public Schools,7.0,12750,18,23.8,VA
1079,Youngsville Academy,5.0,332,1,3.0,NC
1080,Youthbuild PCS,,302,2,70.0,DC


In [72]:
greatschools_agg = greatschools_agg.sort_values('total_enrollment', ascending = False).head(750)

#### Save to CSVs

In [73]:
#from geoJSON to grab geoIDS
geoid_schools.to_csv('../nicole/clean_data/geoIDs_district_names.csv')

#from greatschools.org
greatschools_agg.to_csv('../nicole/clean_data/greatschools_scrape.csv')

#next step is to merge these two....

## (3) Matching GreatSchools.org districts with GeoIDs

- Finding School Districts by 'first word'

In [74]:
#import csv of first words of every school district (did this in excel)
first_words = pd.read_csv('../nicole/clean_data/first_word_geoID.csv')
list_first_words= first_words['first_word'].tolist()

# list of sentences form greatschools
list_sentences = greatschools_agg['school_district'].tolist()

#list_first_words

In [76]:
#matching first words with sentences that contain said word
dict_sentence_word = {}
for word in list_first_words:
    for i, sentence in enumerate(list_sentences):
        if word in sentence:
            dict_sentence_word[sentence] = word     
dict_sentence_word

{'District of Columbia Public Schools': 'Columbia',
 'District of Columbia International School': 'Columbia',
 'Alachua': 'Alachua',
 'Baker': 'Baker',
 'Bay': 'Bay',
 'Bradford': 'Bradford',
 'Bradford Preparatory School': 'Bradford',
 'Brevard': 'Brevard',
 'Broward': 'Broward',
 'Calhoun City': 'Calhoun',
 'Calhoun': 'Calhoun',
 'Calhoun 01': 'Calhoun',
 'Calhoun County Schools': 'Calhoun',
 'Charlotte-Mecklenburg Schools': 'Mecklenburg',
 'Charlotte': 'Charlotte',
 'Charlottesville Cty Public Schools': 'Charlottesville',
 'Charlotte Co Public Schools': 'Charlotte',
 'KIPP: Charlotte': 'Charlotte',
 'Charlotte Lab School': 'Charlotte',
 'Citrus': 'Citrus',
 'Clayton County': 'Clay',
 'Clay': 'Clay',
 'Clay County Schools': 'Clay',
 'Collier': 'Collier',
 'Columbia County': 'Columbia',
 'Columbia': 'Columbia',
 'Miami-Dade': 'Dade',
 'Dade County': 'Dade',
 'Dixie': 'Dixie',
 'Duval': 'Duval',
 'Escambia': 'Escambia',
 'Flagler': 'Flagler',
 'Franklin County Schools': 'Franklin',
 'F

In [77]:
#finding the first words with multiple sentences
keys_per_value = {}
for sentence, first_word in dict_sentence_word.items():
    keys_per_value.setdefault(first_word, set()).add(sentence)
    
counts = {k: len(v) for k, v in keys_per_value.items()}
#counts

In [78]:
#making new list of first words that have 1:1 match with sentences
list_remove_first_words = list()
for k,v in counts.items():
    if v != 1:
        list_remove_first_words.append(k)
        
#list_final_first_words

In [79]:
#final dictionary mapping first words with sentences, indicating 1:1 match relationship

for key, value in dict(dict_sentence_word).items():
    if value in list_remove_first_words:
         del dict_sentence_word[key]

In [711]:
#dict_sentence_word

In [80]:
df_matched = pd.DataFrame.from_dict(dict_sentence_word, orient='index').reset_index()
df_matched.columns=['school_district','first_word']

df_matched.head()

#df_matched.merge(df_matched, on='school_district', how="left")

Unnamed: 0,school_district,first_word
0,Alachua,Alachua
1,Baker,Baker
2,Bay,Bay
3,Brevard,Brevard
4,Broward,Broward


- #### Merging GreatSchools.org scrape and School Boundaries JSON on 'school_district' & 'GEOID'

In [86]:
combined = greatschools_agg.merge(df_matched, on='school_district', how="left").merge(
        first_words, on='first_word', how="left").merge(geoid_schools, on='school_district', how='left')


combined['GEOID'] = combined['GEOID_x'].fillna(combined['GEOID_y']).astype(float)
combined = combined.drop(['GEOID_x','GEOID_y'], axis = 1)
combined = combined.sort_values('GEOID').head(481).reset_index()
#combined.GEOID.value_counts()
combined

Unnamed: 0,index,school_district,avg_rating,total_enrollment,num_schools,avg_pct_low_income,state,first_word,school_district_id,GEOID
0,42,District of Columbia Public Schools,5.7,48218,104,81.2,DC,,,1100030.0
1,71,Alachua,5.0,29834,60,52.2,FL,Alachua,Alachua County School District,1200030.0
2,312,Baker,5.2,5060,7,33.6,FL,Baker,Baker County School District,1200060.0
3,97,Bay,4.9,23100,38,58.4,FL,Bay,Bay County School District,1200090.0
4,26,Brevard,5.8,73331,100,49.7,FL,Brevard,Brevard County School District,1200150.0
...,...,...,...,...,...,...,...,...,...,...
476,607,Webster County Schools,4.2,1315,4,69.8,WV,Webster,Webster County School District,5401530.0
477,476,Wetzel County Schools,4.5,2527,9,53.6,WV,Wetzel,Wetzel County School District,5401560.0
478,657,Wirt County Schools,3.0,1010,3,50.7,WV,Wirt,Wirt County School District,5401590.0
479,163,Wood County Schools,6.3,12423,28,52.2,WV,Wood,Wood County School District,5401620.0


In [87]:
#geoid stuff
matched_geoid = pd.DataFrame(combined[combined['GEOID'].notna()].GEOID).astype(int).astype(object)

not_matched_geoid = geoid_schools.merge(matched_geoid.drop_duplicates(), on=['GEOID'], 
                                      how='left', indicator=True).query('_merge=="left_only"')

In [88]:
#combined['GEOID'].duplicated().sort_values(ascending = True).head(460)
#combined['GEOID'].isna()

combined_na = combined[combined['GEOID'].isna()].sort_values('total_enrollment', ascending = False).head(160)
combined_na

Unnamed: 0,index,school_district,avg_rating,total_enrollment,num_schools,avg_pct_low_income,state,first_word,school_district_id,GEOID


#### Save CSVs

In [104]:
#GEOIDs that didnt match
not_matched_geoid.to_csv('../nicole/clean_data/not_matched_geoID.csv')

#school districts that didn't match
combined_na.to_csv('../nicole/clean_data/not_matched_school_district_NA.csv')

#final merge of (1) & (2)
combined['school_district_id'][295] = 'District of Columbia Public Schools'
combined['GEOID'] = combined['GEOID'].astype(int).astype(object)
combined.to_csv('../nicole/clean_data/school_districts_greatschools_demographics.csv')

In [762]:
#sanity check
#df2.loc[df2['school_district'] == 'Alachua']
#df.dropna()[df.dropna()['districtName'].str.contains('Alachua')]

## (4) Historical & Current Redfin Data

In [98]:
pd.set_option('display.max_columns', 500)

In [96]:
historical = pd.read_csv('https://nycdsacapstone2021.blob.core.windows.net/time-series/combined_redfin_historical_12-05-21.csv')

In [97]:
historical = historical.drop_duplicates()
historical = historical.drop(columns=['NEXT OPEN HOUSE START TIME','NEXT OPEN HOUSE END TIME'
                                      ,'FAVORITE','SOURCE'
                                      ,'HOA/MONTH', 'MLS#'])

In [99]:
historical = historical[historical["YEAR BUILT"].notna()]
historical = historical[historical["BEDS"].notna()]
historical = historical[historical["SQUARE FEET"].notna()]
historical = historical[historical["ZIP OR POSTAL CODE"].notna()]
historical["STATUS"]= historical["STATUS"].apply(lambda x: "Sold" if x=="Sold" else "Active")
historical["YEAR BUILT"] = historical["YEAR BUILT"].astype("int")
historical["PRICE"] = historical["PRICE"].astype("int")
historical["SOLD DATE"] = pd.to_datetime(historical["SOLD DATE"].fillna(0))
historical["Year_Sold"] = historical["SOLD DATE"].apply(lambda x: x.year)
historical["Year_Sold"] = historical["Year_Sold"].replace(1970, 0)

In [100]:
historical["Log_price"] = np.log(historical["PRICE"])

In [101]:
historical.to_csv('../nicole/clean_data/historical_cleaned.csv')

# Files to use for mapping

In [94]:
print('school_districts_greatschools_demographics.csv')
combined

mapping_data.csv


Unnamed: 0,index,school_district,avg_rating,total_enrollment,num_schools,avg_pct_low_income,state,first_word,school_district_id,GEOID
0,42,District of Columbia Public Schools,5.7,48218,104,81.2,DC,,,1100030
1,71,Alachua,5.0,29834,60,52.2,FL,Alachua,Alachua County School District,1200030
2,312,Baker,5.2,5060,7,33.6,FL,Baker,Baker County School District,1200060
3,97,Bay,4.9,23100,38,58.4,FL,Bay,Bay County School District,1200090
4,26,Brevard,5.8,73331,100,49.7,FL,Brevard,Brevard County School District,1200150
...,...,...,...,...,...,...,...,...,...,...
476,607,Webster County Schools,4.2,1315,4,69.8,WV,Webster,Webster County School District,5401530
477,476,Wetzel County Schools,4.5,2527,9,53.6,WV,Wetzel,Wetzel County School District,5401560
478,657,Wirt County Schools,3.0,1010,3,50.7,WV,Wirt,Wirt County School District,5401590
479,163,Wood County Schools,6.3,12423,28,52.2,WV,Wood,Wood County School District,5401620


In [103]:
print('historical_cleaned.csv')
historical

historical_cleaned.csv


Unnamed: 0,SALE TYPE,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,$/SQUARE FEET,STATUS,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),INTERESTED,LATITUDE,LONGITUDE,Year_Sold,Log_price
0,MLS Listing,1970-01-01,Single Family Residential,1412 Lee Ln,Hopewell,VA,23860,169900,3.0,2.0,Belmont,1040.0,10018.0,1997,2.0,163.0,Active,https://www.redfin.com/VA/Hopewell/1412-Lee-Ln...,Y,37.273772,-77.301207,0,12.042965
1,MLS Listing,1970-01-01,Single Family Residential,812 North Ave,Hopewell,VA,23860,275000,4.0,3.0,FARMINGDALE,2506.0,,1971,3.0,110.0,Active,https://www.redfin.com/VA/Hopewell/812-North-A...,Y,37.279344,-77.316350,0,12.524526
2,MLS Listing,1970-01-01,Single Family Residential,102 S Colonial Dr,Hopewell,VA,23860,242950,3.0,2.0,Edwin Sims Property,1611.0,12632.0,1960,6.0,151.0,Active,https://www.redfin.com/VA/Hopewell/102-S-Colon...,Y,37.297638,-77.319467,0,12.400611
3,MLS Listing,1970-01-01,Single Family Residential,3919 Robin Hood Dr,Hopewell,VA,23860,199000,4.0,2.0,Prince George Court Section F,1663.0,13503.0,1984,6.0,120.0,Active,https://www.redfin.com/VA/Hopewell/3919-Robin-...,Y,37.303146,-77.331629,0,12.201060
4,MLS Listing,1970-01-01,Single Family Residential,3028 Grace St,Hopewell,VA,23860,229750,3.0,2.0,Kenilworth,1484.0,14810.0,1955,10.0,155.0,Active,https://www.redfin.com/VA/Hopewell/3028-Grace-...,Y,37.298924,-77.315952,0,12.344747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304225,PAST SALE,2019-10-31,Single Family Residential,318 N Linden Ave,Highland Springs,VA,23075,180000,3.0,2.0,Greendale Park,2131.0,39639.0,1955,766.0,84.0,Sold,https://www.redfin.com/VA/Highland-Springs/318...,Y,37.551286,-77.316266,2019,12.100712
304226,PAST SALE,2019-09-20,Single Family Residential,401 Kramer Dr,Highland Springs,VA,23075,167000,3.0,2.0,Meadowview,1688.0,11325.0,1958,807.0,99.0,Sold,https://www.redfin.com/VA/Highland-Springs/401...,Y,37.553499,-77.323805,2019,12.025749
304227,PAST SALE,2019-01-14,Townhouse,302 Bernie Ct,Henrico,VA,23075,63000,3.0,1.5,Fairlawn Townhouses,1061.0,1742.0,1977,1056.0,59.0,Sold,https://www.redfin.com/VA/Highland-Springs/302...,Y,37.535703,-77.314729,2019,11.050890
304228,PAST SALE,2020-12-21,Townhouse,319 Bernie Ct,Henrico,VA,23075,73000,3.0,1.5,Fairlawn,1061.0,1742.0,1977,349.0,69.0,Sold,https://www.redfin.com/VA/Highland-Springs/319...,Y,37.535871,-77.313851,2020,11.198215


## Upload to Azure Storage (?)

In [70]:
from azure.storage.blob import BlobClient
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__

import os

# IMPORTANT: Replace connection string with your storage account connection string
# Usually starts with DefaultEndpointsProtocol=https;...

CONNECTIONSTRING = os.environ.get("AZURE_STORAGE_CONNECTIONSTRING", dtype=str)
CONNECTIONSTRING

    
# Replace with blob container. This should be already created in azure storage.
MY_IMAGE_CONTAINER = "mapping"
 
# Replace with the local folder which contains the image files for upload
LOCAL_IMAGE_PATH = "../nicole/clean_data/mapping_data.csv"
 
class AzureBlobFileUploader:
    def __init__(self):
        print("Intializing AzureBlobFileUploader")

    # Initialize the connection to Azure storage account
    self.blob_service_client =  BlobServiceClient.from_connection_string(CONNECTIONSTRING)

    def upload_all_images_in_folder(self):
    # Get all files with jpg extension and exclude directories
        all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
                        if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]

    # Upload each file
    for file_name in all_file_names:
        self.upload_image(file_name)

    def upload_image(self,file_name):
    # Create blob with same name as local file name
        blob_client = self.blob_service_client.get_blob_client(container=MY_IMAGE_CONTAINER,
                                                              blob=file_name)
    # Get full path to the file
        upload_file_path = os.path.join(LOCAL_IMAGE_PATH, file_name)

    # Create blob on storage
    # Overwrite if it already exists!
    image_content_setting = ContentSettings(content_type='image/jpeg')
    print(f"uploading file - {file_name}")
    with open(upload_file_path, "rb") as data:
        blob_client.upload_blob(data,overwrite=True,content_settings=image_content_setting)


# Initialize class and upload files
azure_blob_file_uploader = AzureBlobFileUploader()
azure_blob_file_uploader.upload_all_images_in_folder()

TypeError: get() got an unexpected keyword argument 'dtype'