{DESCRIPTION OF FILE}

# Import Packages

In [76]:
import os
import yaml
import datetime
import pandas as pd

# Load the configuration file
with open(r'C:/Users/Dev/Documents/Real Estate Data/config/config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Directories

In [77]:
# Imported Data Directory
cities_processed_dir = config['data']['simplemaps_data']['processed']['cities']
counties_processed_dir = config['data']['simplemaps_data']['processed']['counties']
neighborhoods_processed_dir = config['data']['simplemaps_data']['processed']['neighborhoods']
raw_zillow_area_dir = config['data']['zillow_data']['raw']['area']['Other']

# Cleaned Data Directory
master_dir = config['data']['simplemaps_data']['master']

# Files

In [78]:
# Imported Data 
cities_file = 'us_cities.csv'
counties_file = 'us_counties.csv'
neighborhoods_file = 'us_neighborhoods.csv'
zillow_area = 'zillow state, metro, city, zipcode data.csv'

# Cleaned Data
master_file = 'simplemaps_master_data.csv'

# Import Files

In [79]:
file_path = os.path.join(cities_processed_dir, cities_file)
cities = pd.read_csv(file_path, low_memory=False)

file_path = os.path.join(counties_processed_dir, counties_file)
counties = pd.read_csv(file_path, low_memory=False)

file_path = os.path.join(neighborhoods_processed_dir, neighborhoods_file)
neighborhoods = pd.read_csv(file_path, low_memory=False)

file_path = os.path.join(raw_zillow_area_dir, zillow_area)
zillow = pd.read_csv(file_path, low_memory=False)

### Load Data Frame

In [80]:
print(cities.columns)
print(counties.columns)
print(neighborhoods.columns)

Index(['city', 'stateID', 'state', 'county', 'zipcode', 'cityID'], dtype='object')
Index(['county', 'state'], dtype='object')
Index(['neighborhood', 'city', 'cityID', 'state', 'stateID', 'zipcode',
       'county', 'neighborhoodID'],
      dtype='object')


## Metro

In [81]:
# Modify the 'Metro' column
zillow['Metro'] = zillow['Metro'].str.split(',').str[0]

# Modify the 'Metro' column
zillow['Metro'] = zillow['Metro'].str.split(',').str[0]

metro_mapping = zillow.set_index('City')['Metro'].to_dict()

# Add the 'metro' column to the 'cities' DataFrame
cities['metro'] = cities['city'].map(metro_mapping)

# Clean Data Process

In [82]:
master = pd.concat([cities, counties, neighborhoods], axis=0, ignore_index=True).drop_duplicates()

In [83]:
# Reorder the columns
master = master[['stateID', 'state', 'metro', 'county', 'city', 'zipcode', 'neighborhood']]
master

Unnamed: 0,stateID,state,metro,county,city,zipcode,neighborhood
0,NY,New York,New York-Newark-Jersey City,Queens,New York,11229.0,
1,NY,New York,New York-Newark-Jersey City,Queens,New York,11228.0,
2,NY,New York,New York-Newark-Jersey City,Queens,New York,11226.0,
3,NY,New York,New York-Newark-Jersey City,Queens,New York,11225.0,
4,NY,New York,New York-Newark-Jersey City,Queens,New York,11224.0,
...,...,...,...,...,...,...,...
56790,GA,Georgia,,Fulton,Atlanta,30305.0,Buckhead
56791,GA,Georgia,,Fulton,Atlanta,30354.0,Blair Village
56792,FL,Florida,,Hillsborough,Tampa,33607.0,West Tampa
56793,DC,District of Columbia,,District of Columbia,Washington,20011.0,Terra Cotta


# Save File

In [84]:
# Define the file path
csv_file_path = os.path.join(master_dir, master_file)
master.to_csv(csv_file_path, index=False)