Clean and organize the imported simplemaps data.

# Import Packages

In [111]:
import os
import yaml
import datetime
import pandas as pd

# Load the configuration file.
# The location can be overridden through the REAL_ESTATE_CONFIG environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the local default path below is used.
config_path = os.environ.get(
    'REAL_ESTATE_CONFIG',
    r'C:/Users/Dev/Documents/Real Estate Data/config/config.yaml',
)

# YAML is read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Directories

In [112]:
# Imported (raw) data directories, taken from the simplemaps section of the config
raw_dir_cities = config['data']['simplemaps_data']['raw']['cities']
raw_dir_counties = config['data']['simplemaps_data']['raw']['counties']
raw_dir_neighborhoods = config['data']['simplemaps_data']['raw']['neighborhoods']
# The load step joins `raw_dir_zipcodes` with the zip code file name, so it has
# to be defined here for the notebook to run on a fresh kernel.
# NOTE(review): assumes the config has a 'zipcodes' entry next to the other
# raw directories -- confirm the key name against config.yaml.
raw_dir_zipcodes = config['data']['simplemaps_data']['raw']['zipcodes']


# Cleaned (processed) data directories
processed_dir_cities = config['data']['simplemaps_data']['processed']['cities']
processed_dir_counties = config['data']['simplemaps_data']['processed']['counties']
processed_dir_neighborhoods = config['data']['simplemaps_data']['processed']['neighborhoods']

# Files

In [113]:
# File names, grouped by dataset: the raw simplemaps export first,
# then the cleaned file this notebook writes.

# Cities
raw_file_cities = 'uscities.csv'
processed_cities = 'us_cities.csv'

# Counties
raw_file_counties = 'uscounties.csv'
processed_counties = 'us_counties.csv'

# Neighborhoods
raw_file_neighborhoods = 'usneighborhoods.csv'
processed_neighborhoods = 'us_neighborhoods.csv'

# Zip codes
raw_file_zipcodes = 'uszips.csv'
processed_zipcodes = 'us_zipcodes.csv'

# Load Data Frames

In [114]:
# Read each raw simplemaps export into its own DataFrame.
#
# The `zips` column of the cities and neighborhoods files holds space-separated
# ZIP codes that are split with `.str.split()` later on, so it is read as text
# explicitly; this also keeps any leading zeros instead of relying on dtype
# inference.
df_cities = pd.read_csv(
    os.path.join(raw_dir_cities, raw_file_cities),
    dtype={'zips': str},
    low_memory=False,
)

df_counties = pd.read_csv(
    os.path.join(raw_dir_counties, raw_file_counties),
    low_memory=False,
)

df_neighborhoods = pd.read_csv(
    os.path.join(raw_dir_neighborhoods, raw_file_neighborhoods),
    dtype={'zips': str},
    low_memory=False,
)

# NOTE(review): the columns of the zip code file are not shown in this notebook;
# if its ZIP column is parsed as a number, leading zeros are lost -- consider
# passing a str dtype for that column once its name is confirmed.
df_zipcodes = pd.read_csv(
    os.path.join(raw_dir_zipcodes, raw_file_zipcodes),
    low_memory=False,
)

# Clean Data Process - Cities

In [115]:
# Show the raw cities column names before choosing which ones to keep
df_cities.columns

Index(['city', 'city_ascii', 'state_id', 'state_name', 'county_fips',
       'county_name', 'lat', 'lng', 'population', 'density', 'source',
       'military', 'incorporated', 'timezone', 'ranking', 'zips', 'id'],
      dtype='object')

In [116]:
# Columns of the raw cities table that are carried into the cleaned output.
# Not kept: city_ascii, county_fips, lat, lng, population, density, source,
# military, incorporated, timezone, ranking.
columns_to_keep = [
    'city',
    'state_id',
    'state_name',
    'county_name',
    'zips',
    'id',
]

# Select the subset once and bind it to the working frame `df`
df = df_cities[columns_to_keep]

In [117]:
# Old name -> new name for the columns kept from the cities table
column_rename = {
    'city': 'city',
    'state_id': 'stateID',
    'state_name': 'state',
    'county_name': 'county',
    'zips': 'zipcode',
    'id': 'cityID',
}

# Apply the mapping; rename() returns a new frame
df = df.rename(columns=column_rename)

In [118]:
# Turn the space-separated `zipcode` string into one row per ZIP code.
# The split is done through assign() so `df` itself is left untouched: the cell
# can be re-run without applying `.str.split()` to already-split lists.
df_exploded = (
    df.assign(zipcode=df['zipcode'].str.split())
    .explode('zipcode')
    .reset_index(drop=True)  # fresh 0..n-1 index after rows were duplicated
)
df_exploded

Unnamed: 0,city,stateID,state,county,zipcode,cityID
0,New York,NY,New York,Queens,11229,1840034016
1,New York,NY,New York,Queens,11228,1840034016
2,New York,NY,New York,Queens,11226,1840034016
3,New York,NY,New York,Queens,11225,1840034016
4,New York,NY,New York,Queens,11224,1840034016
...,...,...,...,...,...,...
48186,Kohatk,AZ,Arizona,Pinal,85634,1840022983
48187,Ironville,PA,Pennsylvania,Blair,16686,1840152922
48188,Newkirk,NM,New Mexico,Guadalupe,88417,1840024978
48189,Falcon Village,TX,Texas,Starr,78545,1840018314


In [119]:
# Rebind df_cities to the cleaned, one-row-per-zipcode table; the save step writes this frame
df_cities = df_exploded

# Save File - Cities

In [120]:
# Destination: processed cities directory + cleaned cities file name
csv_file_path = os.path.join(
    processed_dir_cities,
    processed_cities,
)

# Write the cleaned table without the DataFrame index
df_cities.to_csv(csv_file_path, index=False)

# Clean Data Process - Counties

In [121]:
# Show the raw counties column names before choosing which ones to keep
df_counties.columns

Index(['county', 'county_ascii', 'county_full', 'county_fips', 'state_id',
       'state_name', 'lat', 'lng', 'population'],
      dtype='object')

In [122]:
# Columns of the raw counties table that are carried into the cleaned output.
# Not kept: county_ascii, county_full, county_fips, state_id, lat, lng, population.
columns_to_keep = [
    'county',
    'state_name',
]

# Select the subset once and bind it to the working frame `df`
df = df_counties[columns_to_keep]

In [123]:
# Old name -> new name for the columns kept from the counties table
column_rename = {
    'county': 'county',
    'state_name': 'state',
}

# Apply the mapping, then rebind df_counties to the cleaned frame
df = df.rename(columns=column_rename)
df_counties = df

# Save File - Counties

In [124]:
# Destination: processed counties directory + cleaned counties file name
csv_file_path = os.path.join(
    processed_dir_counties,
    processed_counties,
)

# Write the cleaned table without the DataFrame index
df_counties.to_csv(csv_file_path, index=False)

# Clean Data Process - Neighborhoods

In [125]:
# Show the raw neighborhoods column names before choosing which ones to keep
df_neighborhoods.columns

Index(['neighborhood', 'neighborhood_ascii', 'lat', 'lng', 'city_name',
       'city_id', 'state_name', 'state_id', 'source', 'timezone', 'zips',
       'county_fips', 'county_name', 'id'],
      dtype='object')

In [126]:
# Preview the raw neighborhoods table before cleaning
df_neighborhoods

Unnamed: 0,neighborhood,neighborhood_ascii,lat,lng,city_name,city_id,state_name,state_id,source,timezone,zips,county_fips,county_name,id
0,Atlanta University Center,Atlanta University Center,33.74947,-84.41125,Atlanta,1840013660,Georgia,GA,polygon,America/New_York,30314,13121,Fulton,184001366011100
1,Hunter Hills,Hunter Hills,33.75872,-84.43123,Atlanta,1840013660,Georgia,GA,polygon,America/New_York,30314,13121,Fulton,184001366011101
2,Bankhead,Bankhead,33.76901,-84.42452,Atlanta,1840013660,Georgia,GA,polygon,America/New_York,30318 30314,13121,Fulton,184001366011102
3,English Avenue,English Avenue,33.77070,-84.41099,Atlanta,1840013660,Georgia,GA,polygon,America/New_York,30318 30314,13121,Fulton,184001366011103
4,Adair Park,Adair Park,33.73117,-84.41083,Atlanta,1840013660,Georgia,GA,polygon,America/New_York,30310,13121,Fulton,184001366011104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,Buckhead,Buckhead,33.83955,-84.37937,Atlanta,1840013660,Georgia,GA,point,America/New_York,30305,13121,Fulton,184001366014125
3588,Blair Village,Blair Village,33.65761,-84.37020,Atlanta,1840013660,Georgia,GA,point,America/New_York,30354,13121,Fulton,184001366014126
3589,West Tampa,West Tampa,27.95335,-82.48926,Tampa,1840015982,Florida,FL,point,America/New_York,33607,12057,Hillsborough,184001598214105
3590,Terra Cotta,Terra Cotta,38.95372,-76.99831,Washington,1840006060,District of Columbia,DC,point,America/New_York,20011,11001,District of Columbia,184000606014149


In [127]:
# Columns of the raw neighborhoods table that are carried into the cleaned output.
# Not kept: neighborhood_ascii, lat, lng, source, timezone, county_fips.
columns_to_keep = [
    'neighborhood',
    'city_name',
    'city_id',
    'state_name',
    'state_id',
    'zips',
    'county_name',
    'id',
]

# Select the subset once and bind it to the working frame `df`
df = df_neighborhoods[columns_to_keep]

In [128]:
# Old name -> new name for the columns kept from the neighborhoods table
column_rename = {
    'neighborhood': 'neighborhood',
    'city_name': 'city',
    'city_id': 'cityID',
    'state_name': 'state',
    'state_id': 'stateID',
    'zips': 'zipcode',
    'county_name': 'county',
    'id': 'neighborhoodID',
}

# Apply the mapping; rename() returns a new frame
df = df.rename(columns=column_rename)

In [129]:
# Turn the space-separated `zipcode` string into one row per ZIP code.
# The split is done through assign() so `df` itself is left untouched: the cell
# can be re-run without applying `.str.split()` to already-split lists.
df_exploded = (
    df.assign(zipcode=df['zipcode'].str.split())
    .explode('zipcode')
    .reset_index(drop=True)  # fresh 0..n-1 index after rows were duplicated
)
df_exploded

Unnamed: 0,neighborhood,city,cityID,state,stateID,zipcode,county,neighborhoodID
0,Atlanta University Center,Atlanta,1840013660,Georgia,GA,30314,Fulton,184001366011100
1,Hunter Hills,Atlanta,1840013660,Georgia,GA,30314,Fulton,184001366011101
2,Bankhead,Atlanta,1840013660,Georgia,GA,30318,Fulton,184001366011102
3,Bankhead,Atlanta,1840013660,Georgia,GA,30314,Fulton,184001366011102
4,English Avenue,Atlanta,1840013660,Georgia,GA,30318,Fulton,184001366011103
...,...,...,...,...,...,...,...,...
5455,Buckhead,Atlanta,1840013660,Georgia,GA,30305,Fulton,184001366014125
5456,Blair Village,Atlanta,1840013660,Georgia,GA,30354,Fulton,184001366014126
5457,West Tampa,Tampa,1840015982,Florida,FL,33607,Hillsborough,184001598214105
5458,Terra Cotta,Washington,1840006060,District of Columbia,DC,20011,District of Columbia,184000606014149


In [130]:
# Rebind df_neighborhoods to the cleaned, one-row-per-zipcode table; the save step writes this frame
df_neighborhoods = df_exploded

# Save File - Neighborhoods

In [131]:
# Destination: processed neighborhoods directory + cleaned neighborhoods file name.
# The file name variable defined in the Files cell is `processed_neighborhoods`.
csv_file_path = os.path.join(processed_dir_neighborhoods, processed_neighborhoods)

# Write the cleaned table without the DataFrame index
df_neighborhoods.to_csv(csv_file_path, index=False)