In [1]:
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

In [116]:
#Import the file from the html site
df = pd.read_html("https://www.seechicagorealestate.com/chicago-zip-codes-by-neighborhood.php")[0]


In [117]:
#Add new columns
df['Latitude'] = ''
df['Longitude'] = ''

#Add headers
my_columns = ["Neighborhood", "Zip", "Latitude", "Longitude"]
df.columns = my_columns

df.head()

Unnamed: 0,Neighborhood,Zip,Latitude,Longitude
0,Downtown,Zip Code,,
1,Cathedral District,60611,,
2,Central Station,60605,,
3,Dearborn Park,60605,,
4,Gold Coast,"60610, 60611",,


In [118]:
#Drop areas outside of Chicago - all zips after North Shore
NorthShoreIndex = df[df['Neighborhood']=='North Shore'].index.item()
df = df[:NorthShoreIndex]

#Drop all nan rows
df = df.dropna()

df.head(30)


Unnamed: 0,Neighborhood,Zip,Latitude,Longitude
0,Downtown,Zip Code,,
1,Cathedral District,60611,,
2,Central Station,60605,,
3,Dearborn Park,60605,,
4,Gold Coast,"60610, 60611",,
5,Loop,"60601, 60602, 60603, 60604, 60605, 60606, 6060...",,
6,Magnificent Mile,60611,,
7,Museum Campus,60605,,
8,Near North Side,"60610, 60611, 60642, 60654",,
9,Near West Side,"60606, 60607, 60608, 60610, 60612, 60661",,


In [119]:
#Drop rows that are category group headers
df = df[~df['Zip'].isin(['Zip Code'])]
df.reset_index(drop=True, inplace=True)

df.head(30)



Unnamed: 0,Neighborhood,Zip,Latitude,Longitude
0,Cathedral District,60611,,
1,Central Station,60605,,
2,Dearborn Park,60605,,
3,Gold Coast,"60610, 60611",,
4,Loop,"60601, 60602, 60603, 60604, 60605, 60606, 6060...",,
5,Magnificent Mile,60611,,
6,Museum Campus,60605,,
7,Near North Side,"60610, 60611, 60642, 60654",,
8,Near West Side,"60606, 60607, 60608, 60610, 60612, 60661",,
9,New East Side,60601,,


In [103]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [135]:
#Loop through df and set latitude and longitude
geolocator = Nominatim(user_agent="chicago_zips")
for index, row in df.iterrows():
    zips = df.loc[index,'Zip']
    zipList = zips.split(", ")
    #print(zipList)
    
    latitudeTotal = 0
    longitudeTotal = 0
    counter = 0
    for neighborhoodZip in zipList:
        #print(neighborhoodZip)
        zip = '60605'

        location = geolocator.geocode(neighborhoodZip)
        latitude = location.latitude
        longitude = location.longitude
        #print('The geograpical coordinate of {} are {}, {}.'.format(df.loc[index,'Neighborhood'],latitude, longitude))

        counter = counter + 1
        latitudeTotal = latitudeTotal + latitude
        longitudeTotal = longitudeTotal + longitude
    latitudeTotal = latitudeTotal / counter
    longitudeTotal = longitudeTotal / counter
    
    #Set the latitude and longitude in the df
    df.loc[index,'Latitude'] = latitudeTotal
    df.loc[index,'Longitude'] = longitudeTotal
    
df.head()

The geograpical coordinate of Cathedral District are 41.9013755, -87.623854.
The geograpical coordinate of Central Station are 41.8659973, -87.6081513.
The geograpical coordinate of Dearborn Park are 41.8659973, -87.6081513.
The geograpical coordinate of Gold Coast are 41.9038725257493, -87.6385957471562.
The geograpical coordinate of Gold Coast are 41.9013755, -87.623854.
The geograpical coordinate of Loop are 9.42030540980843, -84.1550425023166.
The geograpical coordinate of Loop are 41.8828718344361, -87.6291188024761.
The geograpical coordinate of Loop are 41.8811082918444, -87.6283114289812.
The geograpical coordinate of Loop are 41.8784178587146, -87.627759574011.
The geograpical coordinate of Loop are 41.8659973, -87.6081513.
The geograpical coordinate of Loop are 41.8828394635735, -87.6365272935001.
The geograpical coordinate of Loop are 41.8780630922049, -87.6515362099198.
The geograpical coordinate of Loop are 41.847864064102, -87.6306229996288.
The geograpical coordinate of 

Unnamed: 0,Neighborhood,Zip,Latitude,Longitude
0,Cathedral District,60611,41.9014,-87.6239
1,Central Station,60605,41.866,-87.6082
2,Dearborn Park,60605,41.866,-87.6082
3,Gold Coast,"60610, 60611",41.9026,-87.6312
4,Loop,"60601, 60602, 60603, 60604, 60605, 60606, 6060...",37.8172,-87.1959


In [120]:
df.shape

(77, 4)

In [319]:
#Import the file from the html site
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",header=0)[0]

#Remove rows that are not assigned
df = df[df.Borough != 'Not assigned']

#Find Duplicated rows
df_non_dups = df.drop_duplicates(['Postcode','Borough'], keep=False)
df_dups = df[df.duplicated(['Postcode','Borough'], keep=False)]

#Create blank dataframe to store de-duped values
df_dedup = pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'])

#Get values of first row in dups
updateIndex = df_dups.index[0]
currentPostcode = df_dups.iloc[0,0]
currentBorough = df_dups.iloc[0,1]
updateNeighborhood = ''
concatString = ''

#Loop through the dups and create update dataframe with concatinated names
for index, row in df_dups.iterrows():
  if row['Postcode'] == currentPostcode:
    updateNeighborhood = updateNeighborhood + concatString + row['Neighbourhood'] 
    concatString = ', '
  else:  
    df_dedup.loc[updateIndex] = np.array([currentPostcode, currentBorough, updateNeighborhood])
    
    updateIndex = index
    currentPostcode = row['Postcode']
    currentBorough = row['Borough']
    updateNeighborhood = row['Neighbourhood'] 

#Add the last row
df_dedup.loc[updateIndex] = np.array([currentPostcode, currentBorough, updateNeighborhood])


#Combine data frames to get full list
df_non_dups = df_non_dups.append(df_dedup)

#Update Neighbourhoods that are not assigned
df_non_dups.Neighbourhood.replace('Not assigned',df_non_dups.Borough,inplace=True)


df_non_dups.shape

df_non_dups.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
14,M3B,North York,Don Mills North


In [338]:
#Get the latitude and longitude table
df = pd.read_csv("http://cocl.us/Geospatial_data")

#Update the index
df = df.rename(columns={'Postal Code': 'Postcode'})
df.set_index('Postcode', inplace = True)

df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [347]:
#Assign df2 to existing df_non_dups
df2 = df_non_dups

#Add new columns
df2['Latitude'] = ''
df2['Longitude'] = ''

#Set the latitude and longitude
for index, row in df2.iterrows():
  df2.loc[index].Latitude = df.loc[row['Postcode']].Latitude
  df2.loc[index].Longitude = df.loc[row['Postcode']].Longitude

df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7533,-79.3297
3,M4A,North York,Victoria Village,43.7259,-79.3156
8,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
10,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
14,M3B,North York,Don Mills North,43.7459,-79.3522


## Explore Dataset

In [350]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /Users/coxda/anaconda3

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-2.2.2               |        py36_1001         493 KB  conda-forge
    branca-0.3.0               |             py_0          24 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.2 MB

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py36_1001 conda-forge
    branca:  0.3.0-py_0      conda-forge
    folium:  0.5.0-py_0      conda-forge
    vincent: 0.4.4-py_1

ImportError: No module named 'folium'

In [353]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Solving environment: done

# All requested packages already installed.



ImportError: No module named 'geopy'

In [352]:
address = 'Toronto, CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

NameError: name 'Nominatim' is not defined

In [348]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_toronto

NameError: name 'folium' is not defined