In [1]:
import math
import pandas as pd
import geopandas as gpd
from geopy.geocoders import Nominatim
# from learntools.geospatial.tools import Nominatim 


import folium 
from folium import Marker
from folium.plugins import MarkerCluster

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


/kaggle/input/ca-county-bounds/CA_county_boundaries.prj
/kaggle/input/ca-county-bounds/CA_county_boundaries.dbf
/kaggle/input/ca-county-bounds/CA_county_boundaries.shp
/kaggle/input/ca-county-bounds/CA_county_boundaries.shx
/kaggle/input/ca-county-bounds/CA_county_boundaries.cpg
/kaggle/input/ca-county-general/CA_county_median_age.csv
/kaggle/input/ca-county-general/CA_county_population.csv
/kaggle/input/ca-county-general/CA_county_high_earners.csv
/kaggle/input/starbucks-locations/starbucks_locations.csv


# Introduction

This project investigates the demographics of California counties to identify potentially suitable locations for Starbucks Reserve Roasteries.

In [2]:
def embed_map(m, file_name):
    """Write a map to an HTML file and return an IFrame pointing at that file.

    Parameters
    ----------
    m : object
        Anything exposing ``save(path)``; this notebook passes ``folium.Map``
        instances.
    file_name : str
        Path of the HTML file to write; also used as the IFrame source.

    Returns
    -------
    IPython.display.IFrame
        A frame 100% wide and 500px high that shows ``file_name``.
    """
    # Local import: IPython is only looked up when the function is called.
    from IPython.display import IFrame

    m.save(file_name)
    frame = IFrame(file_name, width='100%', height='500px')
    return frame

In [3]:
# Read the Starbucks store table from the Kaggle dataset and show its first rows.
starbucks_csv = '/kaggle/input/starbucks-locations/starbucks_locations.csv'
starbucks = pd.read_csv(starbucks_csv)
starbucks.head()

Unnamed: 0,Store Number,Store Name,Address,City,Longitude,Latitude
0,10429-100710,Palmdale & Hwy 395,14136 US Hwy 395 Adelanto CA,Adelanto,-117.4,34.51
1,635-352,Kanan & Thousand Oaks,5827 Kanan Road Agoura CA,Agoura,-118.76,34.16
2,74510-27669,Vons-Agoura Hills #2001,5671 Kanan Rd. Agoura Hills CA,Agoura Hills,-118.76,34.15
3,29839-255026,Target Anaheim T-0677,8148 E SANTA ANA CANYON ROAD AHAHEIM CA,AHAHEIM,-117.75,33.87
4,23463-230284,Safeway - Alameda 3281,2600 5th Street Alameda CA,Alameda,-122.28,37.79


In [4]:
# Count the missing values in each column.
print(starbucks.isnull().sum())

# View rows with missing locations. Select on the coordinates themselves
# rather than on a hard-coded city name, so that every store lacking a
# longitude or latitude is picked up, wherever it is.
rows_with_missing = starbucks[starbucks[["Longitude", "Latitude"]].isnull().any(axis=1)]
rows_with_missing

Store Number    0
Store Name      0
Address         0
City            0
Longitude       5
Latitude        5
dtype: int64


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Store Number,Store Name,Address,City,Longitude,Latitude
153,5406-945,2224 Shattuck - Berkeley,2224 Shattuck Avenue Berkeley CA,Berkeley,,
154,570-512,Solano Ave,1799 Solano Avenue Berkeley CA,Berkeley,,
155,17877-164526,Safeway - Berkeley #691,1444 Shattuck Place Berkeley CA,Berkeley,,
156,19864-202264,Telegraph & Ashby,3001 Telegraph Avenue Berkeley CA,Berkeley,,
157,9217-9253,2128 Oxford St.,2128 Oxford Street Berkeley CA,Berkeley,,


In [5]:
import time

# Create the geocoder
geolocator = Nominatim(user_agent="kaggle_learn")


def geocoder(row):
    """Geocode one address string with Nominatim.

    Parameters
    ----------
    row : str
        The address to look up (the caller below passes a store's
        ``Address`` value).

    Returns
    -------
    pandas.Series
        ``Latitude`` and ``Longitude`` of the match. Both are NaN when the
        address has no match or the request fails, so every call returns a
        Series with the same labels.
    """
    # Returning the same labels on success and failure matters:
    # DataFrame.apply(axis=1) only assembles a DataFrame when every row yields
    # a Series, and DataFrame.update ignores NaN, so a failed lookup simply
    # leaves the original cell untouched instead of discarding all results.
    not_found = pd.Series({'Latitude': float('nan'), 'Longitude': float('nan')})
    try:
        time.sleep(1)  # Important: prevent Nominatim throttling
        location = geolocator.geocode(row)
    except Exception as exc:  # not a bare except: Ctrl-C must still stop the cell
        print(f"Geocoding failed for {row!r}: {exc}")
        return not_found
    if location is None:  # geocode() returns None when nothing matches
        print(f"No geocoding match for {row!r}")
        return not_found
    point = location.point
    return pd.Series({'Latitude': point.latitude, 'Longitude': point.longitude})

berkeley_locations = rows_with_missing.apply(lambda x: geocoder(x['Address']), axis=1)
# update() aligns on the index and overwrites only where the new value is not NaN.
starbucks.update(berkeley_locations)
starbucks[starbucks["City"]=="Berkeley"]

Unnamed: 0,Store Number,Store Name,Address,City,Longitude,Latitude
153,5406-945,2224 Shattuck - Berkeley,2224 Shattuck Avenue Berkeley CA,Berkeley,-122.26823,37.868839
154,570-512,Solano Ave,1799 Solano Avenue Berkeley CA,Berkeley,-122.280013,37.891477
155,17877-164526,Safeway - Berkeley #691,1444 Shattuck Place Berkeley CA,Berkeley,-122.269679,37.880907
156,19864-202264,Telegraph & Ashby,3001 Telegraph Avenue Berkeley CA,Berkeley,-122.259406,37.855903
157,9217-9253,2128 Oxford St.,2128 Oxford Street Berkeley CA,Berkeley,-122.266095,37.870253


In [6]:
# Base map for the Berkeley stores
m_2 = folium.Map(location=[37.88,-122.26], zoom_start=13)

# Add one marker per Berkeley store
berkeley_stores = starbucks.loc[starbucks["City"] == 'Berkeley']
for lat, lon in zip(berkeley_stores['Latitude'], berkeley_stores['Longitude']):
    Marker([lat, lon]).add_to(m_2)

# Show the map
embed_map(m_2, 'q_2.html')

### Click to see the map
- [Berkeley Starbucks Locations](https://egekiratli.github.io/DS-ML/maps/berkeley-locations.html)


In [7]:
# Load the county boundary geometries from the shapefile.
CA_counties = gpd.read_file("/kaggle/input/ca-county-bounds/CA_county_boundaries.shp")
# Declare the coordinates as EPSG:4326. The authority string replaces the
# deprecated {'init': 'epsg:4326'} dict, which makes pyproj emit a
# FutureWarning every time the cell runs.
CA_counties.crs = "EPSG:4326"
CA_counties.head()

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Unnamed: 0,GEOID,name,area_sqkm,geometry
0,6091,Sierra County,2491.995494,"POLYGON ((-120.65560 39.69357, -120.65554 39.6..."
1,6067,Sacramento County,2575.258262,"POLYGON ((-121.18858 38.71431, -121.18732 38.7..."
2,6083,Santa Barbara County,9813.817958,"MULTIPOLYGON (((-120.58191 34.09856, -120.5822..."
3,6009,Calaveras County,2685.626726,"POLYGON ((-120.63095 38.34111, -120.63058 38.3..."
4,6111,Ventura County,5719.321379,"MULTIPOLYGON (((-119.63631 33.27304, -119.6360..."


- `CA_pop` contains an estimate of the population of each county.
- `CA_high_earners` contains the number of households with an income of at least $150,000 per year.
- `CA_median_age` contains the median age for each county.

In [8]:
def _read_county_table(csv_path):
    """Read one per-county CSV file, using its ``GEOID`` column as the index."""
    return pd.read_csv(csv_path, index_col="GEOID")


CA_pop = _read_county_table("/kaggle/input/ca-county-general/CA_county_population.csv")
CA_high_earners = _read_county_table("/kaggle/input/ca-county-general/CA_county_high_earners.csv")
CA_median_age = _read_county_table("/kaggle/input/ca-county-general/CA_county_median_age.csv")

In [9]:
# Combine the three per-county tables on their shared GEOID index, then turn
# GEOID back into an ordinary column so it can be used as the merge key.
county_tables = CA_pop.join([CA_high_earners, CA_median_age])
cols_to_add = county_tables.reset_index()

# Attach the combined statistics to the county geometries.
CA_stats = CA_counties.merge(cols_to_add, on="GEOID")

In [10]:
# Population density: the "population" column divided by "area_sqkm".
CA_stats["density"] = CA_stats["population"].div(CA_stats["area_sqkm"])

Select counties where:
- there are at least 100,000 households making at least \$150,000 per year,
- the median age is less than 38.5, and
- the density of inhabitants is at least 285 (per square kilometer).

Additionally, selected counties should satisfy at least one of the following criteria:
- there are at least 500,000 households making at least \$150,000 per year,
- the median age is less than 35.5, or
- the density of inhabitants is at least 1400 (per square kilometer).

In [11]:
# Every selected county must satisfy all three baseline criteria.
# "At least" is inclusive, so the lower bounds use >= rather than >.
meets_all_baseline = (
    (CA_stats.high_earners >= 100000)
    & (CA_stats.median_age < 38.5)
    & (CA_stats.density >= 285)
)

# In addition, at least one of the stricter criteria must hold.
meets_any_strict = (
    (CA_stats.high_earners >= 500000)
    | (CA_stats.median_age < 35.5)
    | (CA_stats.density >= 1400)
)

sel_counties = CA_stats[meets_all_baseline & meets_any_strict]

In [12]:
# Build one point per store (x = Longitude, y = Latitude) and declare the
# coordinates as EPSG:4326. The CRS is passed as an authority string because
# the {'init': 'epsg:4326'} dict form is deprecated and makes pyproj emit a
# FutureWarning.
store_points = gpd.points_from_xy(starbucks.Longitude, starbucks.Latitude)
starbucks_gdf = gpd.GeoDataFrame(starbucks, geometry=store_points, crs="EPSG:4326")

  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [13]:
# Spatially join the store points with the selected counties and count the
# resulting rows.
locations_of_interest = gpd.sjoin(left_df=starbucks_gdf, right_df=sel_counties)
num_stores = locations_of_interest.shape[0]

### Visualize the possible locations for the new store

In [14]:
# Create a base map
m_6 = folium.Map(location=[37,-120], zoom_start=6)

# Show selected store locations
mc = MarkerCluster()

# `locations_of_interest` already holds the result of the spatial join run in
# the previous cell, so it is reused here instead of repeating the join.
# Rows without coordinates cannot be placed on the map and are dropped up front.
plottable = locations_of_interest.dropna(subset=['Latitude', 'Longitude'])
for lat, lon in zip(plottable['Latitude'], plottable['Longitude']):
    mc.add_child(folium.Marker([lat, lon]))

m_6.add_child(mc)

# Show the map
embed_map(m_6, 'q_6.html')

### Click to see the map
- [Possible New Locations](https://egekiratli.github.io/DS-ML/maps/possible-locations.html)


In [15]:
# Write both maps out as HTML files.
for folium_map, html_name in (
    (m_2, "berkeley-locations.html"),
    (m_6, "possible-locations.html"),
):
    folium_map.save(html_name)