In [1]:
import folium
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from folium import plugins
from shapely.geometry import Point

from src.config import DADOS_LIMPOS, DADOS_GEO_ORIGINAIS, DADOS_GEO_MEDIAN
from src.graficos import PALETTE, SCATTER_ALPHA

# Global seaborn look for every figure in this notebook: dark axes style, bright palette.
sns.set_theme(style="dark", palette="bright")

In [2]:
# Load the cleaned dataset from the path configured as DADOS_LIMPOS in src.config.
df = pd.read_parquet(path=DADOS_LIMPOS)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,median_income_cat,rooms_per_household,population_per_household,bedrooms_per_room,population_per_room
0,-122.23,37.88,41,880,129,322,126,8.3252,452600,NEAR BAY,5,6.984127,2.555556,0.146591,0.365909
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500,NEAR BAY,5,6.238137,2.109842,0.155797,0.338217
2,-122.26,37.84,42,2555,665,1206,595,2.0804,226700,NEAR BAY,2,4.294118,2.026891,0.260274,0.472016
3,-122.26,37.85,50,1120,283,697,264,2.125,140000,NEAR BAY,2,4.242424,2.640152,0.252679,0.622321
4,-122.26,37.84,50,2239,455,990,419,1.9911,158700,NEAR BAY,2,5.343675,2.362768,0.203216,0.442162


In [3]:
# Inspect column dtypes and non-null counts of the loaded dataset.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 17515 entries, 0 to 17514
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   longitude                 17515 non-null  float64 
 1   latitude                  17515 non-null  float64 
 2   housing_median_age        17515 non-null  int8    
 3   total_rooms               17515 non-null  int16   
 4   total_bedrooms            17515 non-null  int16   
 5   population                17515 non-null  int16   
 6   households                17515 non-null  int16   
 7   median_income             17515 non-null  float64 
 8   median_house_value        17515 non-null  int32   
 9   ocean_proximity           17515 non-null  category
 10  median_income_cat         17515 non-null  int64   
 11  rooms_per_household       17515 non-null  float64 
 12  population_per_household  17515 non-null  float64 
 13  bedrooms_per_room         17515 non-null  float64 
 14  p

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,median_income_cat,rooms_per_household,population_per_household,bedrooms_per_room,population_per_room
count,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0,17515.0
mean,-119.513273,35.606241,27.266115,2514.896489,515.906537,1385.809877,484.407194,3.700456,189688.061547,2.954953,5.260755,2.929839,0.211285,0.5846
std,1.985391,2.14838,11.324505,1593.711025,320.946868,844.291144,295.661729,1.54181,96037.570708,1.006643,1.128166,0.685911,0.047841,0.206795
min,-124.3,32.54,1.0,6.0,2.0,3.0,2.0,0.4999,14999.0,1.0,1.714286,0.75,0.113535,0.166667
25%,-121.605,33.92,18.0,1468.0,300.0,816.0,287.0,2.5523,114300.0,2.0,4.455787,2.46941,0.17686,0.447914
50%,-118.47,34.25,28.0,2136.0,438.0,1195.0,415.0,3.4838,171700.0,3.0,5.210526,2.850365,0.20335,0.523885
75%,-117.98,37.68,36.0,3128.0,643.0,1737.0,605.0,4.625,243900.0,4.0,5.963796,3.30162,0.23861,0.659836
max,-114.55,41.95,51.0,11026.0,2205.0,5804.0,1979.0,10.5941,500000.0,5.0,10.352941,5.392384,0.406295,1.529833


In [5]:
# Load the county geometries from the path configured as DADOS_GEO_ORIGINAIS in src.config.
gdf_counties = gpd.read_file(DADOS_GEO_ORIGINAIS)

# Preview the first five counties.
gdf_counties.head(n=5)

Unnamed: 0,name,fullname,abbrev,abcode,ansi,geometry
0,Siskiyou,Siskiyou County,SIS,c047,93,"POLYGON ((-121.44597 41.1839, -121.46238 41.18..."
1,Del Norte,Del Norte County,DNT,c008,15,"MULTIPOLYGON (((-124.19097 41.73619, -124.1933..."
2,Modoc,Modoc County,MOD,c025,49,"POLYGON ((-121.44784 41.99739, -121.42322 41.9..."
3,Trinity,Trinity County,TRI,c053,105,"POLYGON ((-122.66795 40.77637, -122.66814 40.7..."
4,Shasta,Shasta County,SHA,c045,89,"POLYGON ((-122.17328 40.37906, -122.17359 40.3..."


In [6]:
# Build point geometries from the longitude/latitude columns so the data
# can be handled as a GeoDataFrame by GeoPandas.
geometry = gpd.points_from_xy(x=df["longitude"], y=df["latitude"])

gdf = gpd.GeoDataFrame(data=df, geometry=geometry)

In [7]:
# Preview the first rows (head() defaults to 5) to confirm the new geometry column.
gdf.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,median_income_cat,rooms_per_household,population_per_household,bedrooms_per_room,population_per_room,geometry
0,-122.23,37.88,41,880,129,322,126,8.3252,452600,NEAR BAY,5,6.984127,2.555556,0.146591,0.365909,POINT (-122.23 37.88)
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500,NEAR BAY,5,6.238137,2.109842,0.155797,0.338217,POINT (-122.22 37.86)
2,-122.26,37.84,42,2555,665,1206,595,2.0804,226700,NEAR BAY,2,4.294118,2.026891,0.260274,0.472016,POINT (-122.26 37.84)
3,-122.26,37.85,50,1120,283,697,264,2.125,140000,NEAR BAY,2,4.242424,2.640152,0.252679,0.622321,POINT (-122.26 37.85)
4,-122.26,37.84,50,2239,455,990,419,1.9911,158700,NEAR BAY,2,5.343675,2.362768,0.203216,0.442162,POINT (-122.26 37.84)


In [10]:
# Inspect dtypes and non-null counts again, now on the GeoDataFrame that carries the geometry column.
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 17515 entries, 0 to 17514
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   longitude                 17515 non-null  float64 
 1   latitude                  17515 non-null  float64 
 2   housing_median_age        17515 non-null  int8    
 3   total_rooms               17515 non-null  int16   
 4   total_bedrooms            17515 non-null  int16   
 5   population                17515 non-null  int16   
 6   households                17515 non-null  int16   
 7   median_income             17515 non-null  float64 
 8   median_house_value        17515 non-null  int32   
 9   ocean_proximity           17515 non-null  category
 10  median_income_cat         17515 non-null  int64   
 11  rooms_per_household       17515 non-null  float64 
 12  population_per_household  17515 non-null  float64 
 13  bedrooms_per_room         17515 non-nu

In [8]:
# Declare that the point coordinates are WGS 84 longitude/latitude (EPSG:4326).
# set_crs is used instead of assigning to `gdf.crs`: it only labels the data
# (no reprojection), is a no-op when the same CRS is already set (so this cell
# can be re-run without the "Overriding the CRS" DeprecationWarning), and it
# matches how the CRS is set on `gdf_counties`.
gdf = gdf.set_crs("EPSG:4326")

In [14]:
# Label the county polygons as WGS 84 (EPSG:4326) so both layers share one CRS.
# set_crs does not reproject; it raises if a different CRS is already assigned.
gdf_counties = gdf_counties.set_crs(crs="EPSG:4326")

In [9]:
# Display the CRS currently assigned to the housing points.
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [15]:
# Display the CRS assigned to the county polygons, to compare with the CRS of the points.
gdf_counties.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [16]:
# Preview the first five counties after the CRS has been set.
gdf_counties.head(n=5)

Unnamed: 0,name,fullname,abbrev,abcode,ansi,geometry
0,Siskiyou,Siskiyou County,SIS,c047,93,"POLYGON ((-121.44597 41.1839, -121.46238 41.18..."
1,Del Norte,Del Norte County,DNT,c008,15,"MULTIPOLYGON (((-124.19097 41.73619, -124.1933..."
2,Modoc,Modoc County,MOD,c025,49,"POLYGON ((-121.44784 41.99739, -121.42322 41.9..."
3,Trinity,Trinity County,TRI,c053,105,"POLYGON ((-122.66795 40.77637, -122.66814 40.7..."
4,Shasta,Shasta County,SHA,c045,89,"POLYGON ((-122.17328 40.37906, -122.17359 40.3..."


In [20]:
# Spatially join the two frames so each housing block (point) receives the
# attributes of the county polygon that contains it. how="left" keeps every
# housing block, including those that fall inside no county polygon.
gdf_joined = gdf.sjoin(gdf_counties, how="left", predicate="within")

gdf_joined.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,population_per_household,bedrooms_per_room,population_per_room,geometry,index_right,name,fullname,abbrev,abcode,ansi
0,-122.23,37.88,41,880,129,322,126,8.3252,452600,NEAR BAY,...,2.555556,0.146591,0.365909,POINT (-122.23 37.88),34.0,Alameda,Alameda County,ALA,c001,1
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500,NEAR BAY,...,2.109842,0.155797,0.338217,POINT (-122.22 37.86),34.0,Alameda,Alameda County,ALA,c001,1
2,-122.26,37.84,42,2555,665,1206,595,2.0804,226700,NEAR BAY,...,2.026891,0.260274,0.472016,POINT (-122.26 37.84),34.0,Alameda,Alameda County,ALA,c001,1
3,-122.26,37.85,50,1120,283,697,264,2.125,140000,NEAR BAY,...,2.640152,0.252679,0.622321,POINT (-122.26 37.85),34.0,Alameda,Alameda County,ALA,c001,1
4,-122.26,37.84,50,2239,455,990,419,1.9911,158700,NEAR BAY,...,2.362768,0.203216,0.442162,POINT (-122.26 37.84),34.0,Alameda,Alameda County,ALA,c001,1


In [21]:
# Remove the join bookkeeping column and the county attributes that are not
# used, leaving only the county `name` and `abbrev` from the right-hand frame.
# errors="ignore" keeps this cell re-runnable: `gdf_joined` is reassigned to
# itself, so a second run would otherwise raise KeyError because the columns
# have already been dropped.
gdf_joined = gdf_joined.drop(
    columns=["index_right", "fullname", "abcode", "ansi"],
    errors="ignore",
)

gdf_joined.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 17515 entries, 0 to 17514
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   longitude                 17515 non-null  float64 
 1   latitude                  17515 non-null  float64 
 2   housing_median_age        17515 non-null  int8    
 3   total_rooms               17515 non-null  int16   
 4   total_bedrooms            17515 non-null  int16   
 5   population                17515 non-null  int16   
 6   households                17515 non-null  int16   
 7   median_income             17515 non-null  float64 
 8   median_house_value        17515 non-null  int32   
 9   ocean_proximity           17515 non-null  category
 10  median_income_cat         17515 non-null  int64   
 11  rooms_per_household       17515 non-null  float64 
 12  population_per_household  17515 non-null  float64 
 13  bedrooms_per_room         17515 non-nu

In [26]:
# Housing blocks for which the spatial join found no county.
# The joined `name` column is tested directly rather than "any null in any
# column", so missing values in unrelated columns cannot be mistaken for an
# unmatched county.
gdf_joined[gdf_joined["name"].isna()]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,median_income_cat,rooms_per_household,population_per_household,bedrooms_per_room,population_per_room,geometry,name,abbrev
1507,-124.30,41.80,19,2672,552,1298,478,1.9797,85800,NEAR OCEAN,2,5.589958,2.715481,0.206587,0.485778,POINT (-124.3 41.8),,
1508,-124.23,41.75,11,3159,616,1343,479,2.4805,73200,NEAR OCEAN,2,6.594990,2.803758,0.194998,0.425135,POINT (-124.23 41.75),,
1511,-124.22,41.73,28,3003,699,1530,653,1.7038,78300,NEAR OCEAN,2,4.598775,2.343032,0.232767,0.509491,POINT (-124.22 41.73),,
1517,-124.30,41.84,17,2677,531,1244,456,3.0313,103600,NEAR OCEAN,3,5.870614,2.728070,0.198356,0.464699,POINT (-124.3 41.84),,
1521,-119.94,38.96,20,1451,386,467,255,1.5536,212500,INLAND,2,5.690196,1.831373,0.266023,0.321847,POINT (-119.94 38.96),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17150,-119.30,34.27,17,1527,503,688,423,1.6007,187500,NEAR OCEAN,2,3.609929,1.626478,0.329404,0.450557,POINT (-119.3 34.27),,
17151,-119.29,34.26,32,3295,764,1344,600,3.6007,395500,NEAR OCEAN,3,5.491667,2.240000,0.231866,0.407891,POINT (-119.29 34.26),,
17153,-119.29,34.23,22,2486,608,709,523,2.9018,275000,NEAR OCEAN,2,4.753346,1.355641,0.244570,0.285197,POINT (-119.29 34.23),,
17183,-119.23,34.15,18,6213,1188,2679,1000,3.7480,380400,NEAR OCEAN,3,6.213000,2.679000,0.191212,0.431193,POINT (-119.23 34.15),,


In [29]:
# Index labels of the housing blocks that were not matched to any county.
# Selected through the joined `name` column so that only join misses are
# captured, not rows with nulls in unrelated columns.
linhas_faltantes = gdf_joined.loc[gdf_joined["name"].isna()].index
linhas_faltantes

Index([ 1507,  1508,  1511,  1517,  1521,  2154,  2193,  2201,  2203,  2219,
       ...
       15699, 15700, 15704, 15707, 15708, 17150, 17151, 17153, 17183, 17209],
      dtype='int64', length=148)