## Packages

In [1]:
import pandas as pd
import numpy as np
import gdown
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

## Dependencies

In [150]:
gdown.download(id="122b3O7GoH2pouBcx411M1h0qegZytNpo")

Downloading...
From: https://drive.google.com/uc?id=122b3O7GoH2pouBcx411M1h0qegZytNpo
To: C:\Users\guimi\Documents\CloudProjects\indicium_desafio\pipeline\teste_indicium_precificacao.csv
100%|██████████| 7.08M/7.08M [00:01<00:00, 4.43MB/s]


'teste_indicium_precificacao.csv'

In [2]:
df = pd.read_csv("../assets/teste_indicium_precificacao.csv")

## EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48894 entries, 0 to 48893
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             48894 non-null  int64  
 1   nome                           48878 non-null  object 
 2   host_id                        48894 non-null  int64  
 3   host_name                      48873 non-null  object 
 4   bairro_group                   48894 non-null  object 
 5   bairro                         48894 non-null  object 
 6   latitude                       48894 non-null  float64
 7   longitude                      48894 non-null  float64
 8   room_type                      48894 non-null  object 
 9   price                          48894 non-null  int64  
 10  minimo_noites                  48894 non-null  int64  
 11  numero_de_reviews              48894 non-null  int64  
 12  ultima_review                  38842 non-null 

In [4]:
df.describe().loc[['min', 'mean', 'max'], :]

Unnamed: 0,id,host_id,latitude,longitude,price,minimo_noites,numero_de_reviews,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
min,2595.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
mean,19017530.0,67621390.0,40.728951,-73.952169,152.720763,7.030085,23.274758,1.373251,7.144005,112.776169
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


### Preprocessing dataset

In [5]:
df_pp = df.copy()
df_pp['ultima_review_timestamp'] = df_pp['ultima_review'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').timestamp() if pd.notnull(x) else None)
df_pp['reviews_por_mes'] = df_pp['reviews_por_mes'].fillna(0)
df_pp['ultima_review'] = df_pp['ultima_review'].fillna(0)
df_pp['ultima_review_timestamp'] = df_pp['ultima_review_timestamp'].fillna(0)
df_pp = df_pp.dropna(subset=['host_id', 'host_name', 'nome'])
df_pp

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,ultima_review_timestamp
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,1.558408e+09
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,0,0.00,1,365,0.000000e+00
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,1.562296e+09
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,1.542596e+09
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,1.561172e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,0,0.00,2,9,0.000000e+00
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,0,0.00,2,36,0.000000e+00
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,0,0.00,1,27,0.000000e+00
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,0,0.00,6,2,0.000000e+00


In [6]:
df_pp = pd.get_dummies(df_pp, columns=['bairro_group'], prefix=['bairro_group_type'], dtype=int)
df_pp = pd.get_dummies(df_pp, columns=['room_type'], prefix=['room_type_type'], dtype=int)
df_pp

Unnamed: 0,id,nome,host_id,host_name,bairro,latitude,longitude,price,minimo_noites,numero_de_reviews,...,disponibilidade_365,ultima_review_timestamp,bairro_group_type_Bronx,bairro_group_type_Brooklyn,bairro_group_type_Manhattan,bairro_group_type_Queens,bairro_group_type_Staten Island,room_type_type_Entire home/apt,room_type_type_Private room,room_type_type_Shared room
0,2595,Skylit Midtown Castle,2845,Jennifer,Midtown,40.75362,-73.98377,225,1,45,...,355,1.558408e+09,0,0,1,0,0,1,0,0
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Harlem,40.80902,-73.94190,150,3,0,...,365,0.000000e+00,0,0,1,0,0,0,1,0
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Clinton Hill,40.68514,-73.95976,89,1,270,...,194,1.562296e+09,0,1,0,0,0,1,0,0
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,East Harlem,40.79851,-73.94399,80,10,9,...,0,1.542596e+09,0,0,1,0,0,1,0,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Murray Hill,40.74767,-73.97500,200,3,74,...,129,1.561172e+09,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Bedford-Stuyvesant,40.67853,-73.94995,70,2,0,...,9,0.000000e+00,0,1,0,0,0,0,1,0
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Bushwick,40.70184,-73.93317,40,4,0,...,36,0.000000e+00,0,1,0,0,0,0,1,0
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Harlem,40.81475,-73.94867,115,10,0,...,27,0.000000e+00,0,0,1,0,0,1,0,0
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Hell's Kitchen,40.75751,-73.99112,55,1,0,...,2,0.000000e+00,0,0,1,0,0,0,0,1


- Notes:
Considerando o objetivo dado de prever o preço de um alguel de nova york, um primeiro passo importante é criar a hipótese de colunas importantes para essa predição. Descrição e Correlação dos dados pode nos ajudar aqui:

In [8]:
### Análises
df_quantitativo = df_pp.select_dtypes(exclude=["object"])
df_quantitativo =df_quantitativo.astype('float')
mms = MinMaxScaler()
df_quantitativo.iloc[:, 2:] = mms.fit_transform(df_quantitativo.iloc[:, 2:])
df_quantitativo.iloc[:, 2:].corr()

Unnamed: 0,latitude,longitude,price,minimo_noites,numero_de_reviews,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,ultima_review_timestamp,bairro_group_type_Bronx,bairro_group_type_Brooklyn,bairro_group_type_Manhattan,bairro_group_type_Queens,bairro_group_type_Staten Island,room_type_type_Entire home/apt,room_type_type_Private room,room_type_type_Shared room
latitude,1.0,0.084808,0.033945,0.025885,-0.015208,-0.018719,0.019547,-0.010717,-0.030239,0.33052,-0.672879,0.590705,0.017147,-0.19097,-0.005604,0.004347,0.00417
longitude,0.084808,1.0,-0.149954,-0.062896,0.059148,0.138755,-0.114746,0.082774,0.048343,0.221304,0.017637,-0.433449,0.622767,-0.291936,-0.192071,0.183595,0.029639
price,0.033945,-0.149954,1.0,0.042804,-0.047949,-0.050586,0.05746,0.081821,-0.085723,-0.041024,-0.098546,0.16392,-0.080199,-0.013848,0.25578,-0.240164,-0.053639
minimo_noites,0.025885,-0.062896,0.042804,1.0,-0.081612,-0.127382,0.131313,0.145971,-0.114253,-0.018462,-0.039845,0.06796,-0.033095,-0.009558,0.075738,-0.074699,-0.004149
numero_de_reviews,-0.015208,0.059148,-0.047949,-0.081612,1.0,0.58922,-0.072408,0.171874,0.279248,0.009303,0.017413,-0.045857,0.035995,0.015096,-0.010231,0.01738,-0.0233
reviews_por_mes,-0.018719,0.138755,-0.050586,-0.127382,0.58922,1.0,-0.047366,0.16378,0.368092,0.036499,-0.021407,-0.063918,0.108003,0.026646,-0.02959,0.030183,-0.001646
calculado_host_listings_count,0.019547,-0.114746,0.05746,0.131313,-0.072408,-0.047366,1.0,0.225794,-0.116234,-0.0225,-0.12328,0.15291,-0.033913,-0.01285,0.112214,-0.108966,-0.011738
disponibilidade_365,-0.010717,0.082774,0.081821,0.145971,0.171874,0.16378,0.225794,1.0,0.04545,0.0607,-0.079835,-0.005305,0.0872,0.057904,-0.006973,-0.010748,0.058073
ultima_review_timestamp,-0.030239,0.048343,-0.085723,-0.114253,0.279248,0.368092,-0.116234,0.04545,1.0,0.005775,0.048235,-0.06139,0.015166,0.012102,0.014332,-0.007038,-0.024015
bairro_group_type_Bronx,0.33052,0.221304,-0.041024,-0.018462,0.009303,0.036499,-0.0225,0.0607,0.005775,1.0,-0.126169,-0.13465,-0.054676,-0.013243,-0.052172,0.043098,0.030217


In [9]:
df_pp.to_csv('df_pp.csv', index=False)


- quanto menor a longitude, maior o preço do aluguel
- ordem negativa de correlação (room_type_private_room [-0.24] ->longitude [-0.15] bairro_group_type_Brooklyn [-0.09] -> numero_de_reviews -> reviews_por_mes)
- ordem positiva de correlação (room_type_type_Entire home/apt [0.25] bairro_group_type_Manhattan [0.16] disponibilidade_365 [0.08] -> calculado_host_listings_count -> minimo_noites)


## Perguntas

### Pergunta a) Supondo que uma pessoa esteja pensando em investir em um apartamento para alugar na plataforma, onde seria mais indicada a compra? 
- Quer conforto
- Quer melhor custo e beneficio
- Quer passar 1 noite ou 7 noites?

In [165]:
px.histogram(df, 'bairro_group')

In [11]:
px.histogram(df, x="room_type")

In [None]:
px.histogram(df, 'bairro_group', color="room_type")

In [167]:
print("Distribuição de imóveis por grupos:")
print(df['bairro_group'].value_counts())
print("\nDistribuição de imóveis por tipos de quarto:")
print(df['room_type'].value_counts())


Distribuição de imóveis por grupos:
bairro_group
Manhattan        21661
Brooklyn         20103
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64

Distribuição de imóveis por tipos de quarto:
room_type
Entire home/apt    25409
Private room       22325
Shared room         1160
Name: count, dtype: int64


### Pergunta b) O número mínimo de noites e a disponibilidade ao longo do ano interferem no preço? 

In [ ]:
# número minimo de noites
# disponibilidade ao longo do ano

### Pergunta c) Existe algum padrão no texto do nome do local para lugares de mais alto valor?

R: bairro_group x price

In [16]:
# px.bar(df, x="bairro_group", y="price")
df['price_interval'] = pd.cut(df['price'], bins=10)
df 

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,price_interval
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,"(-10.0, 1000.0]"
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,"(-10.0, 1000.0]"
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,"(-10.0, 1000.0]"
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,"(-10.0, 1000.0]"
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,"(-10.0, 1000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,"(-10.0, 1000.0]"
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,"(-10.0, 1000.0]"
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,"(-10.0, 1000.0]"
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,"(-10.0, 1000.0]"


In [17]:
df.groupby('price_interval')['price'].count()





price_interval
(-10.0, 1000.0]      48655
(1000.0, 2000.0]       153
(2000.0, 3000.0]        41
(3000.0, 4000.0]        13
(4000.0, 5000.0]        12
(5000.0, 6000.0]         4
(6000.0, 7000.0]         5
(7000.0, 8000.0]         4
(8000.0, 9000.0]         1
(9000.0, 10000.0]        6
Name: price, dtype: int64

In [18]:
px.histogram(df, x="bairro_group", color="price_interval", nbins=10)

Quero retirar os grupos de bairro com preço entre 0 e 1000, pois essa faixa de preço existe para todos os bairros

In [19]:
df_copy = df.copy()
df_copy = df_copy[(df_copy['price'] > 1000) & (df_copy['price'] < 10000)]
df_copy

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,price_interval
495,174966,Luxury 2Bed/2.5Bath Central Park View,836168,Henry,Manhattan,Upper West Side,40.77350,-73.98697,Entire home/apt,2000,30,30,2018-05-05,0.33,11,0,"(1000.0, 2000.0]"
761,273190,6 Bedroom Landmark West Village Townhouse,605463,West Village,Manhattan,West Village,40.73301,-74.00268,Entire home/apt,1300,5,28,2018-09-25,0.31,4,297,"(1000.0, 2000.0]"
945,363673,Beautiful 3 bedroom in Manhattan,256239,Tracey,Manhattan,Upper West Side,40.80142,-73.96931,Private room,3000,7,0,,,1,365,"(2000.0, 3000.0]"
1104,468613,$ (Phone number hidden by Airbnb) weeks - room f,2325861,Cynthia,Manhattan,Lower East Side,40.72152,-73.99279,Private room,1300,1,0,,,1,0,"(1000.0, 2000.0]"
1479,664047,Lux 2Bed/2.5Bath Central Park Views,836168,Henry,Manhattan,Upper West Side,40.77516,-73.98573,Entire home/apt,2000,30,59,2016-01-28,0.71,11,364,"(1000.0, 2000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48079,36074198,Luxury apartment 2 min to times square,203565865,Vinícius,Manhattan,SoHo,40.72060,-74.00023,Entire home/apt,1308,2,0,,,1,179,"(1000.0, 2000.0]"
48303,36189195,Next to Times Square/Javits/MSG! Amazing 1BR!,270214015,Rogelio,Manhattan,Hell's Kitchen,40.75533,-73.99866,Entire home/apt,2999,30,0,,,1,222,"(2000.0, 3000.0]"
48304,36189257,2BR Near Museum Mile! Upper East Side!,272166348,Mary Rotsen,Manhattan,Upper East Side,40.78132,-73.95262,Entire home/apt,1999,30,0,,,1,270,"(1000.0, 2000.0]"
48522,36308562,"Tasteful & Trendy Brooklyn Brownstone, near Train",217732163,Sandy,Brooklyn,Bedford-Stuyvesant,40.68767,-73.95805,Entire home/apt,1369,1,0,,,1,349,"(1000.0, 2000.0]"


In [24]:
px.histogram(df_copy, x="bairro_group", color="price_interval", nbins=10)

- Manhathan e Brooklyn são os unicos bairros com preços entre 7k e 10k
- Queens, Staten Island e Bronx só possuem preços entre 0 e 5k
- Bronx possui apenas um spartamento entre 2k e 3k e o restante, entre 0 e 1k
- Sate Island possui apenas um apartamento entre 4 e 5k e o restante, entre 0 e 1k
- Queens possui apenas dois apartamentos entre 2k e 3k e o restante, entre 0 e 1k

Porém, a visualização não é muito clara quanto essas informações. Decidi criar um novo dataframe, em que cada coluna seja um bairro e cada linha um intervalo de preços:

In [40]:
manhattan_price_dist =df[df['bairro_group']== 'Manhattan']['price_interval'].value_counts()
brooklyn_price_dist =df[df['bairro_group']== 'Brooklyn']['price_interval'].value_counts()
queens_price_dist =df[df['bairro_group']== 'Queens']['price_interval'].value_counts()
staten_island_price_dist =df[df['bairro_group']== 'Staten Island']['price_interval'].value_counts()
bronx_price_dist =df[df['bairro_group']== 'Bronx']['price_interval'].value_counts()

In [42]:
df_price_dist = pd.DataFrame({
    'Manhattan': manhattan_price_dist,
    'Brooklyn': brooklyn_price_dist,
    'Queens': queens_price_dist,
    'Staten Island': staten_island_price_dist,
    'Bronx': bronx_price_dist
})
df_price_dist

Unnamed: 0_level_0,Manhattan,Brooklyn,Queens,Staten Island,Bronx
price_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-10.0, 1000.0]",21489,20049,5656,371,1090
"(1000.0, 2000.0]",110,35,7,1,0
"(2000.0, 3000.0]",31,7,2,0,1
"(3000.0, 4000.0]",12,1,0,0,0
"(4000.0, 5000.0]",4,7,0,1,0
"(5000.0, 6000.0]",4,0,0,0,0
"(6000.0, 7000.0]",4,1,0,0,0
"(7000.0, 8000.0]",2,2,0,0,0
"(8000.0, 9000.0]",1,0,0,0,0
"(9000.0, 10000.0]",4,1,1,0,0


Bem melhor né? Fica claro que os preços entre 0 e 1000 são os mais comuns. Portanto, faz sentido a média dos preços ser 160




In [ ]:
Como a maior concentração de apartamento esta no intervalo entre 0 e 1000, gostaria de dividir esse intervalo em intervalos menores

In [35]:
df_price_until_1000 = df[df['price'] <= 1000]
df_price_until_1000

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,price_interval
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,"(-10.0, 1000.0]"
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,"(-10.0, 1000.0]"
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,"(-10.0, 1000.0]"
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,"(-10.0, 1000.0]"
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,"(-10.0, 1000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,"(-10.0, 1000.0]"
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,"(-10.0, 1000.0]"
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,"(-10.0, 1000.0]"
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,"(-10.0, 1000.0]"


In [36]:
df_price_until_1000['price_interval'] = pd.cut(df_price_until_1000['price'], bins=10)
df_price_until_1000



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,price_interval
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,"(200.0, 300.0]"
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,"(100.0, 200.0]"
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,"(-1.0, 100.0]"
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,"(-1.0, 100.0]"
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,"(100.0, 200.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,"(-1.0, 100.0]"
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,"(-1.0, 100.0]"
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,"(100.0, 200.0]"
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,"(-1.0, 100.0]"


In [61]:
manhattan_price_dist_1000 =df_price_until_1000[df_price_until_1000['bairro_group']== 'Manhattan']['price_interval'].value_counts()
brooklyn_price_dist_1000 =df_price_until_1000[df_price_until_1000['bairro_group']== 'Brooklyn']['price_interval'].value_counts()
queens_price_dist_1000 =df_price_until_1000[df_price_until_1000['bairro_group']== 'Queens']['price_interval'].value_counts()
staten_island_price_dist_1000 =df_price_until_1000[df_price_until_1000['bairro_group']== 'Staten Island']['price_interval'].value_counts()
bronx_price_dist_1000 =df_price_until_1000[df_price_until_1000['bairro_group']== 'Bronx']['price_interval'].value_counts()

In [62]:
df_price_dist_1000 = pd.DataFrame({
    'Manhattan': manhattan_price_dist_1000,
    'Brooklyn': brooklyn_price_dist_1000,
    'Queens': queens_price_dist_1000,
    'Staten Island': staten_island_price_dist_1000,
    'Bronx': bronx_price_dist_1000
})
df_price_dist_1000

Unnamed: 0_level_0,Manhattan,Brooklyn,Queens,Staten Island,Bronx
price_interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-1.0, 100.0]",6921,11771,4101,270,865
"(100.0, 200.0]",8803,6277,1243,78,181
"(200.0, 300.0]",3485,1289,215,17,21
"(300.0, 400.0]",1157,374,53,0,10
"(400.0, 500.0]",522,163,25,2,7
"(500.0, 600.0]",185,72,7,0,2
"(600.0, 700.0]",147,34,4,2,2
"(700.0, 800.0]",129,36,2,1,1
"(800.0, 900.0]",56,8,3,0,0
"(900.0, 1000.0]",84,25,3,1,1


In [79]:
df_price_until_1000.groupby('bairro_group')['price'].agg(['min', 'mean', 'max', 'std', 'count'])

Unnamed: 0_level_0,min,mean,max,std,count
bairro_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,0,85.283486,1000,77.768424,1090
Brooklyn,0,117.806374,1000,94.40816,20049
Manhattan,0,178.936293,1000,133.893998,21489
Queens,10,95.00831,1000,74.527596,5656
Staten Island,13,98.584906,1000,96.138752,371
