# Analysis Report V
Handing missing values

## Loading dataset

In [1]:
# libs
import pandas as pd

In [2]:
df_residential = pd.read_csv('datasets/residential.csv', sep=';')
print(df_residential.shape)
df_residential.head()

(23060, 9)


Unnamed: 0,Tipo,Bairro,Quartos,Vagas,Suites,Area,Valor,Condominio,IPTU
0,Quitinete,Copacabana,1,0,0,40,1700.0,500.0,60.0
1,Casa,Jardim Botânico,2,0,1,100,7000.0,,
2,Apartamento,Centro,1,0,0,15,800.0,390.0,20.0
3,Apartamento,Higienópolis,1,0,0,48,800.0,230.0,
4,Apartamento,Vista Alegre,3,1,0,70,1200.0,,


## Exploring Data

In [3]:
# Checking columns and types
df_residential.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23060 entries, 0 to 23059
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Tipo        23060 non-null  object 
 1   Bairro      23060 non-null  object 
 2   Quartos     23060 non-null  int64  
 3   Vagas       23060 non-null  int64  
 4   Suites      23060 non-null  int64  
 5   Area        23060 non-null  int64  
 6   Valor       23051 non-null  float64
 7   Condominio  21200 non-null  float64
 8   IPTU        16195 non-null  float64
dtypes: float64(3), int64(4), object(2)
memory usage: 1.6+ MB


### Null Values

In [4]:
# checking null values
print(df_residential.isnull().sum())

Tipo             0
Bairro           0
Quartos          0
Vagas            0
Suites           0
Area             0
Valor            9
Condominio    1860
IPTU          6865
dtype: int64


In [5]:
# deleting entries where 'Valor' value is null
df_residential.dropna(subset=['Valor'], inplace=True)

df_residential

Unnamed: 0,Tipo,Bairro,Quartos,Vagas,Suites,Area,Valor,Condominio,IPTU
0,Quitinete,Copacabana,1,0,0,40,1700.0,500.0,60.0
1,Casa,Jardim Botânico,2,0,1,100,7000.0,,
2,Apartamento,Centro,1,0,0,15,800.0,390.0,20.0
3,Apartamento,Higienópolis,1,0,0,48,800.0,230.0,
4,Apartamento,Vista Alegre,3,1,0,70,1200.0,,
...,...,...,...,...,...,...,...,...,...
23055,Apartamento,Méier,2,0,0,70,900.0,490.0,48.0
23056,Quitinete,Centro,0,0,0,27,800.0,350.0,25.0
23057,Apartamento,Jacarepaguá,3,1,2,78,1800.0,800.0,40.0
23058,Apartamento,São Francisco Xavier,2,1,0,48,1400.0,509.0,37.0


In [6]:
# deleting entries where 'Tipo' is apartment but 'Condominio' is empty
df_residential = df_residential.loc[~(df_residential.Tipo == 'Apartmento') & (df_residential.Condominio.isnull())]

df_residential

Unnamed: 0,Tipo,Bairro,Quartos,Vagas,Suites,Area,Valor,Condominio,IPTU
1,Casa,Jardim Botânico,2,0,1,100,7000.0,,
4,Apartamento,Vista Alegre,3,1,0,70,1200.0,,
6,Casa de Condomínio,Barra da Tijuca,5,4,5,750,22000.0,,
7,Casa de Condomínio,Ramos,2,2,0,65,1000.0,,
8,Apartamento,Centro,1,0,0,36,1200.0,,
...,...,...,...,...,...,...,...,...,...
22990,Casa,Campo Grande,1,0,0,50,600.0,,
22996,Casa de Vila,Pavuna,2,0,0,55,600.0,,
23015,Casa,Recreio dos Bandeirantes,4,2,2,352,6500.0,,
23027,Casa,Bento Ribeiro,2,1,0,60,1400.0,,


In [7]:
# treating entries where 'IPTU' is null
df_residential = df_residential.fillna({'Condominio':0, 'IPTU':0})

df_residential

Unnamed: 0,Tipo,Bairro,Quartos,Vagas,Suites,Area,Valor,Condominio,IPTU
1,Casa,Jardim Botânico,2,0,1,100,7000.0,0.0,0.0
4,Apartamento,Vista Alegre,3,1,0,70,1200.0,0.0,0.0
6,Casa de Condomínio,Barra da Tijuca,5,4,5,750,22000.0,0.0,0.0
7,Casa de Condomínio,Ramos,2,2,0,65,1000.0,0.0,0.0
8,Apartamento,Centro,1,0,0,36,1200.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
22990,Casa,Campo Grande,1,0,0,50,600.0,0.0,0.0
22996,Casa de Vila,Pavuna,2,0,0,55,600.0,0.0,0.0
23015,Casa,Recreio dos Bandeirantes,4,2,2,352,6500.0,0.0,0.0
23027,Casa,Bento Ribeiro,2,1,0,60,1400.0,0.0,0.0


## Exporting Dataset

In [8]:
# saving to .csv
df_residential.to_csv('datasets/residential_prep.csv', sep=';', index=False)