## Data Cleaning Stage 2

Complete Data

Notes: 
 - Cleaned data is  stored in `./clean_data`

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read cleaned Data
dt = pd.read_csv("./clean_data/clean_dataComplete.csv")

In [4]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533609 entries, 0 to 533608
Data columns (total 43 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            533609 non-null  int64  
 1   CentralAir-conditioning               518307 non-null  float64
 2   Foundation                            526045 non-null  float64
 3   CategoryCode                          533609 non-null  int64  
 4   ClosingCostsinSalesPrice              533609 non-null  int64  
 5   Condominium                           533609 non-null  int64  
 6   Deck                                  512504 non-null  float64
 7   DesignofHouse                         532494 non-null  float64
 8   Division                              533609 non-null  int64  
 9   TypeofFinancing                       373489 non-null  float64
 10  FinishedBasement                      533609 non-null  int64  
 11  

In [5]:
#Beginning with the file for 2017, houses that are built for rent will be included in the 
#"1-Built for Sale/Sold" category rather than being shown separately.#

#Joining categories 1 and 4 together as per description
dt.loc[dt['CategoryCode']== 4, 'CategoryCode'] = 1

In [6]:
# Choosing the houses of category = houses sold
dfs=dt[dt['CategoryCode']==1]

In [7]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392286 entries, 0 to 533530
Data columns (total 43 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            392286 non-null  int64  
 1   CentralAir-conditioning               384576 non-null  float64
 2   Foundation                            388165 non-null  float64
 3   CategoryCode                          392286 non-null  int64  
 4   ClosingCostsinSalesPrice              392286 non-null  int64  
 5   Condominium                           392286 non-null  int64  
 6   Deck                                  380805 non-null  float64
 7   DesignofHouse                         391562 non-null  float64
 8   Division                              392286 non-null  int64  
 9   TypeofFinancing                       262740 non-null  float64
 10  FinishedBasement                      392286 non-null  int64  
 11  

In [8]:
#Dropping unecessary data 
dfs=dfs.drop(['CategoryCode','StartDate','CompletionDate','ContractPrice','SquareFootAreaoftheLotFlag',
              'SquareFootAreaofFinishedBasementFlag','SalesPriceFlag','ContractPriceFlag','LotValueFlag',
              'SquareFootAreaofHouseFlag','PermitAuthorizationDate','ClosingCostsinSalesPrice'], axis=1)

In [9]:
# Dropping data samples having null sale price (Target)
dfs = dfs.dropna(subset = {'SalesPrice'})
dfs=dfs.reset_index(drop=True)

## Nulls, Zeros, categories

In [10]:
# Checking nulls
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279612 entries, 0 to 279611
Data columns (total 31 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Unnamed: 0                        279612 non-null  int64  
 1   CentralAir-conditioning           277773 non-null  float64
 2   Foundation                        278958 non-null  float64
 3   Condominium                       279612 non-null  int64  
 4   Deck                              276964 non-null  float64
 5   DesignofHouse                     279555 non-null  float64
 6   Division                          279612 non-null  int64  
 7   TypeofFinancing                   247556 non-null  float64
 8   FinishedBasement                  279612 non-null  int64  
 9   ParkingFacility                   279612 non-null  int64  
 10  PrimarySpaceHeatingSystem         277959 non-null  float64
 11  MetropolitanArea                  279612 non-null  i

In [11]:
dfs.isnull().sum()

Unnamed: 0                               0
CentralAir-conditioning               1839
Foundation                             654
Condominium                              0
Deck                                  2648
DesignofHouse                           57
Division                                 0
TypeofFinancing                      32056
FinishedBasement                         0
ParkingFacility                          0
PrimarySpaceHeatingSystem             1653
MetropolitanArea                         0
ConstructionMethod                     456
Patio                                 2704
Porch                                 2143
Stories                                652
PrimaryExteriorWallMaterial           1598
SecondaryExteriorWallMaterial        46279
SecondaryExteriorWallMaterial.1       3051
Bedrooms                              1404
Fireplace                           114887
FullBathrooms                            0
HalfBathrooms                       128808
SaleDate   

Condominium variable

In [12]:
# Check condominium 
dfs['Condominium'].value_counts()
# Condominium applies only to houses sold ==> All zeroes are nulls

2    259334
1     20027
0       251
Name: Condominium, dtype: int64

In [13]:
# Changing zeroes in cond. to nan
dfs.loc[dfs['Condominium']==0, 'Condominium'] = dfs['Condominium'].replace(0,np.nan)

Parking Facility Variable

In [14]:
# Zero indicates missing data
dfs['ParkingFacility'].value_counts()

2    187904
3     60340
1     17500
4     12857
0      1011
Name: ParkingFacility, dtype: int64

In [15]:
# Changing zeroes in ParkFAC to nan
dfs.loc[dfs['ParkingFacility']==0, 'ParkingFacility'] = dfs['ParkingFacility'].replace(0,np.nan)

In [16]:
dfs.isnull().sum()

Unnamed: 0                               0
CentralAir-conditioning               1839
Foundation                             654
Condominium                            251
Deck                                  2648
DesignofHouse                           57
Division                                 0
TypeofFinancing                      32056
FinishedBasement                         0
ParkingFacility                       1011
PrimarySpaceHeatingSystem             1653
MetropolitanArea                         0
ConstructionMethod                     456
Patio                                 2704
Porch                                 2143
Stories                                652
PrimaryExteriorWallMaterial           1598
SecondaryExteriorWallMaterial        46279
SecondaryExteriorWallMaterial.1       3051
Bedrooms                              1404
Fireplace                           114887
FullBathrooms                            0
HalfBathrooms                       128808
SaleDate   

PrimarySpaceHeatingSystem

In [17]:
dfs['PrimarySpaceHeatingFuel'].value_counts()

2.0    187501
1.0     79366
3.0      6723
4.0      3347
5.0       919
Name: PrimarySpaceHeatingFuel, dtype: int64

In [18]:
# turning category 3 into 2 as per description
dfs.loc[dfs['PrimarySpaceHeatingFuel']==3, 'PrimarySpaceHeatingFuel'] = 2

In [19]:
dfs['PrimarySpaceHeatingFuel'].value_counts()

2.0    194224
1.0     79366
4.0      3347
5.0       919
Name: PrimarySpaceHeatingFuel, dtype: int64

In [20]:
#Adjust the possible values and categories
for i in range(4,6):
    dfs.loc[dfs['PrimarySpaceHeatingFuel']== i, 'PrimarySpaceHeatingFuel'] = i-1

SecondaryExteriorWallMaterial

In [21]:
# Check values for secondary exteriod wall material
dfs ['SecondaryExteriorWallMaterial'].value_counts().sort_index()

1.0      6802
2.0     48058
3.0       270
4.0      9607
5.0     12223
6.0       697
7.0     41990
8.0     17317
9.0       816
10.0    95553
Name: SecondaryExteriorWallMaterial, dtype: int64

In [22]:
##changing the NaN for SWEM into 0 for SWEM.1 =2
dfs.loc[dfs['SecondaryExteriorWallMaterial.1']==2, 'SecondaryExteriorWallMaterial'] = dfs['SecondaryExteriorWallMaterial'].replace(np.NaN,0)

In [23]:
dfs.isnull().sum()

Unnamed: 0                               0
CentralAir-conditioning               1839
Foundation                             654
Condominium                            251
Deck                                  2648
DesignofHouse                           57
Division                                 0
TypeofFinancing                      32056
FinishedBasement                         0
ParkingFacility                       1011
PrimarySpaceHeatingSystem             1653
MetropolitanArea                         0
ConstructionMethod                     456
Patio                                 2704
Porch                                 2143
Stories                                652
PrimaryExteriorWallMaterial           1598
SecondaryExteriorWallMaterial         3151
SecondaryExteriorWallMaterial.1       3051
Bedrooms                              1404
Fireplace                           114887
FullBathrooms                            0
HalfBathrooms                       128808
SaleDate   

In [24]:
# Joining 10 category with 0 as per description
dfs.loc[dfs['SecondaryExteriorWallMaterial']== 10, 'SecondaryExteriorWallMaterial'] = 0

In [25]:
dfs ['SecondaryExteriorWallMaterial'].value_counts().sort_index()

0.0    138681
1.0      6802
2.0     48058
3.0       270
4.0      9607
5.0     12223
6.0       697
7.0     41990
8.0     17317
9.0       816
Name: SecondaryExteriorWallMaterial, dtype: int64

Stories Variable

In [26]:
dfs['Stories'].value_counts()

2.0    167172
1.0    107965
3.0      3823
Name: Stories, dtype: int64

In [27]:
# Turning categories 3 to 2 as per description
dfs.loc[dfs['Stories']== 3, 'Stories'] = 2

primary exterial wall variable

In [28]:
dfs['PrimaryExteriorWallMaterial'].value_counts()

5.0    89128
4.0    68529
2.0    60308
8.0    29458
1.0    16627
6.0     8196
7.0     2506
9.0     1817
3.0     1445
Name: PrimaryExteriorWallMaterial, dtype: int64

In [29]:
# Turning categories 3 to 9 as per description
dfs.loc[dfs['PrimaryExteriorWallMaterial']== 3, 'PrimaryExteriorWallMaterial'] = 9

In [30]:
dfs['PrimaryExteriorWallMaterial'].value_counts().sort_index()

1.0    16627
2.0    60308
4.0    68529
5.0    89128
6.0     8196
7.0     2506
8.0    29458
9.0     3262
Name: PrimaryExteriorWallMaterial, dtype: int64

In [31]:
#Adjust the possible values and categories
for i in range(4,10):
    dfs.loc[dfs['PrimaryExteriorWallMaterial']== i, 'PrimaryExteriorWallMaterial'] = i-1

In [32]:
dfs['PrimaryExteriorWallMaterial'].value_counts().sort_index()

1.0    16627
2.0    60308
3.0    68529
4.0    89128
5.0     8196
6.0     2506
7.0    29458
8.0     3262
Name: PrimaryExteriorWallMaterial, dtype: int64

Fireplace Variable

In [33]:
# Turning Nan into 0 as per description
dfs.loc[dfs['Fireplace'].isnull(), 'Fireplace'] = dfs['Fireplace'].replace(np.nan,0)

In [34]:
# Turning 9 into nan as per description
dfs.loc[dfs['Fireplace']==9, 'Fireplace'] = dfs['Fireplace'].replace(9, np.nan)

In [35]:
dfs['Fireplace'].value_counts().sort_index()

0.0    114887
1.0    147415
2.0     14035
Name: Fireplace, dtype: int64

Half and Full Bathroom

In [36]:
# Turning 9 into nan as per description
dfs.loc[dfs['FullBathrooms']==9, 'FullBathrooms'] = dfs['FullBathrooms'].replace(9, np.nan)

In [37]:
# Turning nan into 0 as per description
dfs.loc[dfs['HalfBathrooms'].isnull(), 'HalfBathrooms'] = dfs['HalfBathrooms'].replace(np.nan,0)

In [38]:
# Turning 9 into nan as per description
dfs.loc[dfs['HalfBathrooms']==9, 'HalfBathrooms'] = dfs['HalfBathrooms'].replace(9,np.nan)

In [39]:
dfs.isnull().sum()

Unnamed: 0                               0
CentralAir-conditioning               1839
Foundation                             654
Condominium                            251
Deck                                  2648
DesignofHouse                           57
Division                                 0
TypeofFinancing                      32056
FinishedBasement                         0
ParkingFacility                       1011
PrimarySpaceHeatingSystem             1653
MetropolitanArea                         0
ConstructionMethod                     456
Patio                                 2704
Porch                                 2143
Stories                                652
PrimaryExteriorWallMaterial           1598
SecondaryExteriorWallMaterial         3151
SecondaryExteriorWallMaterial.1       3051
Bedrooms                              1404
Fireplace                             3275
FullBathrooms                         1380
HalfBathrooms                         1525
SaleDate   

Lot value

In [40]:
# Checking Lot Value
len(dfs[dfs['LotValue']==0])
# This is the number of houses not applicable and not reported
# LotValue is applicable to all houses sold ==> All zeros are NaN

85323

Finished basement, Foundation, Square foot area

In [41]:
# changing 0 to NaN in Lot Value
dfs.loc[dfs['LotValue']== 0, 'LotValue'] = dfs['LotValue'].replace(0,np.NaN)

In [42]:
#Finished Basment is 1 or 2 if Foundation == 1, otherwise it is a NaN
# Check values iN finished Bsmnt
dfs['FinishedBasement'].value_counts()

0    189556
2     69564
1     20492
Name: FinishedBasement, dtype: int64

In [43]:
##changing the 0 values for FN BS into NaN for foundation category 1, other 0s means not applicable
dfs.loc[dfs['Foundation']== 1, 'FinishedBasement'] = dfs['FinishedBasement'].replace(0,np.NaN)

In [44]:
# Check values in finished Bsmnt
dfs['FinishedBasement'].value_counts()

0.0    188675
2.0     69564
1.0     20492
Name: FinishedBasement, dtype: int64

In [45]:
dfs.isnull().sum()

Unnamed: 0                               0
CentralAir-conditioning               1839
Foundation                             654
Condominium                            251
Deck                                  2648
DesignofHouse                           57
Division                                 0
TypeofFinancing                      32056
FinishedBasement                       881
ParkingFacility                       1011
PrimarySpaceHeatingSystem             1653
MetropolitanArea                         0
ConstructionMethod                     456
Patio                                 2704
Porch                                 2143
Stories                                652
PrimaryExteriorWallMaterial           1598
SecondaryExteriorWallMaterial         3151
SecondaryExteriorWallMaterial.1       3051
Bedrooms                              1404
Fireplace                             3275
FullBathrooms                         1380
HalfBathrooms                         1525
SaleDate   

In [46]:
# Transferring nulls in SquareFootAreaofFinishedBasement into zeros for foundation != 1
dfs.loc[dfs['Foundation']!=1, 'SquareFootAreaofFinishedBasement'] = dfs['SquareFootAreaofFinishedBasement'].replace(np.NaN, 0)

In [47]:
# Transferring nulls in SquareFootAreaofFinishedBasement into zeroes for FinishedBasement==2
dfs.loc[dfs['FinishedBasement']==2, 'SquareFootAreaofFinishedBasement'] = dfs['SquareFootAreaofFinishedBasement'].replace(np.NaN, 0)

In [48]:
dfs.isnull().sum() ##Bingo!!!!

Unnamed: 0                              0
CentralAir-conditioning              1839
Foundation                            654
Condominium                           251
Deck                                 2648
DesignofHouse                          57
Division                                0
TypeofFinancing                     32056
FinishedBasement                      881
ParkingFacility                      1011
PrimarySpaceHeatingSystem            1653
MetropolitanArea                        0
ConstructionMethod                    456
Patio                                2704
Porch                                2143
Stories                               652
PrimaryExteriorWallMaterial          1598
SecondaryExteriorWallMaterial        3151
SecondaryExteriorWallMaterial.1      3051
Bedrooms                             1404
Fireplace                            3275
FullBathrooms                        1380
HalfBathrooms                        1525
SaleDate                          

In [49]:
# Cross-check : if squarefootareaoffinishedbasement is equal to a non-zero then it is definitely FinishedBasement Category 1
dfs.loc[dfs['SquareFootAreaofFinishedBasement']!=0, 'FinishedBasement'] = dfs['FinishedBasement'].replace(np.NaN, 1)

In [50]:
dfs.isnull().sum() ##Bingo!!!

Unnamed: 0                              0
CentralAir-conditioning              1839
Foundation                            654
Condominium                           251
Deck                                 2648
DesignofHouse                          57
Division                                0
TypeofFinancing                     32056
FinishedBasement                        0
ParkingFacility                      1011
PrimarySpaceHeatingSystem            1653
MetropolitanArea                        0
ConstructionMethod                    456
Patio                                2704
Porch                                2143
Stories                               652
PrimaryExteriorWallMaterial          1598
SecondaryExteriorWallMaterial        3151
SecondaryExteriorWallMaterial.1      3051
Bedrooms                             1404
Fireplace                            3275
FullBathrooms                        1380
HalfBathrooms                        1525
SaleDate                          

In [51]:
# Logically speaking if the  Foundation nulls are unknown we would never knwo whether 0 means not applicable or not reported, as such let them be nulls
dfs.loc[dfs['Foundation'].isnull(), 'FinishedBasement'] = dfs['FinishedBasement'].replace(0,np.nan)

In [52]:
dfs.isnull().sum()

Unnamed: 0                              0
CentralAir-conditioning              1839
Foundation                            654
Condominium                           251
Deck                                 2648
DesignofHouse                          57
Division                                0
TypeofFinancing                     32056
FinishedBasement                      654
ParkingFacility                      1011
PrimarySpaceHeatingSystem            1653
MetropolitanArea                        0
ConstructionMethod                    456
Patio                                2704
Porch                                2143
Stories                               652
PrimaryExteriorWallMaterial          1598
SecondaryExteriorWallMaterial        3151
SecondaryExteriorWallMaterial.1      3051
Bedrooms                             1404
Fireplace                            3275
FullBathrooms                        1380
HalfBathrooms                        1525
SaleDate                          

In [53]:
dfdrop = dfs.dropna()
dfdrop0=dfdrop.reset_index(drop=True)

In [54]:
dfdrop.info() # That's more than enough !!!

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158470 entries, 0 to 279610
Data columns (total 31 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Unnamed: 0                        158470 non-null  int64  
 1   CentralAir-conditioning           158470 non-null  float64
 2   Foundation                        158470 non-null  float64
 3   Condominium                       158470 non-null  float64
 4   Deck                              158470 non-null  float64
 5   DesignofHouse                     158470 non-null  float64
 6   Division                          158470 non-null  int64  
 7   TypeofFinancing                   158470 non-null  float64
 8   FinishedBasement                  158470 non-null  float64
 9   ParkingFacility                   158470 non-null  float64
 10  PrimarySpaceHeatingSystem         158470 non-null  float64
 11  MetropolitanArea                  158470 non-null  i

In [58]:
dfs.to_csv(r"./clean_data/cleanedDataPrice.csv")

In [59]:
dfdrop.to_csv(r"./clean_data/cleanedDataPriceDr.csv")

PermissionError: [Errno 13] Permission denied: './clean_data/cleanedDataPriceDr.csv'