# Basic Data Operations

In [None]:
!python -m pip install --upgrade pip
!pip uninstall -y numpy
!pip uninstall -y setuptools
!pip install setuptools
!pip install numpy==1.20.3

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("train.csv")
df.shape

(1460, 81)

### Getting basic information about the data (basic exploratory analysis)

In [2]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [3]:
# Getting the datatypes
df.dtypes.value_counts()

object     43
int64      35
float64     3
dtype: int64

In [4]:
# Getting certain types of columns
int_cols = df.loc[:, df.dtypes == np.int64].columns
int_cols

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [5]:
object_cols = df.loc[:, df.dtypes == np.object].columns
object_cols

  object_cols = df.loc[:, df.dtypes == np.object].columns


AttributeError: module 'numpy' has no attribute 'object'

In [6]:
float_cols = df.loc[:, df.dtypes == np.float64].columns
float_cols

Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')

### Exploring the values

In [7]:
df_objects = df[object_cols]
df_objects.head()

NameError: name 'object_cols' is not defined

In [8]:
df_objects.MSZoning.value_counts()

NameError: name 'df_objects' is not defined

In [9]:
df_objects.Alley.value_counts() # Wait a sec, it's only 91 rows because of the NaN's

NameError: name 'df_objects' is not defined

### Delete the rows with at least one NaN

In [11]:
df.Alley.isna() # Give true on NaN value

0       True
1       True
2       True
3       True
4       True
        ... 
1455    True
1456    True
1457    True
1458    True
1459    True
Name: Alley, Length: 1460, dtype: bool

In [12]:
any(df.Alley.isna())

True

In [13]:
all(df.Alley.isna())

False

In [14]:
any(df_objects.MSZoning.isna())

NameError: name 'df_objects' is not defined

In [10]:
df.dropna(axis=1, inplace=True)

NameError: name 'df_object' is not defined

In [15]:
# dropping rows based on nan in a certain column
df[df['Alley'].notna()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
21,22,45,RM,57.0,7449,Pave,Grvl,Reg,Bnk,AllPub,...,0,,GdPrv,,0,6,2007,WD,Normal,139400
30,31,70,C (all),50.0,8500,Pave,Pave,Reg,Lvl,AllPub,...,0,,MnPrv,,0,7,2008,WD,Normal,40000
56,57,160,FV,24.0,2645,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,8,2009,WD,Abnorml,172500
79,80,50,RM,60.0,10440,Pave,Grvl,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2009,WD,Normal,110000
87,88,160,FV,40.0,3951,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,6,2009,New,Partial,164500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,1405,50,RL,60.0,10410,Pave,Grvl,Reg,Lvl,AllPub,...,0,,MnPrv,,0,1,2006,WD,Family,105000
1414,1415,50,RL,64.0,13053,Pave,Pave,Reg,Bnk,AllPub,...,0,,,,0,6,2008,WD,Normal,207000
1427,1428,50,RL,60.0,10930,Pave,Grvl,Reg,Bnk,AllPub,...,0,,,,0,4,2008,WD,Normal,140000
1432,1433,30,RL,60.0,10800,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,64500


In [16]:
# dropping rows based on nan in certain columns is complicated
df[df[['Alley', 'Fence']].notna().apply(any, axis=1)]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
14,15,20,RL,,10920,Pave,,IR1,Lvl,AllPub,...,0,,GdWo,,0,5,2008,WD,Normal,157000
15,16,45,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,7,2007,WD,Normal,132000
19,20,20,RL,70.0,7560,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2009,COD,Abnorml,139000
21,22,45,RM,57.0,7449,Pave,Grvl,Reg,Bnk,AllPub,...,0,,GdPrv,,0,6,2007,WD,Normal,139400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438,1439,20,RM,90.0,7407,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,4,2010,WD,Normal,149700
1448,1449,50,RL,70.0,11767,Pave,,Reg,Lvl,AllPub,...,0,,GdWo,,0,5,2007,WD,Normal,112000
1454,1455,20,FV,62.0,7500,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,10,2009,WD,Normal,185000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000


In [None]:
df.columns.difference(object_cols) # selects only the numerical columns, and excludes the object columns

In [None]:
# Finding the columns with at least one nan
df[df.columns.difference(object_cols)].isna().apply(any)

In [17]:
any(df["GarageYrBlt"].isna())

True

In [21]:
# replace nan with the mean
df.loc[:, "GarageYrBlt"] = df.GarageYrBlt.mean()
any(df["GarageYrBlt"]).isna()

False

In [25]:
# replace with min
df.loc[df["GarageYrBlt"].isna(), "GarageYrBlt"] = df.GarageYrBlt.min()
df.loc[df["GarageYrBlt"].isna(), "GarageYrBlt"] = df.GarageYrBlt.max()
df.loc[df["GarageYrBlt"].isna(), "GarageYrBlt"] = df.GarageYrBlt.median()

### Data analysis

In [26]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [30]:
df.GarageType.describe()

count       1379
unique         6
top       Attchd
freq         870
Name: GarageType, dtype: object

In [29]:
df.GarageYrBlt.describe()

count    1.460000e+03
mean     1.978506e+03
std      6.823547e-13
min      1.978506e+03
25%      1.978506e+03
50%      1.978506e+03
75%      1.978506e+03
max      1.978506e+03
Name: GarageYrBlt, dtype: float64