# Project 2

### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import metrics

### Reading the files

In [3]:
# Load the sample submission file and preview its first rows.
sample_sub_reg = pd.read_csv('sample_sub_reg.csv')
sample_sub_reg.head()

Unnamed: 0,Id,SalePrice
0,2,181479.1217
1,4,181479.1217
2,6,181479.1217
3,7,181479.1217
4,17,181479.1217


In [4]:
# Load train.csv into a DataFrame and preview its first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# Load test.csv into a DataFrame and preview its first rows.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


### Data types, null values?

In [6]:
# Inspect the dtype of each column in the sample submission frame.
sample_sub_reg.dtypes

Id             int64
SalePrice    float64
dtype: object

In [7]:
# Inspect the dtype of each training column; pandas abbreviates this long
# listing, so the loop in the following cell prints every column in full.
train.dtypes

Id                int64
PID               int64
MS SubClass       int64
MS Zoning        object
Lot Frontage    float64
                 ...   
Misc Val          int64
Mo Sold           int64
Yr Sold           int64
Sale Type        object
SalePrice         int64
Length: 81, dtype: object

In [12]:
# Print the dtype of every training column, one per line, so nothing is
# hidden by pandas' abbreviated display of long Series.
for column, column_dtype in train.dtypes.items():
    print(f'{column} type: {column_dtype}')

Id type: int64
PID type: int64
MS SubClass type: int64
MS Zoning type: object
Lot Frontage type: float64
Lot Area type: int64
Street type: object
Alley type: object
Lot Shape type: object
Land Contour type: object
Utilities type: object
Lot Config type: object
Land Slope type: object
Neighborhood type: object
Condition 1 type: object
Condition 2 type: object
Bldg Type type: object
House Style type: object
Overall Qual type: int64
Overall Cond type: int64
Year Built type: int64
Year Remod/Add type: int64
Roof Style type: object
Roof Matl type: object
Exterior 1st type: object
Exterior 2nd type: object
Mas Vnr Type type: object
Mas Vnr Area type: float64
Exter Qual type: object
Exter Cond type: object
Foundation type: object
Bsmt Qual type: object
Bsmt Cond type: object
Bsmt Exposure type: object
BsmtFin Type 1 type: object
BsmtFin SF 1 type: float64
BsmtFin Type 2 type: object
BsmtFin SF 2 type: float64
Bsmt Unf SF type: float64
Total Bsmt SF type: float64
Heating type: object
Heating Q

### Duplicates?

In [13]:
# Select every row that is an exact copy of another row; keep=False flags
# all members of each duplicate group, not just the repeats.
train.loc[train.duplicated(keep=False)]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice


Great! The result is empty, so the training data contains no fully duplicated rows.

In [14]:
# Count missing values per training column; pandas abbreviates this long
# listing, so the loop in the following cell prints every column in full.
train.isnull().sum()

Id                0
PID               0
MS SubClass       0
MS Zoning         0
Lot Frontage    330
               ... 
Misc Val          0
Mo Sold           0
Yr Sold           0
Sale Type         0
SalePrice         0
Length: 81, dtype: int64

In [15]:
# Print the number of missing values for every training column, one per
# line, so nothing is hidden by pandas' abbreviated display.
# NOTE(review): some columns (e.g. Alley) are mostly null — confirm against
# the data dictionary whether null means "feature absent" rather than unknown.
for column, null_count in train.isnull().sum().items():
    print(f'{column} nulls: {null_count}')

Id nulls: 0
PID nulls: 0
MS SubClass nulls: 0
MS Zoning nulls: 0
Lot Frontage nulls: 330
Lot Area nulls: 0
Street nulls: 0
Alley nulls: 1911
Lot Shape nulls: 0
Land Contour nulls: 0
Utilities nulls: 0
Lot Config nulls: 0
Land Slope nulls: 0
Neighborhood nulls: 0
Condition 1 nulls: 0
Condition 2 nulls: 0
Bldg Type nulls: 0
House Style nulls: 0
Overall Qual nulls: 0
Overall Cond nulls: 0
Year Built nulls: 0
Year Remod/Add nulls: 0
Roof Style nulls: 0
Roof Matl nulls: 0
Exterior 1st nulls: 0
Exterior 2nd nulls: 0
Mas Vnr Type nulls: 22
Mas Vnr Area nulls: 22
Exter Qual nulls: 0
Exter Cond nulls: 0
Foundation nulls: 0
Bsmt Qual nulls: 55
Bsmt Cond nulls: 55
Bsmt Exposure nulls: 58
BsmtFin Type 1 nulls: 55
BsmtFin SF 1 nulls: 1
BsmtFin Type 2 nulls: 56
BsmtFin SF 2 nulls: 1
Bsmt Unf SF nulls: 1
Total Bsmt SF nulls: 1
Heating nulls: 0
Heating QC nulls: 0
Central Air nulls: 0
Electrical nulls: 0
1st Flr SF nulls: 0
2nd Flr SF nulls: 0
Low Qual Fin SF nulls: 0
Gr Liv Area nulls: 0
Bsmt Full Ba