In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Read the training and test sets from the CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
train_data.head(n=5)  # Preview the first five rows to sanity-check the load

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Keep the test-set Id column in its own variable; it is paired with the
# predictions when the submission file is built.
test_ids = test_data.loc[:, 'Id']

In [5]:
# 'Id' is a row identifier rather than a predictive feature, so it is kept out
# of the model inputs. Add any other unhelpful columns to this list.
columns_to_drop = ['Id']
target_column = 'SalePrice'

# Features: every column except the identifier(s) and the target.
X_train = train_data.drop(columns=columns_to_drop + [target_column])
# Target: the value the model is trained to predict.
y_train = train_data[target_column]

# Build the test features by selecting exactly the training feature columns.
# This leaves out 'Id' (and 'SalePrice', should the test file ever contain it),
# keeps the column order identical to X_train, and raises a KeyError right here
# if the test file is missing a feature the model is trained on.
X_test = test_data[X_train.columns]

In [6]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# 'number' matches every numeric dtype (not only int64/float64), so a numeric
# column stored with a different width still lands in the numeric list instead
# of ending up in neither list.
numeric_features = X_train.select_dtypes(include='number').columns
# Columns held as object dtype (text) are treated as categorical.
categorical_features = X_train.select_dtypes(include=['object']).columns

In [7]:
# Numeric preprocessing: a single step that fills missing values with the
# column mean.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

In [8]:
# Categorical preprocessing, applied in order:
#   1. imputer - fill missing values (NaN) with the most frequent value of the column
#   2. onehot  - one-hot encode the categories; handle_unknown='ignore' is set so
#                that a category not seen during fit does not raise at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [9]:
# Route each group of columns to its matching preprocessing pipeline:
#   'num' -> numeric_transformer on the numeric columns
#   'cat' -> categorical_transformer on the categorical columns
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [17]:
#preprocessor - Use transformers on the data

In [10]:
# Full model: the preprocessing step followed by a gradient-boosting regressor.
# The regressor's random_state is pinned to a fixed value.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=123)),
])

In [11]:
pipeline.fit(X=X_train, y=y_train)  # Train the preprocessing + regressor pipeline on the training features and target

In [12]:
# `pipeline` was already fitted by the `pipeline.fit(...)` cell above, and
# `Pipeline.fit` returns the pipeline object itself, so bind the fitted pipeline
# directly instead of training the same model a second time.
final_model = pipeline

In [15]:
# Predict the target for every row of the test features with the fitted model.
predictions = final_model.predict(X=X_test)

In [16]:
# Display the predicted values as the cell output.
predictions

array([120446.65319613, 164150.32087369, 176708.07696535, ...,
       156808.02392934, 123329.21648301, 235784.1615384 ])

In [None]:
# Submission table: each test Id paired with the model's predicted SalePrice.
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions,
})
# Write the file without the DataFrame index so it contains only the two columns.
submission.to_csv('submission.csv', index=False)
# Show a rich preview of the first rows instead of printing the whole frame.
submission.head()