In [20]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 9999
pd.options.display.max_rows = 5
mel_df = pd.read_csv('melb_data.csv')
from sklearn.model_selection import train_test_split

In [2]:
df = pd.DataFrame({'col1': [2,4,1,5,6], 
                  'col2': [3,np.NaN,4,np.NaN,6],
                    'col3': [3,6,4,6,2]})

In [3]:
from sklearn.impute import SimpleImputer

#Three ways to fix missing values:
#1.drop column
df1 = df.drop(['col2'], axis=1)
#2.fill NaN fields with average
df1 = df
df1['col2'] = df['col2'].fillna(df['col2'].mean())
#or
my_imputer = SimpleImputer()
df4 = pd.DataFrame(my_imputer.fit_transform(df)) #imputer puts mean values in NaN and removes column names
df4.columns = df.columns #put column names back in
df4
#3.(BEST) fill NaN fields with average and create column for where row used to have a NaN value
dfcopy = df
cols_with_missing = [col for col in dfcopy.columns
                     if dfcopy[col].isnull().any()] #create array of columns with any missing value
for col in cols_with_missing:
    dfcopy[col + '_was_missing'] = dfcopy[col].isnull() #create new columns 
my_imputer = SimpleImputer()
df5 = pd.DataFrame(my_imputer.fit_transform(df))
df5.columns = dfcopy.columns #put column names back in
df5

Unnamed: 0,col1,col2,col3
0,2.0,3.0,3.0
1,4.0,4.333333,6.0
2,1.0,4.0,4.0
3,5.0,4.333333,6.0
4,6.0,6.0,2.0


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Three ways to fix categorical data:

#1. drop column

#2. Label Encoding, best for Ordinal data - used for data that has ranking - convert to integers
#using LabelEncoder() might not be best, sometimes better to manually set integers in order that makes sense
df = pd.DataFrame({'col1': [2,4,1,5,6], 
                  'col2': ["morning","night","afternoon","morning","night"],
                    'col3': [3,6,4,6,2]})
df1 = df
label_encoder = LabelEncoder()
df1['col2'] = label_encoder.fit_transform(df1['col2'])
df1

#3. Create new column for each value of category column
df_train = pd.DataFrame({'col1': [2,4,1,5,6], 
                  'col2': ["red","blue","yellow","blue","red"],
                  'col3': [3,6,4,6,2],
                  'col4': ['steel','wood','steel','wood','steel']})
# Get list of categorical variables
s = (df_train.dtypes == 'object')
object_cols = list(s[s].index)

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_df = pd.DataFrame(OH_encoder.fit_transform(df_train[object_cols]))
#use OneHotEncoder.transform() on test data after just using OneHotEncoder.fit_transform() on train data
#OH_cols_df = pd.DataFrame(OH_encoder.transform(df_test[object_cols]))
OH_cols_df
# One-hot encoding removed index; put it back
OH_cols_df.index = df_train.index
# Remove categorical columns (will replace with one-hot encoding)
num_df_train = df_train.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_df_train = pd.concat([num_df_train, OH_cols_df], axis=1)
OH_df_train


Unnamed: 0,col1,col3,0,1,2,3,4
0,2,3,0.0,1.0,0.0,1.0,0.0
1,4,6,1.0,0.0,0.0,0.0,1.0
2,1,4,0.0,0.0,1.0,1.0,0.0
3,5,6,1.0,0.0,0.0,0.0,1.0
4,6,2,0.0,1.0,0.0,1.0,0.0


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [11]:
df_train = pd.DataFrame({'col1': [2,np.NaN,np.NaN,np.NaN,6], 
                  'col2': ["red","blue","yellow","blue","red"],
                  'col3': [3,6,4,6,2],
                  'col4': ['steel','wood','steel','wood','steel']})
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['col1']),
        ('cat', categorical_transformer, ['col2','col4'])
    ])
x = preprocessor.fit_transform(df_train)

x

array([[2., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [6., 0., 1., 0., 1., 0.]])

In [5]:
#use pipelines to shorthand preproccessing and modeling
mel_df = mel_df[mel_df['Price'].notna()]
mel_df.drop('Address', axis=1)
y = mel_df['Price']
x = mel_df.loc[:, mel_df.columns != 'Price']
categorical_cols = ['Suburb','Type','Method','SellerG','Date','CouncilArea','Regionname']
numerical_cols = ['Rooms','Distance','Postcode','Bedroom2','Bathroom','Car','Landsize','BuildingArea',
                  'YearBuilt','Longtitude','Lattitude','Propertycount']
x_train, x_valid, y_train, y_valid = train_test_split(x, y, random_state = 0)
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
# Preprocessing of training data, fit model 
my_pipeline.fit(x_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(x_valid)
preds

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0


In [21]:
mel_df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0
13579,Yarraville,6 Agnes St,4,h,1285000.0,SP,Village,26/08/2017,6.3,3013.0,4.0,1.0,1.0,362.0,112.0,1920.0,,-37.81188,144.88449,Western Metropolitan,6543.0


NameError: name 'mean_absolute_error' is not defined

In [29]:
preds

array([1627400. ,  857110. ,  617770. , ..., 1365232.5,  628900. ,
        980400. ])

In [16]:
mel_df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object