<a href="https://colab.research.google.com/github/coding-dojo-data-science/example-kaggle-lesson/blob/main/SOLUTIONS_4_18_Regression_Kaggle_Competition_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kaggle Competition

## Regression: [Housing Prices Regression](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

# Custom Functions

In [2]:
def explore_data(df):
  """display .info(), num duplicates, num missing values, categories in 
  categorical columns, and descriptive statistics of numeric columns
  """
  
  # Explore Columns
  print('Columns:')
  display(df.info())
  print('\n')

  # Find duplicates
  print(f'{df.duplicated().sum()} Duplicates Found \n')

  # Check for missing values
  print('Missing Values')
  display(df.isna().sum())
  print('\n')

  # Examine categories
  for col in df.select_dtypes(include='object').columns:
    print(col)
    display(df[col].value_counts())
    print('\n')

  # Show Summary Statistics
  display(df.describe(include='all'))

def split_data(df, target, random_state=42):
  """Split df features and target and perform train/test split.
  returns 4 values: X_train, X_test, y_train, y_test.
  optional argument: random_state
  """

  # Split columns
  X = df.drop(target, axis=1)
  y = df[target]

  # Return train/test split (4 values)
  return train_test_split(X, y, random_state=random_state)

def evaluate_regression(model, X, y, index=[0]):
  """Evaluates a model on a given set of features and target.
  model should be already fitted.
  Returns a dataframe with metrics as columns and one row
  optional argument: index value for the row.  default: 0
  """

  preds = model.predict(X)
  scores = {'R2': r2_score(y, preds),
            'MAE': mean_absolute_error(y, preds),
            'MSE': mean_squared_error(y, preds),
            'RMSE': np.sqrt(mean_squared_error(y, preds))}
  scores = pd.DataFrame(scores, index=index)
  
  return scores

def try_regmodel(model, X_train, X_test, y_train, y_test, param_grid=None):
  """fits and evaluates a model.  Returns a fitted model and a dataframe of scores
  for both train and test datasets.
  If a param_grid is provided, will fit and return a GridSearchCV object 
  using that grid.
  """

  if param_grid:
    model = GridSearchCV(model, param_grid)
  model.fit(X_train, y_train)

  train_scores = evaluate_regression(model, X_train, y_train, index=['Train'])
  test_scores = evaluate_regression(model, X_test, y_test, index=['Test'])

  scores = pd.concat([train_scores, test_scores], axis=0)
  return model, scores

# Import Data

In [3]:
train_path = '/content/train.csv'
train_df = pd.read_csv(train_path)

train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Lower case columns for ease
train_df.columns = train_df.columns.str.lower()

# EDA and Cleaning

In [5]:
explore_data(train_df)

Columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   int64  
 18 

None



0 Duplicates Found 

Missing Values


id                 0
mssubclass         0
mszoning           0
lotfrontage      259
lotarea            0
                ... 
mosold             0
yrsold             0
saletype           0
salecondition      0
saleprice          0
Length: 81, dtype: int64



mszoning


RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: mszoning, dtype: int64



street


Pave    1454
Grvl       6
Name: street, dtype: int64



alley


Grvl    50
Pave    41
Name: alley, dtype: int64



lotshape


Reg    925
IR1    484
IR2     41
IR3     10
Name: lotshape, dtype: int64



landcontour


Lvl    1311
Bnk      63
HLS      50
Low      36
Name: landcontour, dtype: int64



utilities


AllPub    1459
NoSeWa       1
Name: utilities, dtype: int64



lotconfig


Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: lotconfig, dtype: int64



landslope


Gtl    1382
Mod      65
Sev      13
Name: landslope, dtype: int64



neighborhood


NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
MeadowV     17
Blmngtn     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: neighborhood, dtype: int64



condition1


Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: condition1, dtype: int64



condition2


Norm      1445
Feedr        6
Artery       2
RRNn         2
PosN         2
PosA         1
RRAn         1
RRAe         1
Name: condition2, dtype: int64



bldgtype


1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: bldgtype, dtype: int64



housestyle


1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: housestyle, dtype: int64



roofstyle


Gable      1141
Hip         286
Flat         13
Gambrel      11
Mansard       7
Shed          2
Name: roofstyle, dtype: int64



roofmatl


CompShg    1434
Tar&Grv      11
WdShngl       6
WdShake       5
Metal         1
Membran       1
Roll          1
ClyTile       1
Name: roofmatl, dtype: int64



exterior1st


VinylSd    515
HdBoard    222
MetalSd    220
Wd Sdng    206
Plywood    108
CemntBd     61
BrkFace     50
WdShing     26
Stucco      25
AsbShng     20
BrkComm      2
Stone        2
AsphShn      1
ImStucc      1
CBlock       1
Name: exterior1st, dtype: int64



exterior2nd


VinylSd    504
MetalSd    214
HdBoard    207
Wd Sdng    197
Plywood    142
CmentBd     60
Wd Shng     38
Stucco      26
BrkFace     25
AsbShng     20
ImStucc     10
Brk Cmn      7
Stone        5
AsphShn      3
Other        1
CBlock       1
Name: exterior2nd, dtype: int64



masvnrtype


None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: masvnrtype, dtype: int64



exterqual


TA    906
Gd    488
Ex     52
Fa     14
Name: exterqual, dtype: int64



extercond


TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: extercond, dtype: int64



foundation


PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: foundation, dtype: int64



bsmtqual


TA    649
Gd    618
Ex    121
Fa     35
Name: bsmtqual, dtype: int64



bsmtcond


TA    1311
Gd      65
Fa      45
Po       2
Name: bsmtcond, dtype: int64



bsmtexposure


No    953
Av    221
Gd    134
Mn    114
Name: bsmtexposure, dtype: int64



bsmtfintype1


Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: bsmtfintype1, dtype: int64



bsmtfintype2


Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: bsmtfintype2, dtype: int64



heating


GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: heating, dtype: int64



heatingqc


Ex    741
TA    428
Gd    241
Fa     49
Po      1
Name: heatingqc, dtype: int64



centralair


Y    1365
N      95
Name: centralair, dtype: int64



electrical


SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: electrical, dtype: int64



kitchenqual


TA    735
Gd    586
Ex    100
Fa     39
Name: kitchenqual, dtype: int64



functional


Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Name: functional, dtype: int64



fireplacequ


Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: fireplacequ, dtype: int64



garagetype


Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: garagetype, dtype: int64



garagefinish


Unf    605
RFn    422
Fin    352
Name: garagefinish, dtype: int64



garagequal


TA    1311
Fa      48
Gd      14
Ex       3
Po       3
Name: garagequal, dtype: int64



garagecond


TA    1326
Fa      35
Gd       9
Po       7
Ex       2
Name: garagecond, dtype: int64



paveddrive


Y    1340
N      90
P      30
Name: paveddrive, dtype: int64



poolqc


Gd    3
Ex    2
Fa    2
Name: poolqc, dtype: int64



fence


MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: fence, dtype: int64



miscfeature


Shed    49
Gar2     2
Othr     2
TenC     1
Name: miscfeature, dtype: int64



saletype


WD       1267
New       122
COD        43
ConLD       9
ConLI       5
ConLw       5
CWD         4
Oth         3
Con         2
Name: saletype, dtype: int64



salecondition


Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: salecondition, dtype: int64





Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
count,1460.0,1460.0,1460,1201.0,1460.0,1460,91,1460,1460,1460,...,1460.0,7,281,54,1460.0,1460.0,1460.0,1460,1460,1460.0
unique,,,5,,,2,2,4,4,2,...,,3,4,4,,,,9,6,
top,,,RL,,,Pave,Grvl,Reg,Lvl,AllPub,...,,Gd,MnPrv,Shed,,,,WD,Normal,
freq,,,1151,,,1454,50,925,1311,1459,...,,3,157,49,,,,1267,1198,
mean,730.5,56.89726,,70.049958,10516.828082,,,,,,...,2.758904,,,,43.489041,6.321918,2007.815753,,,180921.19589
std,421.610009,42.300571,,24.284752,9981.264932,,,,,,...,40.177307,,,,496.123024,2.703626,1.328095,,,79442.502883
min,1.0,20.0,,21.0,1300.0,,,,,,...,0.0,,,,0.0,1.0,2006.0,,,34900.0
25%,365.75,20.0,,59.0,7553.5,,,,,,...,0.0,,,,0.0,5.0,2007.0,,,129975.0
50%,730.5,50.0,,69.0,9478.5,,,,,,...,0.0,,,,0.0,6.0,2008.0,,,163000.0
75%,1095.25,70.0,,80.0,11601.5,,,,,,...,0.0,,,,0.0,8.0,2009.0,,,214000.0


In [6]:
# Drop columns missing more than 40% of data
na_thresh = .6
train_df = train_df.dropna(axis=1, thresh=train_df.shape[0] * na_thresh)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   lotshape       1460 non-null   object 
 7   landcontour    1460 non-null   object 
 8   utilities      1460 non-null   object 
 9   lotconfig      1460 non-null   object 
 10  landslope      1460 non-null   object 
 11  neighborhood   1460 non-null   object 
 12  condition1     1460 non-null   object 
 13  condition2     1460 non-null   object 
 14  bldgtype       1460 non-null   object 
 15  housestyle     1460 non-null   object 
 16  overallqual    1460 non-null   int64  
 17  overallcond    1460 non-null   int64  
 18  yearbuil

In [7]:
# drop columns unlikely to be helpful in prediction
drop_cols = ['id','street','utilities','condition2','roofmatl']
train_df = train_df.drop(columns=drop_cols)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 71 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mssubclass     1460 non-null   int64  
 1   mszoning       1460 non-null   object 
 2   lotfrontage    1201 non-null   float64
 3   lotarea        1460 non-null   int64  
 4   lotshape       1460 non-null   object 
 5   landcontour    1460 non-null   object 
 6   lotconfig      1460 non-null   object 
 7   landslope      1460 non-null   object 
 8   neighborhood   1460 non-null   object 
 9   condition1     1460 non-null   object 
 10  bldgtype       1460 non-null   object 
 11  housestyle     1460 non-null   object 
 12  overallqual    1460 non-null   int64  
 13  overallcond    1460 non-null   int64  
 14  yearbuilt      1460 non-null   int64  
 15  yearremodadd   1460 non-null   int64  
 16  roofstyle      1460 non-null   object 
 17  exterior1st    1460 non-null   object 
 18  exterior

# Modeling

In [8]:
X_train, X_test, y_train, y_test = split_data(train_df, 'saleprice')
X_train.shape

(1095, 70)

## Preprocessor

In [9]:
cat_sel = make_column_selector(dtype_include='object')
num_sel = make_column_selector(dtype_include='number')

scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
missing_imputer = SimpleImputer(strategy='constant', fill_value='missing')
mean_imputer = SimpleImputer(strategy='mean')

In [10]:
cat_pipe = make_pipeline(missing_imputer, ohe)
num_pipe = make_pipeline(mean_imputer, scaler)

preprocessor = make_column_transformer((cat_pipe, cat_sel),
                                       (num_pipe, num_sel))

np.isnan(preprocessor.fit_transform(X_train)).sum()

0

## Model 1: Linear Regression

In [11]:
lin_reg_pipe = make_pipeline(preprocessor, LinearRegression())

data = (X_train, X_test, y_train, y_test)
lin_model, linreg_scores = try_regmodel(lin_reg_pipe, X_train, X_test, y_train, y_test)
display(linreg_scores)

Unnamed: 0,R2,MAE,MSE,RMSE
Train,0.9003669,15684.56,604917000.0,24595.06
Test,-1.279652e+21,178641400000000.0,8.964359999999999e+30,2994054000000000.0


### Turning Linear Regression: L1 (Lasso)

In [12]:
lasso_pipe = make_pipeline(preprocessor, Lasso())

lasso_params = {'lasso__alpha':[.01, .1, 1, 10, 100]}

lasso, lasso_scores = try_regmodel(lasso_pipe, X_train, X_test, y_train, y_test, param_grid = lasso_params)

display(lasso_scores)
lasso.best_estimator_.get_params()['lasso']

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

Unnamed: 0,R2,MAE,MSE,RMSE
Train,0.888363,16000.281071,677799500.0,26034.582433
Test,0.877497,18596.726221,858170200.0,29294.542098


Lasso(alpha=100)

# Tuning Linear Regression: L2 (Ridge)

In [13]:
ridge_pipe = make_pipeline(preprocessor, Ridge())

ridge_params = {'ridge__alpha':[.01, .1, 1, 10, 100]}

ridge, ridge_scores = try_regmodel(ridge_pipe, X_train, X_test, y_train, y_test, param_grid = ridge_params)

display(ridge_scores)
ridge.best_estimator_.get_params()['ridge']

Unnamed: 0,R2,MAE,MSE,RMSE
Train,0.862076,16589.954914,837398900.0,28937.845676
Test,0.866679,18266.965693,933951500.0,30560.619361


Ridge(alpha=100)

## Model 2: XGBoost

In [14]:
%%time
from xgboost import XGBRegressor

xgb_pipe = make_pipeline(preprocessor, XGBRegressor(n_jobs=-1,
                                                    objective='reg:squarederror'))

xgb, xgb_score = try_regmodel(xgb_pipe, X_train, X_test, y_train, y_test)

display(xgb_score)


Unnamed: 0,R2,MAE,MSE,RMSE
Train,0.966468,10164.818415,203586600.0,14268.377636
Test,0.905213,16280.972389,664011900.0,25768.428531


CPU times: user 3.17 s, sys: 13.4 ms, total: 3.19 s
Wall time: 3.45 s


In [15]:
xgb_params = {'xgbregressor__max_depth': range(1, 10)
              }

xgb_gs, xgb_score = try_regmodel(xgb_pipe, X_train, X_test, y_train, y_test, param_grid=xgb_params)

display(xgb_score)
xgb_gs.best_estimator_.named_steps['xgbregressor'].get_params()

Unnamed: 0,R2,MAE,MSE,RMSE
Train,0.994834,4117.699704,31362340.0,5600.208833
Test,0.912755,15638.016995,611175300.0,24721.959829


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'importance_type': 'gain',
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'reg:squarederror',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

# Prepare Data for Submission

## Load the Submission Test File

In [16]:
test_path = '/content/test.csv'
test_df = pd.read_csv(test_path)

# lower case columns

test_df.columns = test_df.columns.str.lower()
test_df.columns

Index(['id', 'mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street',
       'alley', 'lotshape', 'landcontour', 'utilities', 'lotconfig',
       'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype',
       'housestyle', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd',
       'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype',
       'masvnrarea', 'exterqual', 'extercond', 'foundation', 'bsmtqual',
       'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1',
       'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating',
       'heatingqc', 'centralair', 'electrical', '1stflrsf', '2ndflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
       'totrmsabvgrd', 'functional', 'fireplaces', 'fireplacequ', 'garagetype',
       'garageyrblt', 'garagefinish', 'garagecars', 'garagearea', 'garagequal',
       'garagecond', 'paveddrive

In [17]:
# Isolate test features
test_features = test_df[X_train.columns]
test_features.head()

Unnamed: 0,mssubclass,mszoning,lotfrontage,lotarea,lotshape,landcontour,lotconfig,landslope,neighborhood,condition1,...,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold,saletype,salecondition
0,20,RH,80.0,11622,Reg,Lvl,Inside,Gtl,NAmes,Feedr,...,0,0,0,120,0,0,6,2010,WD,Normal
1,20,RL,81.0,14267,IR1,Lvl,Corner,Gtl,NAmes,Norm,...,36,0,0,0,0,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,IR1,Lvl,Inside,Gtl,Gilbert,Norm,...,34,0,0,0,0,0,3,2010,WD,Normal
3,60,RL,78.0,9978,IR1,Lvl,Inside,Gtl,Gilbert,Norm,...,36,0,0,0,0,0,6,2010,WD,Normal
4,120,RL,43.0,5005,IR1,HLS,Inside,Gtl,StoneBr,Norm,...,82,0,0,144,0,0,1,2010,WD,Normal


In [18]:
# Create submission df
submission_df = test_df[['id']]
submission_df.head()

Unnamed: 0,id
0,1461
1,1462
2,1463
3,1464
4,1465


## Process and predict sale price features using the best model


In [19]:
# Predict SalePrice
submission_preds = xgb.predict(test_features)
submission_df['SalePrice'] = submission_preds
submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,SalePrice
0,1461,125465.570312
1,1462,158691.75
2,1463,170905.09375
3,1464,184529.140625
4,1465,200474.09375


# Compare my submission df with sample submission df

In [20]:
# Examine sample submission
sample = pd.read_csv('/content/sample_submission.csv')
sample.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


## Fix the column names

In [21]:
submission_df.columns=['Id','SalePrice']
submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,125465.570312
1,1462,158691.75
2,1463,170905.09375
3,1464,184529.140625
4,1465,200474.09375


# Save my prediction submission to upload to Kaggle

In [22]:
submission_df.to_csv('/content/submission.csv')