In [1]:
# basics
import numpy as np
import pandas as pd
import scipy.stats as stats

# graphing
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# preprocessing
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler

# model selection
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

from sklearn.metrics import (r2_score, mean_squared_error, accuracy_score, precision_score, recall_score, make_scorer,
                             f1_score, roc_auc_score, roc_curve, precision_recall_curve)

# models
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression

# enable reload of changed files
%load_ext autoreload
%autoreload 2

# plot inline
%matplotlib inline

In [2]:
# Load the training and hold-out tables, keyed by SalesID.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the truncated warning under this cell looks like a pandas
# DtypeWarning - confirm it disappears after this change.
train_df = pd.read_csv('data/Train.csv', index_col='SalesID', low_memory=False)
test_df = pd.read_csv('data/test.csv', index_col='SalesID', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Regression target: the sale price column of the training frame.
y = train_df['SalePrice']

In [4]:
# Build the two-column training matrix in a single chain: select the
# predictors, then replace missing meter readings with the column mean.
X = (
    train_df
    .loc[:, ['YearMade', 'MachineHoursCurrentMeter']]
    .assign(
        MachineHoursCurrentMeter=lambda frame: frame['MachineHoursCurrentMeter'].fillna(
            frame['MachineHoursCurrentMeter'].mean()
        )
    )
)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401125 entries, 1139246 to 6333342
Data columns (total 2 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   YearMade                  401125 non-null  int64  
 1   MachineHoursCurrentMeter  401125 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 9.2 MB


In [5]:
# Peek at the first two target values (the Series keeps the SalesID index).
y.head(2)

SalesID
1139246    66000
1139248    57000
Name: SalePrice, dtype: int64

In [6]:
# Peek at the first two feature rows after the missing-value fill.
X.head(2)

Unnamed: 0_level_0,YearMade,MachineHoursCurrentMeter
SalesID,Unnamed: 1_level_1,Unnamed: 2_level_1
1139246,2004,68.0
1139248,1996,4640.0


In [7]:
# NOTE: data/test.csv (loaded above as test_df) already serves as the hold-out set,
# so the training data is not split again here.

In [8]:
def rmsle(actual, predictions):
    """Root mean squared logarithmic error between two sets of values.

    Computes ``sqrt(mean((log(1 + predictions) - log(1 + actual)) ** 2))``.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predictions : array-like
        Predicted values, paired with ``actual`` by position.

    Returns
    -------
    numpy.float64
        The RMSLE; ``0.0`` when the inputs match exactly. Values at or
        below -1 have no finite ``log(1 + x)``, so they make the result
        ``inf`` or ``nan`` (NumPy emits a runtime warning).
    """
    # Coerce to float arrays so plain lists/tuples work as well as
    # ndarray / Series inputs, and so pairing is strictly positional.
    actual = np.asarray(actual, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    # log1p(x) == log(1 + x) but stays accurate when x is close to zero.
    log_diff = np.log1p(predictions) - np.log1p(actual)
    return np.sqrt(np.mean(log_diff ** 2))

In [9]:
# Wrap RMSLE as a scikit-learn scorer. It is an error metric, so
# greater_is_better=False makes scikit-learn negate it (higher is better).
score = make_scorer(
    score_func=rmsle,
    greater_is_better=False,
)

In [10]:
# 5-fold cross-validation of a plain linear regression. The parameter grid is
# empty, so the search evaluates exactly one candidate; best_score_ is the
# negated RMSLE produced by `score`.
model = LinearRegression()
parameters = {}
gs = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=score,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X, y)
(gs.best_score_, gs.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


(-0.7236173531546051, {})

In [11]:
# Fit the model used for prediction on the complete training set.
final = LinearRegression()
final = final.fit(X, y)  # fit() returns the estimator itself

In [12]:
# Take the same two predictors, in the same order, from the hold-out frame.
Xt = test_df.loc[
    :, ['YearMade', 'MachineHoursCurrentMeter']
]

In [13]:
# Fill missing meter readings with the TRAINING mean, not the test-set mean:
# the model was fitted on data imputed with the training statistic, so the
# hold-out rows must go through the same transformation.
# X was already filled with its own mean, which leaves that mean unchanged,
# so X's column mean is still the training mean.
Xt['MachineHoursCurrentMeter'] = Xt['MachineHoursCurrentMeter'].fillna(
    X['MachineHoursCurrentMeter'].mean()
)

In [14]:
# Preview the first rows of the hold-out features.
# NOTE(review): the preview shows YearMade == 1000 on some rows - presumably a
# placeholder for an unknown year; confirm and decide how to handle it before
# relying on the predictions.
Xt.head()

Unnamed: 0_level_0,YearMade,MachineHoursCurrentMeter
SalesID,Unnamed: 1_level_1,Unnamed: 2_level_1
1222837,1000,0.0
1222839,2006,4412.0
1222841,2000,10127.0
1222843,1000,4682.0
1222845,2002,8150.0


In [15]:
# Predict sale prices for the hold-out rows; the array is only displayed here,
# not stored in a variable.
final.predict(Xt)

array([19940.97230702, 32433.09593706, 32437.19241221, ...,
       32435.38926702, 32447.74684216, 32447.74684216])