In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from  tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



In [2]:
df = pd.read_csv('../data/train.csv', index_col=0, na_values='?')
test = pd.read_csv('../data/test.csv', index_col=0, na_values='?')

mean = np.mean(df['horsepower'])

df['horsepower'].fillna(mean, inplace=True)


origin = df.pop('origin')
df['USA'] = (origin==1) * 1.0
df['Europe']= (origin==2) * 1.0
df['Japan']= (origin==3) * 1.0

train = df[['cylinders',
             'displacement',
             'horsepower',
             'weight',
             'acceleration',
             'model year',
             'USA',
             'Europe',
             'Japan']]

Y = df[['mpg']]

trainX, testX, trainY, testY = train_test_split(train, Y)


In [7]:
reg = RandomForestRegressor()

est_range_exp = [1e-1, 1, 1e2, 1e5]
RFR_grid = {"n_estimators": est_range_exp,
                 "max_depth":[i for i in range(1,15)],
                 "random_state": [i for i in range(0,10)]}



In [8]:
gs = GridSearchCV(reg, RFR_grid, n_jobs=-1, verbose=2, return_train_score=True)

gs.fit(trainX, trainY)


Fitting 5 folds for each of 560 candidates, totalling 2800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2800 out of 2800 | elapsed:    5.6s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14],
                         'n_estimators': [0.1, 1, 100.0, 100000.0],
                         'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]},
             return_train_score=True, verbose=2)

In [9]:
gs.cv_results_

{'mean_fit_time': array([0.00693545, 0.00669689, 0.00686364, 0.0066926 , 0.00825582,
        0.00862985, 0.00588675, 0.00735373, 0.01051087, 0.00845838,
        0.01349769, 0.01431918, 0.01093473, 0.01217794, 0.01171122,
        0.01372418, 0.01223583, 0.01631656, 0.01521339, 0.01217999,
        0.00863309, 0.01318927, 0.00811934, 0.00748911, 0.01043787,
        0.0082293 , 0.00776849, 0.01580482, 0.00878301, 0.00917373,
        0.01051445, 0.01076813, 0.00782933, 0.00767813, 0.00747147,
        0.0086071 , 0.01079087, 0.00944428, 0.00986094, 0.0107049 ,
        0.0122344 , 0.00700407, 0.00844922, 0.00690413, 0.01089158,
        0.00832353, 0.00811286, 0.00726361, 0.00922389, 0.01043539,
        0.01161523, 0.01186075, 0.01195683, 0.01358337, 0.01203804,
        0.01134329, 0.01088285, 0.01133409, 0.01200171, 0.01306019,
        0.00980434, 0.00807548, 0.00904365, 0.0091393 , 0.0142189 ,
        0.00830836, 0.00809817, 0.01174483, 0.00892959, 0.00863299,
        0.00805984, 0.00836043,

In [10]:
gs.best_estimator_

RandomForestRegressor(max_depth=7, n_estimators=1, random_state=6)

In [11]:
reg.fit(trainX, trainY)

  reg.fit(trainX, trainY)


RandomForestRegressor()

In [12]:
reg.score(testX, testY)

0.8693495473656578

In [15]:
reg = RandomForestRegressor(max_depth=7, n_estimators=100, random_state=6)


In [16]:
reg.fit(trainX, trainY)
reg.score(testX, testY)


  reg.fit(trainX, trainY)


0.8686755412501279

In [17]:
testX = test[['cylinders', 
            'displacement', 
            'horsepower', 
            'weight',
            'acceleration','model year', 'origin']]



In [18]:
origin = testX.pop('origin')
testX['USA'] = (origin==1) * 1.0
testX['Europe']= (origin==2) * 1.0
testX['Japan']= (origin==3) * 1.0

testX['horsepower'].fillna(mean, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testX['USA'] = (origin==1) * 1.0


In [20]:
pred = reg.predict(testX)

sample = pd.read_csv("../data/sample_submit.csv", header=None)

sample[1] = pred

sample.to_csv('submit.csv', index=None, header=None)
