# Table of Contents (Continued from previous notebook..)

[10. Load Data, Variables, Functions, Libraries from previous notebook](#section10)<br>
[11. Load Competition Test and Competition Train Dataset and Preprocess for Modelling](#section11)<br>
[12. Train Model on Entire Competition Train Dataset](#section12)<br>
[13. Predict on Competition Test and Output the Submission File](#section13)<br>

# <a id = 'section10'>10. Load Data,Variables,Functions,Libraries from previous notebook</a>

In [1]:
#general purpose libraries...
import os
#print(os.getcwd())
import warnings
warnings.filterwarnings('ignore')
import pickle
from prettytable import PrettyTable
import math
import re
from tqdm import tqdm,tqdm_notebook
import itertools


#data analysis libraries...
import pandas as pd
import numpy as np 
import scipy
from scipy import sparse
from scipy.sparse import hstack



#visualization libraries...
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


#Modelling Libraries....
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer,mean_squared_error,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier,LinearRegression,SGDRegressor
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from xgboost import XGBClassifier,XGBRegressor
from sklearn.feature_selection import SelectKBest,f_regression,RFECV,RFE,SelectFromModel
from sklearn.decomposition import TruncatedSVD


#nlp libraries...
from collections import Counter 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


#Deep Learning Libraries..
#from keras.preprocessing.text import Tokenizer                    
#from keras.preprocessing.sequence import pad_sequences
#from keras.models import Sequential
#from keras import layers
#from keras.layers import Dense
#from keras.layers import LSTM
#from keras.layers.embeddings import Embedding
#from keras.preprocessing import sequence


#Multicollinearity 
from scipy import stats
import scipy.stats as ss
import dython
from dython.nominal import associations

#### Defining RMSE 

In [2]:
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error((y_true), (y_pred)))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

#### Defining RMSLE 

In [3]:
def rmsle_score(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

rmsle_scorer = make_scorer(rmsle_score, greater_is_better=False)

# <a id = 'section10'>11. Load Competition Test and Competition Train Dataset and Preprocess for Modelling</a>

In [5]:
competition_test = pd.read_csv('Data_Test_csv.csv',encoding = 'cp1252')
print(competition_test.shape)
competition_test.head(2)

(1234, 12)


Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,


In [9]:
#Check Missing values...

print("Name column missing ",competition_test['Name'].isnull().sum())
print("Location column missing ",competition_test['Location'].isnull().sum())
print("Year column missing ",competition_test['Year'].isnull().sum())
print("Kms column missing ",competition_test['Kilometers_Driven'].isnull().sum())
print("Fuel column missing ",competition_test['Fuel_Type'].isnull().sum())
print("Transmission column missing ",competition_test['Transmission'].isnull().sum())
print("Owner column missing ",competition_test['Owner_Type'].isnull().sum())
print("Mileage column missing ",competition_test['Mileage'].isnull().sum())
print('\n\n')
print("Engine column missing ",competition_test['Engine'].isnull().sum())
print("Power column missing ",competition_test['Power'].isnull().sum())
print("Seats column missing ",competition_test['Seats'].isnull().sum())

Name column missing  0
Location column missing  0
Year column missing  0
Kms column missing  0
Fuel column missing  0
Transmission column missing  0
Owner column missing  0
Mileage column missing  0



Engine column missing  10
Power column missing  10
Seats column missing  11


In [10]:
#with open('df_enhanced.pickle', 'wb') as f:
#    pickle.dump(df, f)


#infile = open('df_enhanced.pickle','rb')
#competition_train = pickle.load(infile)
#infile.close()

## 11.1  Preprocessing of features on 'Competition Test' Dataset

In [12]:
competition_test.head(2)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,


#### Feature 'Age'

In [15]:
competition_test['age'] = 2019 - competition_test['Year']
competition_test.age.describe()

count    1234.000000
mean        5.599676
std         3.179700
min         0.000000
25%         3.000000
50%         5.000000
75%         8.000000
max        23.000000
Name: age, dtype: float64

#### Feature 'kms_driven'

In [17]:
competition_test.Kilometers_Driven.describe()

count      1234.000000
mean      58507.288493
std       35598.702098
min        1000.000000
25%       34000.000000
50%       54572.500000
75%       75000.000000
max      350000.000000
Name: Kilometers_Driven, dtype: float64

In [21]:
kms_list = list(competition_test.Kilometers_Driven.values)
kms_list.sort(reverse = True)
print('Highest 10 values in kms_driven are ',kms_list[:10])

Highest 10 values in kms_driven are  [350000, 290000, 205000, 200000, 200000, 200000, 196000, 194500, 194000, 190000]


#### Feature 'Mileage'

In [22]:
print("Check Missing Values in Mileage Column ",competition_test['Mileage'].isnull().sum())
competition_test['Mileage'] = competition_test['Mileage'].fillna('0.0 kmpl')


new = competition_test["Mileage"].str.split(" ", n = 1, expand = True)
competition_test["mileage_amount"]= new[0] 
# making separate last name column from new data frame 
competition_test["mileage_unit"]= new[1] 
competition_test['mileage_amount'] = pd.to_numeric(competition_test['mileage_amount']) 

competition_test = competition_test.drop(['mileage_unit'],axis = 1)

Check Missing Values in Mileage Column  0


#### Feature 'Engine'

In [27]:
print("Check Missing Values in Engine Column ",competition_test['Engine'].isnull().sum())
competition_test['Engine'] = competition_test['Engine'].fillna('0 CC')


new = competition_test["Engine"].str.split(" ", n = 1, expand = True)
competition_test["engine_amount"]= new[0] 
# making separate last name column from new data frame 
competition_test["engine_unit"]= new[1] 
competition_test['engine_amount'] = pd.to_numeric(competition_test['engine_amount']) 

competition_test = competition_test.drop(['engine_unit'],axis = 1)

#plt.figure(figsize = (15,7))
#sns.scatterplot(x = 'engine_amount', y = 'log_price', data = df)
print("Check Missing Values in Engine Column ",competition_test['Engine'].isnull().sum())

Check Missing Values in Engine Column  10
Check Missing Values in Engine Column  0


#### Feature 'Power'

In [28]:
print("Check Missing Values in Power Column ",competition_test['Power'].isnull().sum())
competition_test['Power'] = competition_test['Power'].fillna('0 bhp')


new = competition_test["Power"].str.split(" ", n = 1, expand = True)
competition_test["power_amount"]= new[0] 
competition_test["power_amount"] = competition_test["power_amount"].replace(['null'],0)
# making separate last name column from new data frame 
competition_test["power_unit"]= new[1] 
competition_test['power_amount'] = pd.to_numeric(competition_test['power_amount']) 

competition_test = competition_test.drop(['power_unit'],axis = 1)

#plt.figure(figsize = (15,7))
#sns.scatterplot(x = 'power_amount', y = 'log_price', data = df)
print("Check Missing Values in Power Column ",competition_test['Power'].isnull().sum())

Check Missing Values in Power Column  10
Check Missing Values in Power Column  0


#### Feature 'Seats'

In [37]:
competition_test['Seats'] = competition_test['Seats'].fillna(0)
competition_test['Seats_char'] = competition_test.Seats.astype(int).astype(str)

In [43]:
stats.mode(competition_test['Seats_char'][competition_test['Seats_char'] != '0'])[0][0]

'5'

#### Missing Values 'Imputation'

In [50]:
competition_test['mileage_amount'] = competition_test['mileage_amount'].replace(0,competition_test.mileage_amount.mean()) 
competition_test['engine_amount'] = competition_test['engine_amount'].replace(0,competition_test.engine_amount.mean()) 
competition_test['power_amount'] = competition_test['power_amount'].replace(0,competition_test.power_amount.mean()) 
competition_test['Seats_char'][competition_test['Seats_char'] == '0'] = '5' 

In [52]:
df_missing = competition_test[(competition_test.mileage_amount == 0) | 
                              (competition_test.engine_amount == 0) | 
                              (competition_test.power_amount == 0) | 
                              (competition_test.Seats == '0')]
print('rows haveing missing value of atleast 1 column',df_missing.shape)

rows haveing missing value of atleast 1 column (0, 17)


#### Feature 'preprocessed_names'

In [53]:
competition_test.Name = competition_test.Name.str.lower()
#reference - www.appliedaicourse.com
# https://stackoverflow.com/a/47091490/4084039
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords_eng = set(stopwords.words('english'))

preprocessed_names = []
# tqdm is for printing the status bar
for sentance in tqdm(competition_test['Name'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e not in stopwords_eng)
    preprocessed_names.append(sent.lower().strip())
    
competition_test['preprocessed_names'] = preprocessed_names

100%|██████████| 1234/1234 [00:00<00:00, 64821.92it/s]


## 11.2  Creating Feature Set for 'Train' and 'Test' (Competition Dataset)

In [58]:
competition_train = competition_train[['preprocessed_names',
    'Location','Fuel_Type','Transmission','Owner_Type','Seats_imputation',
             'Kilometers_Driven','engine_imputation','power_imputation','mileage_imputation','age',
             'Price']]

competition_test = competition_test[['preprocessed_names',
    'Location','Fuel_Type','Transmission','Owner_Type','Seats_char',
             'Kilometers_Driven','engine_amount','power_amount','mileage_amount','age']]

##### Featurizing 'Names' - BOW

In [61]:
vectorizer = CountVectorizer(ngram_range = (1,4))

train_title_bow = vectorizer.fit_transform(competition_train.preprocessed_names.values)
test_title_bow = vectorizer.transform(competition_test.preprocessed_names.values)

print("Shape of Train matrix after one hot encodig ",train_title_bow.shape)
print("Shape of Test matrix after one hot encodig ",test_title_bow.shape)

Shape of Train matrix after one hot encodig  (6011, 7338)
Shape of Test matrix after one hot encodig  (1234, 7338)


##### Featurizing 'Location' - OHE

In [62]:
#TRAIN DATA....
vectorizer = CountVectorizer(vocabulary= list(competition_train['Location'].unique()), lowercase=False, binary=True)
vectorizer.fit(competition_train['Location'].values)
location_one_hot_features = vectorizer.get_feature_names()
train_loc_one_hot = vectorizer.transform(competition_train['Location'].values)
print("Shape of Train matrix after one hot encodig ",train_loc_one_hot.shape)


#TEST DATA.....
test_loc_one_hot = vectorizer.transform(competition_test['Location'].values)
print("Shape of Test matrix after one hot encodig ",test_loc_one_hot.shape)

Shape of Train matrix after one hot encodig  (6011, 11)
Shape of Test matrix after one hot encodig  (1234, 11)


##### Featurizing ''Fuel Type" - OHE

In [63]:
#TRAIN DATA....
vectorizer = CountVectorizer(vocabulary= list(competition_train['Fuel_Type'].unique()), lowercase=False, binary=True)
vectorizer.fit(competition_train['Fuel_Type'].values)
fuel_one_hot_features = vectorizer.get_feature_names()
train_fuel_one_hot = vectorizer.transform(competition_train['Fuel_Type'].values)
print("Shape of Train matrix after one hot encodig ",train_fuel_one_hot.shape)


#TEST DATA.....
test_fuel_one_hot = vectorizer.transform(competition_test['Fuel_Type'].values)
print("Shape of Test matrix after one hot encodig ",test_fuel_one_hot.shape)

Shape of Train matrix after one hot encodig  (6011, 5)
Shape of Test matrix after one hot encodig  (1234, 5)


##### Featurizing ''Transmission"

In [64]:
#TRAIN DATA....
vectorizer = CountVectorizer(vocabulary= list(competition_train['Transmission'].unique()), lowercase=False, binary=True)
vectorizer.fit(competition_train['Transmission'].values)
transmission_one_hot_features = vectorizer.get_feature_names()
train_transmission_one_hot = vectorizer.transform(competition_train['Transmission'].values)
print("Shape of Train matrix after one hot encodig ",train_transmission_one_hot.shape)


#TEST DATA.....
test_transmission_one_hot = vectorizer.transform(competition_test['Transmission'].values)
print("Shape of Test matrix after one hot encodig ",test_transmission_one_hot.shape)

Shape of Train matrix after one hot encodig  (6011, 2)
Shape of Test matrix after one hot encodig  (1234, 2)


##### Featurizing ''Owner Type"

In [65]:
#TRAIN DATA....
vectorizer = CountVectorizer(vocabulary= list(competition_train['Owner_Type'].unique()), lowercase=False, binary=True)
vectorizer.fit(competition_train['Owner_Type'].values)
owner_one_hot_features = vectorizer.get_feature_names()
train_owner_one_hot = vectorizer.transform(competition_train['Owner_Type'].values)
print("Shape of Train matrix after one hot encodig ",train_owner_one_hot.shape)


#TEST DATA.....
test_owner_one_hot = vectorizer.transform(competition_test['Owner_Type'].values)
print("Shape of Test matrix after one hot encodig ",test_owner_one_hot.shape)

Shape of Train matrix after one hot encodig  (6011, 4)
Shape of Test matrix after one hot encodig  (1234, 4)


##### Featurizing ''Seats"

In [67]:
#TRAIN DATA....
vectorizer = CountVectorizer(vocabulary= list(competition_train['Seats_imputation'].unique()), lowercase=False, binary=True)
vectorizer.fit(competition_train['Seats_imputation'].values)
seats_one_hot_features = vectorizer.get_feature_names()
train_seats_one_hot = vectorizer.transform(competition_train['Seats_imputation'].values)
print("Shape of Train matrix after one hot encodig ",train_seats_one_hot.shape)


#TEST DATA.....
test_seats_one_hot = vectorizer.transform(competition_test['Seats_char'].values)
print("Shape of Test matrix after one hot encodig ",test_seats_one_hot.shape)

Shape of Train matrix after one hot encodig  (6011, 8)
Shape of Test matrix after one hot encodig  (1234, 8)


##### Featurizing ''kms_driven"

In [68]:
#TRAIN DATA....

train_kms_scalar = StandardScaler()
train_kms_scalar.fit(competition_train['Kilometers_Driven'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {train_kms_scalar.mean_[0]}, Standard deviation : {np.sqrt(train_kms_scalar.var_[0])}")
train_kms_standardized = train_kms_scalar.transform(competition_train['Kilometers_Driven'].values.reshape(-1, 1))


#TEST DATA....
test_kms_scalar = StandardScaler()
test_kms_scalar.fit(competition_test['Kilometers_Driven'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {test_kms_scalar.mean_[0]}, Standard deviation : {np.sqrt(test_kms_scalar.var_[0])}")
test_kms_standardized = test_kms_scalar.transform(competition_test['Kilometers_Driven'].values.reshape(-1, 1))

Mean : 57099.70237897189, Standard deviation : 33625.09186770184
Mean : 58507.28849270665, Standard deviation : 35584.27506487185


##### Featurizing ''engine_imputation"

In [70]:
#TRAIN DATA....

train_engine_scalar = StandardScaler()
train_engine_scalar.fit(competition_train['engine_imputation'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {train_engine_scalar.mean_[0]}, Standard deviation : {np.sqrt(train_engine_scalar.var_[0])}")
train_engine_standardized = train_engine_scalar.transform(competition_train['engine_imputation'].values.reshape(-1, 1))


#TEST DATA....
test_engine_scalar = StandardScaler()
test_engine_scalar.fit(competition_test['engine_amount'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {test_engine_scalar.mean_[0]}, Standard deviation : {np.sqrt(test_engine_scalar.var_[0])}")
test_engine_standardized = test_engine_scalar.transform(competition_test['engine_amount'].values.reshape(-1, 1))

Mean : 1621.1337238493727, Standard deviation : 599.5498816457016
Mean : 1593.4803159534422, Standard deviation : 561.8279593265466


##### Featurizing ''power_imputation"

In [71]:
#TRAIN DATA....

train_power_scalar = StandardScaler()
train_power_scalar.fit(competition_train['power_imputation'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {train_power_scalar.mean_[0]}, Standard deviation : {np.sqrt(train_power_scalar.var_[0])}")
train_power_standardized = train_power_scalar.transform(competition_train['power_imputation'].values.reshape(-1, 1))


#TEST DATA....
test_power_scalar = StandardScaler()
test_power_scalar.fit(competition_test['power_amount'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {test_power_scalar.mean_[0]}, Standard deviation : {np.sqrt(test_power_scalar.var_[0])}")
test_power_standardized = test_power_scalar.transform(competition_test['power_amount'].values.reshape(-1, 1))

Mean : 113.20596180864602, Standard deviation : 53.81244582518368
Mean : 110.30619317211686, Standard deviation : 50.85198226954202


##### Featurizing ''mileage_imputation"

In [72]:
#TRAIN DATA....

train_mileage_scalar = StandardScaler()
train_mileage_scalar.fit(competition_train['mileage_imputation'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {train_mileage_scalar.mean_[0]}, Standard deviation : {np.sqrt(train_mileage_scalar.var_[0])}")
train_mileage_standardized = train_mileage_scalar.transform(competition_train['mileage_imputation'].values.reshape(-1, 1))


#TEST DATA....
test_mileage_scalar = StandardScaler()
test_mileage_scalar.fit(competition_test['mileage_amount'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {test_mileage_scalar.mean_[0]}, Standard deviation : {np.sqrt(test_mileage_scalar.var_[0])}")
test_mileage_standardized = test_mileage_scalar.transform(competition_test['mileage_amount'].values.reshape(-1, 1))

Mean : 18.326911587073138, Standard deviation : 4.161043556846434
Mean : 18.3653161635876, Standard deviation : 4.049744650812627


##### Featurizing ''Age"

In [73]:
#TRAIN DATA....

train_age_scalar = StandardScaler()
train_age_scalar.fit(competition_train['age'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {train_age_scalar.mean_[0]}, Standard deviation : {np.sqrt(train_age_scalar.var_[0])}")
train_age_standardized = train_age_scalar.transform(competition_train['age'].values.reshape(-1, 1))


#TEST DATA....
test_age_scalar = StandardScaler()
test_age_scalar.fit(competition_test['age'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {test_age_scalar.mean_[0]}, Standard deviation : {np.sqrt(test_age_scalar.var_[0])}")
test_age_standardized = test_age_scalar.transform(competition_test['age'].values.reshape(-1, 1))

Mean : 5.641157877225087, Standard deviation : 3.2699479960774847
Mean : 5.59967585089141, Standard deviation : 3.1784117658979216


##### Creating the concatenated 'Feature_stack'

In [76]:
X_train = sparse.hstack((
                         train_title_bow, #0.97
                         train_loc_one_hot, #0.25
                         train_fuel_one_hot, #0.32
                         train_transmission_one_hot, #0.59
                         train_owner_one_hot, #0.1
                         train_seats_one_hot, #0.31
    
    train_age_standardized, #0.3
    train_kms_standardized, #0.18
    train_mileage_standardized, #0.33
    train_engine_standardized, #0.66
    train_power_standardized #0.77
                        ))
#X_train = sparse.hstack([X_train,train[['age','mileage_amount','engine_amount','power_amount','log_kms','Seats']]])
#X_train = X_train.toarray()
print('X train shape',X_train.shape)

y_train = np.array(competition_train['Price'])
print('y train shape',y_train.shape)

#######################################################################################

X_test = sparse.hstack((
                         test_title_bow,
                         test_loc_one_hot,
                         test_fuel_one_hot,
                         test_transmission_one_hot,
                         test_owner_one_hot,
                         test_seats_one_hot,
    
                         test_age_standardized,
                         test_kms_standardized,
                         test_mileage_standardized,
                         test_engine_standardized,
                         test_power_standardized
                        ))
#X_train = sparse.hstack([X_train,train[['age','mileage_amount','engine_amount','power_amount','log_kms','Seats']]])
#X_train = X_train.toarray()
print('X test shape',X_test.shape)

#y_test = np.array(df_test['Price'])
#print('y test shape',y_test.shape)

X train shape (6011, 7373)
y train shape (6011,)
X test shape (1234, 7373)


In [77]:
'''
with open('competition_train_enhanced.pickle', 'wb') as f:
    pickle.dump(X_train, f)

with open('competition_test_enhanced.pickle', 'wb') as f:
    pickle.dump(X_test, f)

with open('competition_y_train_enhanced.pickle', 'wb') as f:
    pickle.dump(y_train, f)
'''


#infile = open('df_enhanced.pickle','rb')
#df = pickle.load(infile)
#infile.close()

# <a id = 'section12'>12. Train Model on Entire Competition Train Dataset</a>

In [None]:
reg_model = XGBRegressor()

parameters = {'max_depth' : [2,3,5,10],
              'subsample' : [0.5,0.75,0.9,1],
              'colsample_bytree' : [0.5,0.75,0.9,1],
              'lambda' : [0.01,0.1,1,10,100],
             'n_estimators' : [200]}

gs_reg = GridSearchCV(reg_model, 
                      parameters,
                      cv=2,
                      verbose = 100, 
                      scoring  = rmse_scorer,
                      n_jobs = -2)


######################################################################
gs_reg.fit(X_train, y_train)
print("Best score ",-gs_reg.best_score_)
print("Best params  ",gs_reg.best_params_)


with open('1_enhanced_XGBreg.pickle', 'wb') as f:
    pickle.dump(gs_reg, f)

# <a id = 'section13'>13. Predict On Competition Test and Output the Submission File</a>

In [82]:
my_model = gs_reg.best_estimator_
y_competition_predicted = my_model.predict(X_test)
y_competition_predicted = np.around(y_competition_predicted,decimals = 2)
#print("Test RMSE is ",root_mean_squared_error(y_test, y_competition_predicted))

In [83]:
submissions = pd.DataFrame({'Price' : y_competition_predicted})
submissions.to_csv('competition_submission_enhanced_1.csv',index = False)