In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# Read the data
X = pd.read_csv('D:/code/Data/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
X_test_full = pd.read_csv('D:/code/Data/house-prices-advanced-regression-techniques/test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 100 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encode the data
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)



#iterating through the hyperparameters
'''paramaters to iterate:
    n_estimators
    learning_rate
    early_stopping_rounds
    tree_depth
    
    '''



data=[]

for est in range(10000,10001,5000):
    for learn in np.arange(.004,.005,.001):
        for early in range(100,1001,100):
          
            my_model = XGBRegressor(objective ='reg:squarederror',n_estimators=est, learning_rate=learn, n_jobs=8)
            my_model.fit(X_train, y_train, early_stopping_rounds=early, eval_set=[(X_valid, y_valid)], verbose=False)
            predictions = my_model.predict(X_valid)
            mae=mean_absolute_error(predictions, y_valid)
            
            data.append({'estimators':est,'learning_rate':learn,'early_stopping_rounds':early,'mae':mae})
            print(f'{est} {learn} {early} {mae}')
            
hyper=pd.DataFrame(data)
print(hyper)
hyper_min=hyper[hyper.mae == hyper.mae.min()]
print(hyper_min)


hyper.to_csv(r'D:/code/Data/hyper1.csv',index=False, header =1)
hyper_min.to_csv(r'D:/code/Data/hyper_min1.csv',index=False, header =1)

#print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))


#preds_test = my_model.predict(X_test)

#output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})

#output.to_csv(r'C:/Users/evant/OneDrive/Desktop/C/Data/home-data-for-ml-course/kaggle_housing_attempt_1.csv',index=False, header =1)




10000 0.004 100 15553.786908711472
10000 0.004 200 15566.45756635274
10000 0.004 300 15566.45756635274
10000 0.004 400 15566.45756635274


KeyboardInterrupt: 