In [258]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [259]:
# Load the training and test tables, using the 'Id' column as the row index
X = pd.read_csv('train.csv', index_col='Id')
X_test = pd.read_csv('test.csv', index_col='Id')

print('The shape of X ', X.shape)

print('The shape of X_test ', X_test.shape)

# Discard rows that have no target value, then split the target off from the predictors
X = X[X['SalePrice'].notna()]
y = X['SalePrice']
X = X.drop(columns='SalePrice')

# Keep things simple: every column that has a gap in the training data
# is removed from both the training and the test table
cols_with_missing = X.columns[X.isna().any()].tolist()

X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)



The shape of X  (1460, 80)
The shape of X_test  (1459, 79)


In [260]:
# Columns whose set of observed values is exactly the same in train and test
good_label_cols = []
for column in X.columns:
    if set(X[column]) == set(X_test[column]):
        good_label_cols.append(column)

# Of those, keep only the columns with fewer than 10 distinct training values
low_cardinality_cols = [
    column for column in good_label_cols
    if X[column].nunique() < 10
]

# Restrict both tables to the selected columns (same columns, same order)
X = X.loc[:, low_cardinality_cols]
X_test = X_test.loc[:, low_cardinality_cols]

print('The shape of X ', X.shape)

print('The shape of X_test ', X_test.shape)


The shape of X  (1460, 18)
The shape of X_test  (1459, 18)


In [268]:
# Same column filter as the previous cell, repeated so this cell can be re-run
# on its own: keep columns whose value sets match between train and test and
# that have fewer than 10 distinct training values.
good_label_cols = [col for col in X if set(X[col]) == set(X_test[col])]
low_cardinality_cols = [col for col in good_label_cols if X[col].nunique() < 10]

X = X[low_cardinality_cols]
X_test = X_test[low_cardinality_cols]

print('The shape of X ', X.shape)
print('The shape of X_test ', X_test.shape)

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
# fall back to the old keyword on older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns a bare array, so re-attach the source index explicitly.
# Without it the frames get a fresh 0-based index, the concat below aligns them
# against the original row labels, and every row ends up padded with NaN.
OH_col_X = pd.DataFrame(
    OH_encoder.fit_transform(X[low_cardinality_cols]),
    index=X.index,
)
OH_col_X_test = pd.DataFrame(
    OH_encoder.transform(X_test[low_cardinality_cols]),
    index=X_test.index,
)

# Whatever was not one-hot encoded is carried over unchanged
num_X = X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

print('The shape of num_X ', num_X.shape)
print('The shape of num_X_test ', num_X_test.shape)

# Indices now match, so the column-wise concat lines up row for row
OH_X_train = pd.concat([OH_col_X, num_X], axis=1)
OH_X_valid = pd.concat([OH_col_X_test, num_X_test], axis=1)

# Safety net only: with aligned indices and NaN-free inputs this drops nothing
OH_X_train = OH_X_train.dropna(axis=0)
OH_X_valid = OH_X_valid.dropna(axis=0)

print('The shape of OH_X_train ', OH_X_train.shape)
print('The shape of OH_X_valid ', OH_X_valid.shape)

# Persist the encoded features; the row index is not written to the files
OH_X_train.to_csv("X_numerical.csv", header=True, index=False)
OH_X_valid.to_csv("X_numerical_test.csv", header=True, index=False)

The shape of X  (1460, 91)
The shape of X_test  (1459, 91)
The shape of num_X  (1460, 0)
The shape of num_X_test  (1459, 0)
The shape of OH_X_train  (1460, 196)
The shape of OH_X_valid  (1459, 196)
The shape of OH_X_train  (1460, 196)
The shape of OH_X_valid  (1459, 196)


In [269]:
### Read back the saved train and test feature files

In [270]:
# Training features: the '*_median' file first, then the one-hot encoded file
X1 = pd.read_csv('X_numerical_median.csv', index_col=None)
X2 = pd.read_csv('X_numerical.csv', index_col=None)

# Test features, loaded in the SAME order as the training files so that the
# concatenated columns line up position-for-position at fit and predict time
X_test1 = pd.read_csv('X_numerical_test_median.csv', index_col=None)
X_test2 = pd.read_csv('X_numerical_test.csv', index_col=None)

# Join the two feature blocks side by side and normalise to a 0-based row index
X = pd.concat([X1, X2], axis=1).reset_index(drop=True)
X_test = pd.concat([X_test1, X_test2], axis=1).reset_index(drop=True)

# Train and test must expose the same number of features to the model
assert X.shape[1] == X_test.shape[1], (
    f'feature count mismatch: train has {X.shape[1]}, test has {X_test.shape[1]}'
)

# Number of test columns that still contain missing values (expected: 0)
X_test.isna().any().sum()

0

In [271]:
# Hold out 20% of the training rows; the fixed seed makes the split repeatable
X_train, X_test2, y_train, y_test2 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [279]:

# SalePrice is a continuous target, so boost regression trees. A classifier
# would treat every distinct price as its own class.
base_tree = DecisionTreeRegressor(random_state=1)

# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2.
boost = AdaBoostRegressor(base_tree, n_estimators=100, learning_rate=0.1, random_state=1)

boost.fit(X_train, y_train)

# Score the model on the hold-out rows split off from the training data
print('Validation MAE:', MAE(y_test2, boost.predict(X_test2)))

# Predictions for the competition test set
preds_test = boost.predict(X_test)

In [280]:
# The submission Ids come from the sample file: X_test was rebuilt from CSVs
# read with index_col=None, so its index is a plain 0-based range, not the Ids.
samplesubmission = pd.read_csv('sample_submission.csv')

# Fail with a clear message if predictions and Ids cannot be paired one-to-one
if len(preds_test) != len(samplesubmission):
    raise ValueError(
        f'{len(preds_test)} predictions for {len(samplesubmission)} submission rows'
    )

# Save test predictions to file
output = pd.DataFrame({'Id': samplesubmission.Id, 'SalePrice': preds_test})

output.to_csv('finalPredicton.csv', index=False)