In [180]:
# Getting files directions
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [181]:
# Data analysis and wrangling
import random as rnd
import numpy as np
import pandas as pd

#Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Missing values imputer
from sklearn.impute import SimpleImputer

# Machine learning tools



import warnings
warnings.filterwarnings("ignore")

In [182]:
# Loading the training data
train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train_df.head(10)

In [183]:
# Shape of the training dataset
train_df.shape

In [184]:
# Loading the test data
test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test_df.head(10)

In [185]:
# Shape of the testing dataset
test_df.shape

In [186]:
# Getting information about datasets
train_df.info()
print('\n'+ '='*50 + '\n')
test_df.info()

In [187]:
# Counting missing values in columns
print('Traing data:')
miss_val_td = train_df.isnull().sum()
print(miss_val_td[miss_val_td > 0])

print('='*20)

print('Testing data:')
miss_val_tsd = test_df.isnull().sum()
print(miss_val_tsd[miss_val_tsd > 0])

In [188]:
# Describing training dataset with descriptive statistics
train_df.describe()

In [189]:
# Describing categorical features of training data
train_df.describe(include=['O'])

In [190]:
numeric = [i for i in train_df.columns if train_df[i].dtype in ['int64','float64']]
numeric_data = train_df[numeric]
numeric_data.shape

In [191]:
fig, axes = plt.subplots(nrows=9, ncols=4, figsize=(20, 35))
for idx, feature in enumerate(numeric_data.columns[1:-1]):
    numeric_data.plot(feature, "SalePrice", subplots=True, kind="scatter", ax=axes[idx//4, idx%4])

In [192]:
corr = numeric_data[numeric_data.columns].corr()['SalePrice'][:-1].sort_values(ascending=True).to_frame()

# Visualization
fig, ax = plt.subplots(figsize =(9, 9))
ax.barh(corr.index, corr.SalePrice, align='center', color = np.where(corr['SalePrice'] < 0, 'crimson', '#89CFF0'))
ax.grid(color='grey', linewidth=1)

# Remove axes spines
for i in ['top', 'bottom', 'left', 'right']:
    ax.spines[i].set_visible(False)

In [193]:
sns.scatterplot(data=train_df, x='GrLivArea', y='SalePrice')
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')

In [194]:
sns.scatterplot(data=train_df, x='OverallQual', y='SalePrice')
plt.axhline(y=200000, color='r')

In [195]:
sns.scatterplot(data=train_df, x='GarageCars', y='SalePrice')
plt.axhline(y=700000, color='r')

In [196]:
drop_index_one = train_df[(train_df['SalePrice'] > 700000)].index
drop_index_two = train_df[(train_df['OverallQual'] > 8) & (train_df['SalePrice'] < 200000)].index
drop_index_one, drop_index_two

In [197]:
to_drop = [drop_index_one[0], drop_index_one[1], drop_index_two[0], drop_index_two[1]]
to_drop

In [198]:
train_df = train_df.drop(to_drop, axis=0)

In [199]:
# Do we have duplicates?
print('Number of Duplicates in tain data:', len(train_df[train_df.duplicated()]))
print('Number of Duplicates in test data:', len(test_df[test_df.duplicated()]))

# <center>Handling Missing Data<center>

In [200]:
print('Traing data:')
miss_val_td = train_df.isnull().sum().sort_values(ascending=False)
print(miss_val_td[miss_val_td > 0])

print('='*20)

print('Testing data:')
miss_val_tsd = test_df.isnull().sum().sort_values(ascending=False)
print(miss_val_tsd[miss_val_tsd > 0])

print('='*20)
print(len(miss_val_td[miss_val_td > 0].to_frame().T.columns) == len(miss_val_tsd[miss_val_tsd > 0].to_frame().T.columns))

In [201]:
none_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageQual', 'GarageFinish',
               'GarageType','BsmtCond', 'BsmtExposure', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']

In [202]:
for column in none_columns:
    train_df.loc[train_df[column].isnull(), column] = 'None'
    test_df.loc[test_df[column].isnull(), column] = 'None'

In [203]:
print('Traing data:')
miss_val_td = train_df.isnull().sum().sort_values(ascending=False)
print(miss_val_td[miss_val_td > 0])

print('='*20)

print('Testing data:')
miss_val_tsd = test_df.isnull().sum().sort_values(ascending=False)
print(miss_val_tsd[miss_val_tsd > 0])

In [204]:
mode_columns = ['Electrical', 'LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'MSZoning', 'BsmtFullBath', 'BsmtHalfBath',
               'Utilities', 'BsmtFinSF1', 'Exterior1st', 'KitchenQual', 'GarageArea', 'GarageCars', 'Exterior2nd',
               'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'SaleType', 'Functional']

In [205]:
for column in mode_columns:
    train_df.loc[train_df[column].isnull(), column] = train_df[column].mode()[0]
    test_df.loc[test_df[column].isnull(), column] = test_df[column].mode()[0]

In [206]:
print('Traing data:')
miss_val_td = train_df.isnull().sum().sort_values(ascending=False)
print(miss_val_td[miss_val_td > 0])

print('='*20)

print('Testing data:')
miss_val_tsd = test_df.isnull().sum().sort_values(ascending=False)
print(miss_val_tsd[miss_val_tsd > 0])

# <center>Feature engineering<center>

In [207]:
categorical = [i for i in train_df.columns if train_df[i].dtype not in ['int64','float64']]
print(categorical)

In [208]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

train_encoded = train_df[categorical].apply(encoder.fit_transform)
test_encoded = test_df[categorical].apply(encoder.fit_transform)

In [209]:
train_encoded.head()

In [210]:
train_data = train_df[numeric].join(train_encoded)
test_data = test_df[numeric[:-1]].join(test_encoded)

train_data.head()

In [211]:
test_data.head()

In [212]:
cat_features = [
    'MSZoning', 'Utilities', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'ExterQual', 'Heating', 
    'HeatingQC', 'CentralAir', 'Functional', 'GarageType', 'GarageQual', 'PavedDrive', 'SaleType', 'SaleCondition'
]

In [213]:
import itertools
interactions = pd.DataFrame(index=train_df.index)

for col1, col2 in itertools.combinations(cat_features, 2):
    new_col_name = '_'.join([col1, col2])
    new_values = train_df[col1].map(str) + '_' + train_df[col2].map(str)
    encoder = LabelEncoder()
    interactions[new_col_name] = encoder.fit_transform(new_values)

In [214]:
interactions.head()

In [215]:
train_data = train_data.join(interactions)
test_data = test_data.join(interactions)

train_data.head()

In [216]:
from sklearn.utils import shuffle

X_full = train_data.copy()
X_full = X_full.drop(['Id'], axis = 1)
X_full = shuffle(X_full, random_state=123)
y_full = X_full['SalePrice']
X_full = X_full.drop(['SalePrice'], axis = 1)

In [217]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_full, y_full, train_size=0.7, test_size=0.3)

In [240]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

logistic = LogisticRegression(C=0.06, penalty='l1', solver='liblinear', random_state=7).fit(X_train, y_train)
model = SelectFromModel(logistic, prefit=True)
X_new = model.transform(X_train)

selected_features = pd.DataFrame(model.inverse_transform(X_new), index=X_train.index, columns=X_train.columns)
selected_columns = selected_features.columns[selected_features.var() != 0]
cols_to_drop = selected_features.columns[selected_features.var() == 0]

In [241]:
selected_columns

In [242]:
print(len(selected_columns))

In [245]:
# from sklearn.feature_selection import SelectKBest, f_classif

# selector = SelectKBest(f_classif, k=20)
# new = selector.fit_transform(X_train, y_train)

# selected_feat = pd.DataFrame(selector.inverse_transform(new), index=X_train.index, columns=X_train.columns)
# selected_cols = selected_feat.columns[selected_feat.var() != 0]
# cols_to_drop = selected_features.columns[selected_features.var() == 0]

# print(selected_cols)

In [246]:
X_train = X_train.drop(cols_to_drop, axis=1)
X_valid = X_valid.drop(cols_to_drop, axis=1)
test_data = test_data.drop(cols_to_drop, axis=1)

In [248]:
import xgboost as xg
from sklearn.metrics import mean_squared_error, mean_absolute_error

xg_reg = xg.XGBRegressor(n_estimators = 1000, learning_rate=0.05)
# xg_reg = xg.XGBRegressor(objective ='reg:linear', n_estimators = 500, seed = 123)

xg_reg.fit(X_train, y_train)
pred = xg_reg.predict(X_valid)
mean = mean_absolute_error(pred, y_valid)
mean

In [225]:
ids = test_data['Id']
test_data = test_data.drop(['Id'], axis=1)
result = xg_reg.predict(test_data)
submission = pd.DataFrame({
        "Id": ids,
        "SalePrice": result
    })
submission.to_csv('submission.csv', index=False)

# LATEST SCORE
# 0.19593 