In [527]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
import seaborn as sb
import random
import statsmodels.api as sm
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_selector as selector

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 9)
plt.style.use('ggplot')

In [528]:
houses_df = pd.read_csv('train.csv')

houses_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [529]:
houses_df = houses_df[houses_df["SalePrice"] < 700000]
houses_df = houses_df[houses_df["LotArea"] < 100000]

In [530]:
houses_df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1454.0,1454.0,1198.0,1454.0,1454.0,1454.0,1454.0,1454.0,1446.0,1454.0,...,1454.0,1454.0,1454.0,1454.0,1454.0,1454.0,1454.0,1454.0,1454.0,1454.0
mean,731.11967,56.856946,69.8798,10084.839752,6.093535,5.574966,1971.253095,1984.852132,102.737206,440.001376,...,93.530949,46.664374,22.044704,3.423659,15.123109,2.388583,42.843191,6.324622,2007.817056,179822.555708
std,421.854142,42.220998,24.044457,5642.853408,1.377551,1.11405,30.249466,20.658752,178.815004,452.716016,...,124.163458,66.351284,61.228893,29.376979,55.864001,37.560994,496.697866,2.705145,1.329774,76460.500441
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,367.25,20.0,59.0,7536.25,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129900.0
50%,732.5,50.0,69.0,9458.5,6.0,5.0,1973.0,1994.0,0.0,380.5,...,0.0,24.5,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,162900.0
75%,1095.75,70.0,80.0,11546.25,7.0,6.0,2000.75,2004.0,164.75,706.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,213430.0
max,1460.0,190.0,313.0,70761.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,625000.0


In [531]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(houses_df)
numerical_columns.remove('Id')
print(numerical_columns)
categorical_columns = categorical_columns_selector(houses_df)
print(categorical_columns)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual

In [532]:
null_counts = houses_df.isnull().sum()
# print(null_counts)
null_counts = null_counts[null_counts > 0]
print(null_counts)

LotFrontage      256
Alley           1363
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1448
Fence           1174
MiscFeature     1402
dtype: int64


In [533]:
nulls = []
for i in null_counts.index:
    print(i)
    houses_df = houses_df.apply(pd.to_numeric, errors='coerce')
    houses_df = houses_df.fillna(0)

LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature


In [534]:
houses_df = pd.read_csv('train.csv', encoding='latin1')

for i in null_counts.index:
    houses_df = houses_df.apply(pd.to_numeric, errors='coerce')
    houses_df = houses_df.fillna(0)

# Define the number of categories
num_categories = 3

category_names = ['Economica', 'Intermedia', 'Cara']

houses_df['Precio_Categoria'] = pd.qcut(houses_df['SalePrice'], q=num_categories, labels=category_names)
dummies = pd.get_dummies(houses_df['Precio_Categoria'])

houses_df = pd.concat([houses_df, dummies], axis=1)

print(houses_df[['Economica', 'Intermedia', 'Cara']])
print("Economica: ", houses_df['Economica'].value_counts()[1])
print("Intermedia: ", houses_df['Intermedia'].value_counts()[1])
print("Cara: ", houses_df['Cara'].value_counts()[1])

      Economica  Intermedia  Cara
0             0           0     1
1             0           1     0
2             0           0     1
3             0           1     0
4             0           0     1
...         ...         ...   ...
1455          0           1     0
1456          0           0     1
1457          0           0     1
1458          0           1     0
1459          0           1     0

[1460 rows x 3 columns]
Economica:  487
Intermedia:  490
Cara:  483


In [535]:
houses_df.describe()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Economica,Intermedia,Cara
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,0.0,57.623288,10516.828082,0.0,0.0,0.0,0.0,0.0,...,0.0,43.489041,6.321918,2007.815753,0.0,0.0,180921.19589,0.333562,0.335616,0.330822
std,421.610009,42.300571,0.0,34.664304,9981.264932,0.0,0.0,0.0,0.0,0.0,...,0.0,496.123024,2.703626,1.328095,0.0,0.0,79442.502883,0.471647,0.472367,0.47067
min,1.0,20.0,0.0,0.0,1300.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2006.0,0.0,0.0,34900.0,0.0,0.0,0.0
25%,365.75,20.0,0.0,42.0,7553.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,2007.0,0.0,0.0,129975.0,0.0,0.0,0.0
50%,730.5,50.0,0.0,63.0,9478.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,2008.0,0.0,0.0,163000.0,0.0,0.0,0.0
75%,1095.25,70.0,0.0,79.0,11601.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,8.0,2009.0,0.0,0.0,214000.0,1.0,1.0,1.0
max,1460.0,190.0,0.0,313.0,215245.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15500.0,12.0,2010.0,0.0,0.0,755000.0,1.0,1.0,1.0


In [536]:
# lista = {}
# for column in houses_df.columns:
#     zero_count = (houses_df[column] == 0).sum()
#     if zero_count > 1450:
#         lista[column] = zero_count

# for lis in lista:
#     print(lis, lista[lis])

# for lis in lista:
#     houses_df = houses_df.drop(lis, axis=1)

In [537]:
y = houses_df.pop("Cara")
X = houses_df
X.drop(["Economica", "Intermedia", "Precio_Categoria"], axis=1, inplace=True)
X_train, X_test,y_train, y_test = train_test_split(X[numerical_columns], y,test_size=0.3,train_size=0.7)

In [538]:
X.describe()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,0.0,57.623288,10516.828082,0.0,0.0,0.0,0.0,0.0,...,2.758904,0.0,0.0,0.0,43.489041,6.321918,2007.815753,0.0,0.0,180921.19589
std,421.610009,42.300571,0.0,34.664304,9981.264932,0.0,0.0,0.0,0.0,0.0,...,40.177307,0.0,0.0,0.0,496.123024,2.703626,1.328095,0.0,0.0,79442.502883
min,1.0,20.0,0.0,0.0,1300.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,0.0,0.0,34900.0
25%,365.75,20.0,0.0,42.0,7553.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,0.0,0.0,129975.0
50%,730.5,50.0,0.0,63.0,9478.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,0.0,0.0,163000.0
75%,1095.25,70.0,0.0,79.0,11601.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,0.0,0.0,214000.0
max,1460.0,190.0,0.0,313.0,215245.0,0.0,0.0,0.0,0.0,0.0,...,738.0,0.0,0.0,0.0,15500.0,12.0,2010.0,0.0,0.0,755000.0


In [539]:
logReg = LogisticRegression(solver='liblinear')
result = logReg.fit(X_train,y_train)

In [540]:
y_pred = logReg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logReg.score(X_test[numerical_columns], y_test)))

Accuracy of logistic regression classifier on test set: 0.99
