In [2]:
import math              as mt
import numpy             as np
import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns
import plotly.graph_objs as go

from sklearn               import preprocessing
from sklearn.ensemble      import RandomForestClassifier
from sklearn.datasets      import make_classification
from sklearn.decomposition import PCA
from plotly.offline        import init_notebook_mode, iplot

In [7]:
init_notebook_mode(connected=True)

In [3]:
df_train = pd.read_csv('train.csv', index_col='Id')
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [4]:
df_test = pd.read_csv('test.csv', index_col='Id')
df_test.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [5]:
indexes_null = df_train[df_train['LotFrontage'].isnull()].index.tolist()
df_train.loc[indexes_null, 'LotFrontage'] = df_train.loc[indexes_null, 'LotArea'].apply(lambda item: np.sqrt(item))

In [8]:
gardict = {
    'CarPort': 1,
    'Detchd' : 2,
    'Basment': 3,
    '2Types' : 4,
    'Attchd' : 5,
    'BuiltIn': 6
}
df_train['GarageType'].fillna(0, inplace=True)
df_train['GarageType'].replace(gardict, inplace=True)

In [13]:
garfindict = {
    'Unf': 1,
    'RFn': 2,
    'Fin': 3
}
df_train['GarageFinish'].fillna(0, inplace=True)
df_train['GarageFinish'].replace(garfindict, inplace=True)

TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'

In [14]:
garqualdict = {np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
df_train['GarageQual'].fillna(0, inplace=True)
df_train['GarageQual'].replace(garqualdict, inplace=True)

In [15]:
garcondict = {np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
df_train['GarageCond'].fillna(0, inplace=True)
df_train['GarageCond'].replace(garcondict, inplace=True)

In [16]:
gdict = {np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
df_train['PoolQC'].fillna(0, inplace=True)
df_train['PoolQC'].replace(gdict, inplace=True)

In [17]:
gdict = {np.nan:5, 'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'Ex':5}
df_train['Fence'].replace(gdict, inplace=True)

In [19]:
gdict = {
    np.nan: 2,
    'Elev': 5,
    'Gar2': 4,
    'TenC': 6,
    'Shed': 3,
    'Othr': 1}
df_train['MiscFeature'].replace(gdict, inplace=True)

In [20]:
#df_train['LotFrontage'].fillna(0, inplace=True)
df_train['Alley'].fillna('', inplace=True)
df_train['MasVnrType'].fillna('', inplace=True)
df_train['MasVnrArea'].fillna(0, inplace=True)
df_train['BsmtQual'].fillna(0, inplace=True)
df_train['BsmtCond'].fillna(0, inplace=True)
df_train['BsmtExposure'].fillna(0, inplace=True)
df_train['BsmtFinType1'].fillna(0, inplace=True)
df_train['BsmtFinType2'].fillna(0, inplace=True)
df_train['Electrical'].fillna('SBrkr', inplace=True)
df_train['FireplaceQu'].fillna(0, inplace=True)
#df_train['GarageYrBlt'].fillna(df_train['YearBuilt'], inplace=True)
#df_train['GarageFinish'].fillna('', inplace=True)
#df_train['GarageQual'].fillna('', inplace=True)
#df_train['GarageCond'].fillna('', inplace=True)
#df_train['PoolQC'].fillna('', inplace=True)
#df_train['Fence'].fillna('', inplace=True)
#df_train['MiscFeature'].fillna('', inplace=True)

# df_test['MSZoning'].fillna('', inplace=True)
# df_test['LotFrontage'].fillna(0, inplace=True)
# df_test['Alley'].fillna('', inplace=True)
# df_test['Utilities'].fillna('', inplace=True)
# df_test['Exterior1st'].fillna('', inplace=True)
# df_test['Exterior2nd'].fillna('', inplace=True)
# df_test['MasVnrType'].fillna('', inplace=True)
# df_test['MasVnrArea'].fillna(0, inplace=True)
# df_test['BsmtQual'].fillna('', inplace=True)
# df_test['BsmtCond'].fillna('', inplace=True)
# df_test['BsmtExposure'].fillna('', inplace=True)
# df_test['BsmtFinType1'].fillna('', inplace=True)
# df_test['BsmtFinSF1'].fillna(0, inplace=True)
# df_test['BsmtFinType2'].fillna('', inplace=True)
# df_test['BsmtFinSF2'].fillna(0, inplace=True)
# df_test['BsmtUnfSF'].fillna(0, inplace=True)
# df_test['TotalBsmtSF'].fillna(0, inplace=True)
# df_test['BsmtFullBath'].fillna(0, inplace=True)
# df_test['BsmtHalfBath'].fillna(0, inplace=True)
# df_test['KitchenQual'].fillna('', inplace=True)
# df_test['Functional'].fillna('Typ', inplace=True)
# df_test['FireplaceQu'].fillna('', inplace=True)
# df_test['GarageType'].fillna('', inplace=True)
# df_test['GarageYrBlt'].fillna(df_test['YearBuilt'], inplace=True)
# df_test['GarageFinish'].fillna('', inplace=True)
# df_test['GarageCars'].fillna(0, inplace=True)
# df_test['GarageArea'].fillna(0, inplace=True)
# df_test['GarageQual'].fillna('', inplace=True)
# df_test['GarageCond'].fillna('', inplace=True)
# df_test['PoolQC'].fillna('', inplace=True)
# df_test['Fence'].fillna('', inplace=True)
# df_test['MiscFeature'].fillna('', inplace=True)
# df_test['SaleType'].fillna('', inplace=True)

In [6]:
df_train['E'].unique()

array([nan, 'Grvl', 'Pave'], dtype=object)

In [None]:
# teste de nones nas colunas
for column in df_test.columns:
    if not (len(df_test[column])==len(df_test[column].dropna())):
        print(column)

In [None]:
df_train.corr()['SalePrice'].sort_values(ascending=False)

In [None]:
categorical = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
               'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
               'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
               'RoofStyle']

In [None]:
dict_encoded_train = {}
for column in df_train.columns:
    try: 
        le = preprocessing.LabelEncoder()
        le.fit(df_train[column])
    
        dict_encoded_train.update({column: le.transform(df_train[column])})
    except: print(column)

In [None]:
dict_encoded_test = {}
for column in df_test.columns:
    try: 
        le = preprocessing.LabelEncoder()
        le.fit(df_test[column])
    
        dict_encoded_test.update({column: le.transform(df_test[column])})
    except: print(column)

In [None]:
for index, row in df_test[['BsmtUnfSF', 'FireplaceQu']].iterrows():
    if row['BsmtUnfSF'] == '': df_test.loc[index, 'BsmtUnfSF'] = 0
    try: df_test.loc[index, 'BsmtUnfSF'] = int(row['BsmtUnfSF'])
    except: print(index)

In [None]:
df_test['FireplaceQu'].unique()

In [None]:
dict_encoded_test = {}
for column in df_test.columns:
    #try: 
    le = preprocessing.LabelEncoder()
    le.fit(df_test[column])

    dict_encoded_test.update({column: le.transform(df_test[column])})
    #except: print(column)

In [None]:
pd.DataFrame(dict_encoded_train).columns

In [None]:
X_train = pd.DataFrame(dict_encoded_train)[['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation',
       'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea',
       'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
       'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning',
       'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold',
       'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual',
       'PavedDrive', 'PoolArea', 'PoolQC', 'RoofMatl', 'RoofStyle',
       'SaleCondition', 'SaleType', 'ScreenPorch', 'Street',
       'TotRmsAbvGrd', 'TotalBsmtSF', 'Utilities', 'WoodDeckSF', 'YearBuilt',
       'YearRemodAdd', 'YrSold']]

In [None]:
X_test = pd.DataFrame(dict_encoded_test)[['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation',
       'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea',
       'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
       'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning',
       'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold',
       'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual',
       'PavedDrive', 'PoolArea', 'PoolQC', 'RoofMatl', 'RoofStyle',
       'SaleCondition', 'SaleType', 'ScreenPorch', 'Street',
       'TotRmsAbvGrd', 'TotalBsmtSF', 'Utilities', 'WoodDeckSF', 'YearBuilt',
       'YearRemodAdd', 'YrSold']]

In [9]:
X = X_train
y = df_train['SalePrice'].ravel()

clf = RandomForestClassifier(n_jobs=-1, random_state=22)
clf.fit(X, y)

print(clf.feature_importances_)

NameError: name 'X_train' is not defined

In [None]:
X = X_test

pd.DataFrame(clf.predict(X)).set_index(df_test.index).rename(columns={0:'SalePrice'}).to_csv('second_submission.csv')