In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
train = pd.read_csv("train.csv", header = 0)
test = pd.read_csv("test.csv", header = 0)
labels = train.SalePrice

In [3]:
train_temp = train["SalePrice"].copy()

In [4]:
train.drop(["Id", "SalePrice"], axis=1, inplace=True)
test.drop(["Id"], axis=1, inplace=True)

In [5]:
print("{0:15}\t{1}\t{2}".format("Name", "NaNs", "#unique"))
for col in train.columns:
    print("{0:15}\t{1:.4f}\t{2}".format(col, train[col].isnull().mean(), train[col].nunique()))

Name           	NaNs	#unique
MSSubClass     	0.0000	15
MSZoning       	0.0000	5
LotFrontage    	0.1774	110
LotArea        	0.0000	1073
Street         	0.0000	2
Alley          	0.9377	2
LotShape       	0.0000	4
LandContour    	0.0000	4
Utilities      	0.0000	2
LotConfig      	0.0000	5
LandSlope      	0.0000	3
Neighborhood   	0.0000	25
Condition1     	0.0000	9
Condition2     	0.0000	8
BldgType       	0.0000	5
HouseStyle     	0.0000	8
OverallQual    	0.0000	10
OverallCond    	0.0000	9
YearBuilt      	0.0000	112
YearRemodAdd   	0.0000	61
RoofStyle      	0.0000	6
RoofMatl       	0.0000	8
Exterior1st    	0.0000	15
Exterior2nd    	0.0000	16
MasVnrType     	0.0055	4
MasVnrArea     	0.0055	327
ExterQual      	0.0000	4
ExterCond      	0.0000	5
Foundation     	0.0000	6
BsmtQual       	0.0253	4
BsmtCond       	0.0253	4
BsmtExposure   	0.0260	4
BsmtFinType1   	0.0253	6
BsmtFinSF1     	0.0000	637
BsmtFinType2   	0.0260	6
BsmtFinSF2     	0.0000	144
BsmtUnfSF      	0.0000	780
TotalBsmtSF    	0.0000	72

Здесь представлены, как количественные (площадь участка, кол-во комнат, площадь этажей и т.п.), так и качественные (класс здания, год постройки, форма участка, материал крыши и т.п.) признаки. Также можем выделить набор признаков для описания гаража, набор для описания подвала, для жилой части здания и т.п.

In [6]:
all = pd.concat([train, test], axis=0)

In [7]:
to_drop = ["Street","Utilities","MiscFeature"]
all.drop(to_drop, axis=1, inplace=True, errors="ignore")

to_replace_to_none = ["Fence", "FireplaceQu", "Alley","MasVnrType","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1"
                     ,"BsmtFinType2","Electrical","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC"]
for col in to_replace_to_none:
    all[col].fillna("No" + col, inplace=True)
    
#to_replace_to_zero = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF","TotalBsmtSF","BsmtHalfBath"
#                     ,"MasVnrArea","GarageArea","BsmtFullBath"]
#for col in to_replace_to_zero:
#    all[col].fillna(0, inplace=True)
    
#to_replace_to_avg = ["LotFrontage"]
#for col in to_replace_to_avg:
#    all[col].fillna(all[col].mean(), inplace=True)
    
all["KitchenQual"].fillna("TA", inplace=True)
    
all.fillna(0, inplace=True)

In [8]:
def binarize(data, columns):
    """
    binarize feature
    
    data: pd.csv dataset
    columns: list of cstegorical columns to process
    """
    binarized_data = data.copy()
    for column in columns:
        unique_items = set(data[column])
        for unique_item in unique_items:
            new_column = []
            for item in data[column]:
                new_column.append(int(item==unique_item))
            binarized_data[column+'_'+str(unique_item)] = new_column
        binarized_data.drop(column, axis=1, inplace=True)
    return binarized_data

In [9]:
all[['LotArea', 'LotFrontage', 'YearBuilt']].describe()

Unnamed: 0,LotArea,LotFrontage,YearBuilt
count,2919.0,2919.0,2919.0
mean,10168.11408,57.766701,1971.312778
std,7886.996359,33.481636,30.291442
min,1300.0,0.0,1872.0
25%,7478.0,43.0,1953.5
50%,9453.0,63.0,1973.0
75%,11570.0,78.0,2001.0
max,215245.0,313.0,2010.0


In [10]:
all = binarize(all, ["LandContour","LotConfig","LotShape","LandSlope","Neighborhood","BldgType","HouseStyle", "ExterQual"
                     ,"CentralAir", "SaleCondition", "PavedDrive", "KitchenQual"
                    ])

In [11]:
categorial = ["Condition1", "Condition2", "FireplaceQu","Exterior1st", "Exterior2nd", "RoofMatl","Alley"
              , "MasVnrType","ExterCond","RoofStyle", "Electrical", "Heating","SaleType", "HeatingQC"
                  , "Foundation", "BsmtQual", "BsmtCond", "BsmtFinType1", "BsmtFinType2","MSZoning"
                   , "Functional", "GarageFinish", "BsmtExposure","GarageType", "Fence"
                    , "GarageQual", "GarageCond"]
for col in categorial:
    all[col] = all[col].factorize()[0]

In [12]:
to_drop = ['GarageArea', 'GarageYrBlt', "PoolQC", "MSSubClass","MoSold"
          , "Electrical", "Heating", "Functional", "FireplaceQu"
          ,"SaleType","ScreenPorch","YearRemodAdd","BsmtFullBath","BsmtHalfBath"
          ,"BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","1stFlrSF","2ndFlrSF","LowQualFinSF"
          , "GarageCond", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"
          #,"BsmtHalfBath"
          ]
all = all.drop(to_drop, axis=1)

In [13]:
buck = ["YearBuilt"#,"GrLivArea","TotalBsmtSF"        ,"LotArea","LotFrontage"
       ]
coef = [[1850,1940,1960,1980,1990,2000,2005,2100],
        #[100,500,1000,1500,2000,10000],
       # [100,500,1000,1500,10000],
        #[1000,3000,5000,7000,9000,11000,13000,15000,17000,20000,20000000],
       # [20,50,100,10000]
       ]

In [14]:
for x in range(len(buck)):
    bucketed = np.array(all[buck[x]])
    for y in range(len(coef[x])-1):
        bucketed[(bucketed >= coef[x][y]) & (bucketed < coef[x][y+1])] = coef[x][y]
    all[buck[x]] = bucketed

In [15]:
to_flat = ["TotRmsAbvGrd","HalfBath","BedroomAbvGr","GarageCars","PoolArea"]#,"YearBuilt","GrLivArea","LotArea","LotFrontage"]
    #,"BsmtFullBath","BsmtHalfBath"]
coef = [10,1,4,3,100]#,2015,2500,20000,100]
for x in range(len(to_flat)):
    flatten = np.array(all[to_flat[x]]) 
    flatten[flatten >= coef[x]] = coef[x]
    all[to_flat[x]] = flatten

e = pd.Series(train_temp)
train_temp = all.iloc[:train.shape[0]].astype('float').assign(e=e.values)
corr_matrix1 = train_temp.corr().abs()
upper1 = corr_matrix1.where(np.triu(np.ones(corr_matrix1.shape), k=1).astype(np.bool))
upper1["e"]

s1 = upper1["SalePrice"]
all = all.drop(s1[s1<0.05].index, axis=1)

In [16]:
corr_matrix = all.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [17]:
train = all.iloc[:train.shape[0]].astype('float')
test = all.iloc[train.shape[0]:].astype('float')

Linear Regression

In [18]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [19]:
def submit(preds):
    submission = pd.read_csv("sample_submission.csv")
    submission["SalePrice"] = preds
    submission.to_csv("submission.csv", index=False)

In [20]:
regr = LinearRegression(normalize=True, n_jobs=1)
regr.fit(train, np.log1p(labels))
preds = regr.predict(test)
submit(np.expm1(preds))

In [21]:
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [-2.64508813e-04 -1.63007941e-04  1.46539079e-06  1.32922297e-02
 -4.89144374e-03 -3.31155310e-02  5.85306649e-02  4.16098894e-02
  6.26686096e-04  2.98960418e-02 -4.40885787e-02  1.87262110e-03
 -2.39550491e-03 -2.66319293e-03 -2.47209131e-05 -1.14439146e-02
 -1.84015441e-02  2.90656058e-03 -1.07946354e-02  4.29410738e-03
 -1.21292463e-02  4.94262209e-03  6.39229487e-05 -1.30437066e-02
  2.16825111e-04  3.80906339e-02  4.29824087e-02 -2.42823727e-03
 -9.65114754e-02  1.28355053e-02  3.29908935e-02 -1.25095938e-02
  5.25871029e-03  5.91537001e-02  4.21060744e-03 -1.00582251e-03
 -2.31304529e-04  6.39778624e-07 -2.92602950e-03 -5.84209312e+10
 -5.84209312e+10 -5.84209313e+10 -5.84209312e+10  1.08013790e+11
  1.08013790e+11  1.08013790e+11  1.08013790e+11  1.08013790e+11
  3.09499366e+10  3.09499366e+10  3.09499366e+10  3.09499366e+10
 -1.40276878e+10 -1.40276878e+10 -1.40276878e+10  3.91887217e+10
  3.91887217e+10  3.91887217e+10  3.91887217e+10  3.91887217e+10
  3.91887

In [22]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=7, random_state=0)

In [23]:
from sklearn.model_selection import cross_val_score
t = cross_val_score(regr, train, np.log1p(labels), scoring="mean_squared_error", cv=cv).mean()

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [24]:
print(int(100000*t))

-2153


Лучший результат на Kaggle для линейной регрессии - 0.13284. Но из-за постоянных экспериментов я потерял те параметры, с которыми этот результат был достигнут. Текущие параметры дают результат чуть хуже.
https://www.kaggle.com/pavelkuleshov


DecisionTreeRegressor

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

In [None]:
regr_1 = DecisionTreeRegressor(max_depth=3, random_state=10)
regr_2 = DecisionTreeRegressor(max_depth=5, random_state=10)
regr_1.fit(train, np.log1p(labels))
regr_2.fit(train, np.log1p(labels))

In [None]:
print(cross_val_score(regr_1, train, np.log1p(labels), scoring="mean_squared_error", cv=cv).mean())
print(cross_val_score(regr_2, train, np.log1p(labels), scoring="mean_squared_error", cv=cv).mean())

In [None]:
preds = regr_2.predict(test)
submit(np.expm1(preds))

RandomForest

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(n_estimators=100)

In [None]:
rf.fit(train, labels)

importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(20, 8))
plt.title("Feature importances")
plt.bar(range(train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(train.shape[1]), train.columns[indices])
plt.xlim([-1, train.shape[1]])
plt.show()

In [None]:
for feature in zip(train.columns, rf.feature_importances_):
    print(feature)

In [None]:
rr = np.array(train.columns)
rrr = np.stack((rf.feature_importances_,rr), axis=-1)
rrr = np.sort(rrr, axis=0,)

In [None]:
dr = []
for i in range(50):
    dr.append(rrr[len(rrr)-1-i,1])

In [None]:
train = train[dr]
test = test[dr]

In [None]:
from sklearn.grid_search import GridSearchCV

rf = RandomForestClassifier(n_estimators=100, bootstrap=True)
params = {
    'n_estimators': [50, 100, 500]
}
gsv = GridSearchCV(estimator=rf, param_grid=params, scoring='accuracy', cv=3, verbose=1)
gsv.fit(train, labels)

In [None]:
print(gsv.best_params_, gsv.best_score_)

In [None]:
preds = gsv.predict(test)
submit(preds)

In [None]:
gsv.score(train,labels)

In [None]:
cross_val_score(gsv, train, labels, scoring='accuracy')

GradientBoostingRegressor

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [None]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(train, labels)
print(cross_val_score(clf, train, np.log1p(labels), scoring="mean_squared_error", cv=cv).mean())

In [None]:
preds = clf.predict(test)
submit(preds)

In [None]:
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, train.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

In [None]:
rr = np.array(train.columns[sorted_idx])

In [None]:
dr = []
for i in range(50):
    dr.append(rr[len(rr)-1-i])

In [None]:
train = train[dr]
test = test[dr]