In [43]:
import pandas as pd
import os 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, model_selection
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from scipy.stats import chisquare
import csv
from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest

import datetime as dt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

np.seterr(divide='ignore', invalid='ignore')

%matplotlib inline 


In [2]:
def fix_num_cols(dframe, skiplist, dropcol=1):
    n = 0
    m = 0
    df_tmp = dframe.copy()
    for col in dframe.columns.values:
        if col not in skiplist:
            if isinstance(df_tmp[col].dtype,((object))):
                try:
                    df_tmp[col].fillna(df_tmp[col].median(), inplace=True)
                except:
                    if dropcol:
                        df_tmp.drop(col, inplace=True, axis=1)
                    else:
                        df_tmp[col].fillna('NUL', inplace=True)
    return df_tmp

In [3]:
def fix_data(df_, skip):
    catlist = []
    rest = []
    df_temp = df_.copy()
    
    # create 2 new lists: one for categorical data and the other for numerical
    with open('cols.csv', 'r') as csvfile:
        file = csv.reader(csvfile, delimiter=' ')
        for row in file:
            cl = row[0].split(",")[1].replace('"','').replace("'",'').replace('(','')
            if cl not in skip:
                tp = row[0].split(",")[0]
                if tp == 'cat':
                    catlist.append(cl)
                else:
                    rest.append(cl)
    # cat data turned into dummy variables and old cat column dropped
    df_cat = df_temp[catlist].copy()           
    for col in catlist:
        df_cat = pd.merge(df_cat, pd.get_dummies(df_cat[col].astype('category').cat.codes, prefix=col)\
                             ,left_index=True, right_index=True)
        df_cat.drop([col],inplace=True, axis=1)
        
    # continous numeric data scaled ... 
    scaler = preprocessing.StandardScaler()
    df_ = fix_num_cols(df_temp[rest],skip)
    scaler.fit(df_)
    df_new = pd.merge(pd.DataFrame(scaler.transform(df_), columns=rest), \
                      df_cat,left_index=True, right_index=True)
    df_new['price_cat'] = df_temp['price_cat']
    
    return (df_new, rest)


In [4]:
km_list = ['additional_education_km', 'basketball_km', 'big_church_km', 'big_market_km', 'big_road1_km', \
           'big_road2_km', 'bulvar_ring_km', 'bus_terminal_avto_km', 'catering_km', 'cemetery_km', \
           'church_synagogue_km', 'detention_facility_km', 'exhibition_km', 'fitness_km', 'green_zone_km', \
           'hospice_morgue_km', 'ice_rink_km', 'incineration_km', 'industrial_km', 'kindergarten_km', 'kremlin_km', \
           'market_shop_km', 'metro_km_avto', 'metro_km_walk', 'mkad_km', 'mosque_km', 'museum_km',\
           'nuclear_reactor_km', 'office_km', 'oil_chemistry_km', 'park_km', 'power_transmission_line_km',\
           'preschool_km', 'public_healthcare_km', 'public_transport_station_km', 'radiation_km', 'railroad_km', \
           'railroad_station_avto_km', 'railroad_station_walk_km', 'sadovoe_km', 'school_km', 'shopping_centers_km', \
           'stadium_km', 'swim_pool_km', 'theater_km', 'thermal_power_plant_km', 'ts_km', 'ttk_km', \
           'university_km', 'water_km', 'water_treatment_km', 'workplaces_km', 'zd_vokzaly_avto_km']
len(km_list)

53

In [5]:
ig_list = ['ID_railroad_station_walk', 'additional_education_km', 'area_m', 'basketball_km', 'big_church_km', \
           'big_market_km', 'big_road1_km', 'big_road2_km', 'build_count_1946-1970', 'build_count_1971-1995',\
           'build_count_after_1995', 'build_count_block', 'build_count_brick', 'build_count_panel', 'bulvar_ring_km',\
           'bus_terminal_avto_km', 'cafe_avg_price_1000', 'cafe_avg_price_1500', 'cafe_avg_price_2000', \
           'cafe_avg_price_3000', 'cafe_avg_price_5000', 'cafe_sum_1000_min_price_avg',\
           'cafe_sum_1500_max_price_avg', 'cafe_sum_1500_min_price_avg', 'cafe_sum_2000_max_price_avg',\
           'cafe_sum_2000_min_price_avg', 'cafe_sum_3000_max_price_avg', 'cafe_sum_3000_min_price_avg',\
           'cafe_sum_5000_max_price_avg', 'cafe_sum_5000_min_price_avg', 'catering_km', 'cemetery_km', \
           'church_synagogue_km', 'detention_facility_km', 'exhibition_km', 'fitness_km', 'green_part_1000',\
           'green_part_1500', 'green_part_2000', 'green_part_3000', 'green_part_500', 'green_part_5000', \
           'green_zone_km', 'green_zone_part', 'hospice_morgue_km', 'ice_rink_km', 'incineration_km',\
           'indust_part', 'industrial_km', 'kindergarten_km', 'kremlin_km', 'life_sq', 'market_shop_km',\
           'metro_km_avto', 'metro_km_walk', 'metro_min_avto', 'metro_min_walk', 'mkad_km', 'mosque_km',\
           'museum_km', 'nuclear_reactor_km', 'num_room', 'office_km', 'oil_chemistry_km', 'park_km', \
           'power_transmission_line_km', 'preschool_km', 'preschool_quota', 'prom_part_1000', 'prom_part_1500',\
           'prom_part_2000', 'prom_part_3000', 'prom_part_5000', 'public_healthcare_km', \
           'public_transport_station_km', 'public_transport_station_min_walk', 'radiation_km', 'railroad_km', \
           'railroad_station_avto_km', 'railroad_station_avto_min', 'railroad_station_walk_km', \
           'railroad_station_walk_min', 'raion_build_count_with_builddate_info', \
           'raion_build_count_with_material_info', 'sadovoe_km', 'school_km', 'school_quota', \
           'shopping_centers_km', 'stadium_km', 'swim_pool_km', 'theater_km', 'thermal_power_plant_km',\
           'ts_km', 'ttk_km', 'university_km', 'water_km', 'water_treatment_km', 'workplaces_km', \
           'zd_vokzaly_avto_km', '0_13_all', '0_13_female', '0_13_male', '0_17_all', '0_17_female', \
           '0_17_male', '0_6_all', '0_6_female', '0_6_male', '16_29_all', '16_29_female', '16_29_male',\
           '7_14_all', '7_14_female', '7_14_male', 'ID_metro', 'ID_railroad_station_avto', 'cafe_count_1500',\
           'cafe_count_2000', 'cafe_count_2000_price_1000', 'cafe_count_3000', 'cafe_count_3000_price_1000', \
           'cafe_count_3000_price_1500', 'cafe_count_3000_price_2500', 'cafe_count_3000_price_500', \
           'cafe_count_5000', 'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500', \
           'cafe_count_5000_price_2500', 'cafe_count_5000_price_500', 'children_preschool', \
           'children_school', 'ekder_all', 'ekder_female', 'ekder_male', 'female_f', 'full_all', \
           'full_sq', 'male_f', 'office_count_5000', 'office_sqm_1500', 'office_sqm_2000', \
           'office_sqm_3000', 'office_sqm_5000', 'raion_popul', 'sport_count_3000', 'sport_count_5000',\
           'trc_count_3000', 'trc_count_5000', 'trc_sqm_1000', 'trc_sqm_1500', 'trc_sqm_2000', 'trc_sqm_3000',\
           'trc_sqm_5000', 'work_all', 'work_female', 'work_male', 'young_all', 'young_female', 'young_male',\
           'diff_sq', 'plus_sq', 'km_sum']
len(ig_list)

162

In [6]:
ID_list = ['ID_railroad_terminal' ,
'ID_railroad_station_walk' ,
'ID_big_road1' ,
'ID_big_road2' ,
'ID_bus_terminal' ,
'ID_metro' ,
'ID_railroad_station_avto' ]

ID_list

['ID_railroad_terminal',
 'ID_railroad_station_walk',
 'ID_big_road1',
 'ID_big_road2',
 'ID_bus_terminal',
 'ID_metro',
 'ID_railroad_station_avto']

In [84]:
min_price_avg_list = []
for col in ig_list:
    if '_min_price_avg' in col:
        min_price_avg_list.append(col)
min_price_avg_list
df_train['cafe_sum_min_price_avg_ALL'] = pd.Series([sum(df_train.loc[i, min_price_avg_list])\
                                                    for i in range(df_train.shape[0])])

In [87]:
max_price_avg_list = []
for col in ig_list:
    if '_max_price_avg' in col:
        max_price_avg_list.append(col)
max_price_avg_list
df_train['cafe_sum_max_price_avg_ALL'] = pd.Series([sum(df_train.loc[i, max_price_avg_list])\
                                                    for i in range(df_train.shape[0])])

In [95]:
new_xlist = []
for col in x_list:
    if col not in max_price_avg_list + min_price_avg_list:
        new_xlist.append(col)
        
print(len(new_xlist))

731


In [22]:
# information gain --> want to verify that 1.0 is the highest value for info gain... 
# answer is 1.05 but that's probably just a rounding error. For all intents and purposes it's 1.0 ...
print (dt.datetime.now())
colz = ['prom_part_1000', 'prom_part_1500',\
           'prom_part_2000', 'prom_part_3000', 'prom_part_5000']
colx = ['prom_part_sum']
df_train['prom_part_sum'] = pd.Series([sum(df_train.loc[i, colz]) for i in range(df_train.shape[0])])
#colx = ['price_cat','diff_sq', 'plus_sq', 'cafe_sum_min_price_avg_ALL', 'cafe_sum_max_price_avg_ALL']
res = list(zip(mutual_info_classif(df_train[colx], df_train['price_cat']), colx))
print(sorted(res,reverse=True))
print (dt.datetime.now())

2018-11-20 17:08:55.937226
[(0.1765988731256436, 'prom_part_sum')]
2018-11-20 17:09:52.687095


In [84]:
x_list = [col for col in list(df_train.columns) if col not in skip]
len(x_list)

1343

In [7]:
if __name__ == "__main__":
    print (dt.datetime.now())
    skip = ['price_cat','id','timestamp','price_doc']
    dir_path = os.getcwd() + '/../../../data/all/'
    print (dir_path)
    df = pd.read_csv(dir_path + 'train.csv')
    df_macro = pd.read_csv(dir_path + 'macro.csv')

    df['price_length'] = [7.5 if cat > 6e6 and cat < 1e7 else len(str(cat)) for cat in df['price_doc'] ]    
    df['price_length'] = [l if l < 8 else 8 for l in df['price_length']]    
    df['price_cat'] = df['price_length'].astype('category').cat.codes    
    df['quarter'] = df['timestamp'].apply(lambda x: pd.Timestamp(x).quarter)
    
    df.loc[df.price_cat == 4, 'price_cat'] = 3
    df_eda = pd.merge(df, df_macro, how='inner', on='timestamp', left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)
    
    null_yr = 9999.0
    yr = dt.datetime.now().year
    # take care of bad dates, dates too old, too far out into the future, or NaN. Set them all to 9999.0
    for i in df_eda['build_year'].index:
        if df_eda.loc[i, 'build_year'] < 1700 \
                or np.isnan(df_eda.loc[i, 'build_year']) \
                or df_eda.loc[i, 'build_year'] > yr:
            df_eda.loc[i, 'build_year'] = null_yr

    df_eda.drop(['price_length','id', 'timestamp'], inplace=True, axis=1)
    df_eda.loc[df_eda[df_eda['state'] == 33.0].index, 'state'] = 3.0
    
    df_train, rest = fix_data(df_eda, skip)
    df_train['diff_sq'] = df_train['full_sq'] - df_train['kitch_sq']
    df_train['plus_sq'] = df_train['full_sq'] + df_train['life_sq'] 
    df_train['km_sum'] = pd.Series([sum(df_train.loc[i, km_list]) for i in range(df_train.shape[0])])

    x_list = [col for col in list(df_train.columns) if col not in skip]
    
    print (dt.datetime.now())

2018-11-20 16:13:09.293405
/Users/chadleonard/Springboard/work/springboard/capstone/projects/capstone1_Sberbank/../../../data/all/
2018-11-20 16:14:26.647829


In [8]:
len(x_list)

1342

In [41]:
# set all outliers greater than 3 (abs value) std devs from the mean to 3 or -3, respectively.
for col in rest:
    iz = df_train.loc[df_train[col] > 3, col].index
    iy = df_train.loc[df_train[col] < -3, col].index
    df_train.loc[iz, col] = 3.0
    df_train.loc[iy, col] = -3.0

In [10]:
ig_list = []
for l in res:
    if l[0] >= .1:
    #if l[0] >= .2:
        ig_list.append(l[1])
        
print(len(ig_list))

156


In [87]:
#x_list = ig_list
X_train, X_test, y_train, y_test = model_selection.train_test_split(df_train.loc[idx_list, x_list], \
                                                                df_train.loc[idx_list, 'price_cat'], test_size=0.3)
X_train.shape

(10264, 1343)

In [74]:
df_train.shape

(30471, 1344)

In [89]:
selector=SelectKBest(score_func=f_classif,k=40)
selector.fit(X_train,y_train)
#print("scores_:",selector.scores_)
#print("pvalues_:",selector.pvalues_)
#print("selected index:",selector.get_support(True))
new_df = pd.DataFrame(selector.transform(X_train))
print("after transform:",new_df) 

after transform:              0         1         2         3         4         5         6   \
0     -0.598095  0.246350 -0.380028 -0.454934 -0.531762 -0.498352 -0.550915   
1      1.764354  1.146256  2.650169  0.856621  1.525071  3.291423 -0.674468   
2     -0.264618 -0.533177 -0.065294  0.381850 -0.586664 -0.090877 -0.107354   
3     -0.202229 -0.202437 -0.516727 -0.671815 -1.010969 -0.672010 -0.551041   
4     -0.736765 -0.619212 -0.565733 -0.606498 -0.512912 -0.579773 -0.217714   
5     -0.458995  0.063459 -0.913468 -0.450465 -0.599099 -0.177221 -0.559539   
6      1.197128  0.665173  1.076640  3.001402  1.104286  0.501977  2.412182   
7      0.562008  0.182523  0.490431  1.544852  1.261182  0.282735  1.439806   
8     -0.517981 -0.319179 -0.602200 -0.816760  0.005472 -0.542088 -0.583607   
9     -0.834567 -0.415037 -0.612953 -0.497681 -0.548011 -0.781185 -0.531015   
10    -0.638568 -0.523315 -0.291144 -0.355237  0.079372 -0.587832 -0.517865   
11    -0.541029 -0.552813 -0.275420

 1153 1174 1190 1197 1200 1306 1312 1319 1322 1328 1329 1333 1334 1335] are constant.


In [90]:
import collections
od = collections.OrderedDict(sorted(dict(zip(selector.scores_, x_list)).items(),reverse=True))
len(od)

1107

In [54]:
y_train.value_counts()

2    3428
3    3343
1    3343
0     150
Name: price_cat, dtype: int64

In [56]:
idx0 = df_train[df_train['price_cat'] == 0].index
idx1 = df_train[df_train['price_cat'] == 1].index
idx2 = df_train[df_train['price_cat'] == 2].index
idx3 = df_train[df_train['price_cat'] == 3].index
print(len(idx1))
print(len(idx2))

13750
11678


In [57]:
idx_list = list(df_train.loc[idx1].sample(4810).index) + \
list(df_train.loc[idx2].sample(4810).index) + \
list(df_train.loc[idx3].sample(4810).index) + \
list(df_train.loc[idx0].sample(233).index)

len(idx_list)

14663

In [107]:
df_train['price_cat'].value_counts()

1    13750
2    11678
3     4810
0      233
Name: price_cat, dtype: int64

In [9]:
# information gain
print (dt.datetime.now())
res = list(zip(mutual_info_classif(df_train[x_list], df_train['price_cat']), x_list))
print(sorted(res,reverse=True))
print (dt.datetime.now())

2018-11-20 16:17:56.885355
[(0.2942888824757721, 'plus_sq'), (0.2764085914332337, 'diff_sq'), (0.2064760726605448, 'full_sq'), (0.20191972925437596, 'bulvar_ring_km'), (0.20104612871902194, 'trc_sqm_5000'), (0.20067771048941685, 'sadovoe_km'), (0.19728027035254492, 'kremlin_km'), (0.1958285017214807, 'zd_vokzaly_avto_km'), (0.19500805572137048, 'km_sum'), (0.19383762751939204, 'park_km'), (0.1931876403485293, 'cafe_avg_price_5000'), (0.19222991469254969, 'big_church_km'), (0.18987566908548348, 'oil_chemistry_km'), (0.18948060595818794, 'ts_km'), (0.18928974148154887, 'ttk_km'), (0.18912420970326282, 'swim_pool_km'), (0.1890306196801932, 'mosque_km'), (0.18894454364771174, 'school_km'), (0.18861186633883809, 'catering_km'), (0.1886113606362425, 'nuclear_reactor_km'), (0.1885839884238878, 'detention_facility_km'), (0.18836335078359223, 'power_transmission_line_km'), (0.18827305437748865, 'metro_km_walk'), (0.18704231967821672, 'office_km'), (0.1864907514084111, 'metro_km_avto'), (0.18637

In [161]:
# information gain --> want to verify that 1.0 is the highest value for info gain... 
# answer is 1.05 but that's probably just a rounding error. For all intents and purposes it's 1.0 ...
print (dt.datetime.now())
for c in new_idlist:
    #colx = ['price_cat','diff_sq', 'plus_sq', 'km_sum', c] 
    colx = [c]
    res = list(zip(mutual_info_classif(df_train[colx], df_train['price_cat'], discrete_features=False), colx))
    #print(sorted(res,reverse=True))
    #print(res[4][0])
    if res[0][0] >= .1:
        print(res[0])
print (dt.datetime.now())

2018-11-20 13:03:44.361024
2018-11-20 13:09:24.102185


In [26]:
X_train.shape

(10264, 156)

In [29]:
pca = PCA(n_components=4)
pca.fit(X_train, y_train)
print(pca.explained_variance_ratio_) 
df_pca = pd.DataFrame(pca.components_)
df_pca = df_pca.T
df_pca.head()
df_pca['price_cat'] = y_train
df_pca.head()

[0.91631618 0.01936983 0.0146216  0.00632571]


Unnamed: 0,0,1,2,3,price_cat
0,0.018874,0.018434,0.003948,0.037242,
1,0.018723,0.009193,-0.002402,0.101526,
2,0.025314,0.001319,-0.001615,-0.004686,
3,0.022988,0.007974,0.003869,0.005698,3.0
4,0.013424,-0.023305,0.027166,-0.072643,3.0


In [92]:
print (dt.datetime.now())
pca = PCA(n_components=20)# adjust yourself
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)
clf = RandomForestClassifier(criterion='entropy', n_estimators=500)
clf.fit(X_t_train, y_train)
print ('score', clf.score(X_t_test, y_test))
print ('pred label', clf.predict(X_t_test))
print (dt.datetime.now())

2018-11-20 21:50:05.670015
score 0.7515344396453739
pred label [3 3 1 ... 2 3 3]
2018-11-20 21:50:43.682135


In [41]:
print (dt.datetime.now())
pca = PCA(n_components=150)# adjust yourself
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)
clf = XGBClassifier(objective='multi:softmax', n_estimators=500)
clf.fit(X_t_train, y_train)
print ('score', clf.score(X_t_test, y_test))
print ('pred label', clf.predict(X_t_test))
print (dt.datetime.now())

2018-11-20 18:31:57.285884
score 0.758808820186406
pred label [2 2 2 ... 2 1 2]
2018-11-20 18:35:55.206863


In [91]:
# test model where data is no more than 3 std devs (in either direction) from mean for continuous data
print (dt.datetime.now())
pipe = Pipeline([
  #('pca', PCA()),
  ('clf', RandomForestClassifier(criterion='entropy'))
])

cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.3)
param_grid = { "clf__n_estimators": [500]} #, "pca__n_components": [5,10] , "pca__whiten": [True,False] }
pipe_est_cv = model_selection.GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='accuracy', \
                                           verbose=3, n_jobs=5)

#pipe_est_cv.fit(X_train, y_train)
pipe_est_cv.fit(new_df, y_train)
#pipe_est_cv.fit(df_train[ig_list], df_train['price_cat'])
print(pipe_est_cv.best_params_)
print(pipe_est_cv.score(new_df, y_train)) # 1.0  10 splits
print(pipe_est_cv.score(selector.transform(X_test), y_test))
#print(pipe_est_cv.score(X_test, y_test))   # 0.7908553926930649
print (dt.datetime.now())

2018-11-20 21:46:34.847649
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] clf__n_estimators=500 ...........................................
[CV] clf__n_estimators=500 ...........................................
[CV] clf__n_estimators=500 ...........................................
[CV] clf__n_estimators=500 ...........................................
[CV] clf__n_estimators=500 ...........................................
[CV] .. clf__n_estimators=500, score=0.7840909090909091, total=  23.6s
[CV] clf__n_estimators=500 ...........................................
[CV] .. clf__n_estimators=500, score=0.7782467532467533, total=  23.9s
[CV] .. clf__n_estimators=500, score=0.7918831168831169, total=  23.8s
[CV] clf__n_estimators=500 ...........................................
[CV] .. clf__n_estimators=500, score=0.7892857142857143, total=  23.8s
[CV] clf__n_estimators=500 ...........................................
[CV] clf__n_estimators=500 .................................

[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed:   26.3s remaining:   26.3s


[CV] ... clf__n_estimators=500, score=0.797077922077922, total=  23.7s
[CV] .. clf__n_estimators=500, score=0.7902597402597402, total=  23.7s
[CV] .. clf__n_estimators=500, score=0.7831168831168831, total=  23.6s
[CV] .. clf__n_estimators=500, score=0.7948051948051948, total=  23.6s
[CV] .. clf__n_estimators=500, score=0.7909090909090909, total=  23.4s


[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:   50.5s finished


{'clf__n_estimators': 500}
0.9928877630553391
0.7842691520800182
2018-11-20 21:47:54.870393


In [81]:
pipe_est_cv_results = pipe_est_cv.cv_results_
print(pipe_est_cv_results['mean_train_score'])
print(pipe_est_cv_results['mean_test_score'])
print(pipe_est_cv.score(selector.transform(X_test), y_test))



[0.9877046]
[0.78536031]
0.9847692657422141


In [93]:
# First XGBoost Classifier model
print (dt.datetime.now())
pipe = Pipeline([
      # ('scaling', preprocessing.StandardScaler()),
       #('pca', PCA(whiten=True)),
       ('clf', XGBClassifier()) 
    ])

cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.2)
param_grid = { "clf__n_estimators": [200], \
              "clf__objective": ['multi:softmax'] , "clf__max_depth": [20]}
pipe_est_cv = model_selection.GridSearchCV(pipe, param_grid=param_grid, cv=cv, \
                                           scoring='accuracy', verbose=3, n_jobs=5)

pipe_est_cv.fit(X_train, y_train)
pipe_est_cv_results = pipe_est_cv.cv_results_

print(pipe_est_cv.best_params_)
print(pipe_est_cv.best_score_)

print(pipe_est_cv_results['mean_train_score'])
print(pipe_est_cv_results['mean_test_score'])
print (dt.datetime.now())

2018-11-20 10:55:12.922932
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax 
[CV] clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax 
[CV] clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax 
[CV] clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax 
[CV] clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax 
[CV]  clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax, score=0.7986404125644632, total=33.4min
[CV]  clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax, score=0.8082512892639475, total=33.5min


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 33.6min remaining: 50.4min


[CV]  clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax, score=0.8089545241443976, total=33.6min
[CV]  clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax, score=0.8087201125175809, total=33.6min
[CV]  clf__max_depth=20, clf__n_estimators=200, clf__objective=multi:softmax, score=0.8037974683544303, total=33.7min


[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 33.8min finished


{'clf__max_depth': 20, 'clf__n_estimators': 200, 'clf__objective': 'multi:softmax'}
0.8056727613689639
[1.]
[0.80567276]
2018-11-20 12:03:47.110039




In [16]:
y_out = list(pipe_est_cv.predict(X_test))
y_out[:5]

[1, 3, 2, 2, 3]

In [17]:
result = []
y_in = list(y_test)
for a in range(len(y_in)):
    if y_in[a] != y_out[a]:
        result.append((y_in[a], y_out[a]))
print(len(result))
#sorted(result)

919


In [18]:
d = defaultdict(int)
for a in result:
    d[a] += 1
    
ys = [(v,k) for k,v in d.items()]
sorted(ys,reverse=True)

[(289, (1, 2)),
 (215, (2, 1)),
 (129, (3, 2)),
 (119, (2, 3)),
 (72, (1, 3)),
 (29, (0, 1)),
 (28, (0, 2)),
 (12, (3, 1)),
 (9, (3, 0)),
 (9, (0, 3)),
 (4, (2, 0)),
 (4, (1, 0))]

In [105]:
# information gain --> want to verify that 1.0 is the highest value for info gain... 
# answer is 1.05 but that's probably just a rounding error. For all intents and purposes it's 1.0 ...
print (dt.datetime.now())
df1['diff_sq'] = df1['full_sq'] - df1['life_sq']
colx = ['price_cat','price_doc', 'area_m', 'diff_sq']
res = list(zip(mutual_info_classif(df1[colx], df1['price_cat']), colx))
print(sorted(res,reverse=True))
print (dt.datetime.now())

2018-11-19 13:36:21.710964
[(1.0578597018780778, 'price_cat'), (1.054515396339213, 'price_doc'), (0.15398174695474132, 'area_m'), (0.13284162703609503, 'diff_sq')]
2018-11-19 13:36:24.936279


In [None]:
price_doc: sale price (this is the target variable)
id: transaction id
timestamp: date of transaction
full_sq: total area in square meters, including loggias, balconies and other non-residential areas
life_sq: living area in square meters, excluding loggias, balconies and other non-residential areas
floor: for apartments, floor of the building
max_floor: number of floors in the building
material: wall material
build_year: year built
num_room: number of living rooms
kitch_sq: kitchen area
state: apartment condition
product_type: owner-occupier purchase or investment
sub_area: name of the district


In [124]:
df1['product_type'] = df['product_type'].astype('category').cat.codes

In [157]:
# information gain --> want to verify that 1.0 is the highest value for info gain... 
# answer is 1.05 but that's probably just a rounding error. For all intents and purposes it's 1.0 ...
print (dt.datetime.now())
df1['diff_sq'] = df1['full_sq'] - df1['kitch_sq']
df1['plus_sq'] = df1['full_sq'] + df1['life_sq'] #+ df1['kitch_sq']
df1['sub_area'] = df['sub_area'].astype('category').cat.codes
colx = ['price_cat','price_doc', 'diff_sq', 'plus_sq'\
       ,'num_room', 'sub_area']
res = list(zip(mutual_info_classif(df1[colx], df1['price_cat'], discrete_features=False), colx))
print(sorted(res,reverse=True))
print (dt.datetime.now())

2018-11-19 14:20:24.339551
[(1.0577776566547508, 'price_cat'), (1.0544989872945476, 'price_doc'), (0.19406288855386378, 'plus_sq'), (0.18125053277230507, 'diff_sq'), (0.1601665206825107, 'sub_area'), (0.11116952979989203, 'num_room')]
2018-11-19 14:20:26.912005


In [130]:
df1['build_year'].value_counts()
pass