In [1]:
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer, OneHotEncoder, LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

from patsy import dmatrix

import statsmodels.api as sm
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

# Read the data from csv file

In [2]:
# Load the training set (only train.csv is read in this cell).
data_train = pd.read_csv('train.csv')
data_train.shape

(21570, 292)

# Column-name preprocessing so that R-style formulas do not crash

In [3]:
def convert_column(data_train):
    """Return the frame's column names as a list, with '-' replaced by '_'.

    The DataFrame itself is not modified; the caller is expected to assign
    the returned list back to ``data_train.columns``.
    """
    return [
        name.replace('-', '_') if '-' in name else name
        for name in data_train.columns.values
    ]

In [4]:
# Some column names contain '-' signs, which crash the OLS algorithm, so
# they are rewritten with '_' instead.
# Reference: https://www.tutorialspoint.com/python/string_replace.htm
data_train_columns = convert_column(data_train)
data_train.columns = data_train_columns

# Positional split: columns 2..290 are the features, column 291 the target.
X, y = data_train.iloc[:, 2:291], data_train.iloc[:, 291]

# View how many data points are missing

In [5]:
# Number of missing entries per column; only the first rows are displayed.
missing_per_column = data_train.isnull()
missing_data_count = missing_per_column.sum()
missing_data_count.head()

id              0
timestamp       0
full_sq         0
life_sq      4539
floor         124
dtype: int64

# Some label encoding and data filtering to convert non-numerical values to numerical values and to get rid of information that does not make sense.

In [6]:
def label_encoding(X):
    """Encode the string-valued columns of ``X`` as numbers.

    ``sub_area``, ``product_type`` and the yes/no columns are label-encoded;
    ``ecology`` is mapped to an ordinal scale, with 'no data' becoming NaN.

    The input frame is copied first, so the caller's object (often a slice
    of the raw data) is never modified and no SettingWithCopyWarning is
    raised. Use the returned frame.

    Returns
    -------
    (X_encoded, encoder_dict)
        ``encoder_dict`` holds the fitted encoders / mapping under the keys
        'sub_area', 'product_type', 'yes_no' and 'ecology'.
    """
    X = X.copy()

    # sub_area encoding
    sub_area_le = LabelEncoder()
    X['sub_area'] = sub_area_le.fit_transform(X['sub_area'])

    # product_type encoding
    product_type_le = LabelEncoder()
    X['product_type'] = product_type_le.fit_transform(X['product_type'])

    # All the yes/no encodings.
    # The same encoder is refit for every column, so the instance stored
    # under 'yes_no' reflects only the last column fitted.
    yes_no_le = LabelEncoder()
    yes_no_list = ['thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion',
       'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion',
       'water_1line', 'big_road1_1line', 'railroad_1line', 'culture_objects_top_25']
    for column in yes_no_list:
        X[column] = yes_no_le.fit_transform(X[column])

    # ecology mapping; values missing from the mapping also become NaN
    ecology_mapping = {'excellent' : 4, 'good' : 3, 'satisfactory' : 2, 'poor' : 1, 'no data' : np.nan}
    X['ecology'] = X['ecology'].map(ecology_mapping)

    encoder_dict = {'sub_area' : sub_area_le, 'product_type' : product_type_le, 'yes_no' : yes_no_le,
                    'ecology' : ecology_mapping}
    return X, encoder_dict

def simple_filter(X):
    """Blank out implausible ``build_year`` and ``state`` values.

    ``build_year`` outside [1600, 2015] and ``state`` outside [1, 4] are
    replaced with NaN. Values that are already NaN stay NaN.

    A filtered copy is returned; the frame passed in is left untouched, so
    no SettingWithCopyWarning is raised when the input is derived from
    another frame.
    """
    X = X.copy()
    bad_build_year = (X['build_year'] > 2015) | (X['build_year'] < 1600)
    bad_state = (X['state'] > 4) | (X['state'] < 1)
    # mask() replaces the flagged entries with NaN, upcasting integer
    # columns to float where necessary.
    X['build_year'] = X['build_year'].mask(bad_build_year)
    X['state'] = X['state'].mask(bad_state)
    return X

In [7]:
# Perform encoding and mapping on an explicit copy of the column slice, so
# the encoder writes to an independent frame rather than a slice of
# data_train.
X, encoder_dict = label_encoding(data_train.iloc[:, 2:].copy())

# Drop every row that contains a NaN. The explicit copy detaches the result
# from X, so the filter below does not trigger a SettingWithCopyWarning.
X_drop = X.dropna().copy()

# Data description (taken before the build_year/state filter).
data_describe = X_drop.describe()

# build_year and state obviously need some modification.
X_drop = simple_filter(X_drop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
X_drop.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
5696,11,11.0,2.0,5.0,2.0,1907.0,1.0,12.0,3.0,0,...,339,135,26,133,207,1,89,161,10,2750000
5864,77,50.0,3.0,5.0,2.0,1957.0,3.0,8.0,2.0,0,...,214,85,21,48,89,1,54,146,12,11700000
5936,31,21.0,5.0,9.0,5.0,1962.0,1.0,5.0,3.0,0,...,99,43,8,17,41,1,18,93,5,6200000
5951,43,27.0,1.0,5.0,2.0,1960.0,2.0,5.0,2.0,0,...,24,4,0,20,40,0,10,101,7,5950000
5962,46,29.0,9.0,9.0,2.0,1969.0,2.0,5.0,3.0,0,...,7,4,0,9,15,0,0,42,3,5900000


In [9]:
X_drop.shape

(4218, 290)

# Drop the filtered nan values and reset the index

### Reference: https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-data-frame

In [10]:
# The filter turned out-of-range values into NaN: drop those rows as well,
# renumber the index from zero, and refresh the description to check that
# the filtering worked.
# Reference: https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-data-frame
X_drop = X_drop.dropna()
X_drop = X_drop.reset_index(drop = True)
data_describe = X_drop.describe()
X_drop.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,11,11.0,2.0,5.0,2.0,1907.0,1.0,12.0,3.0,0,...,339,135,26,133,207,1,89,161,10,2750000
1,77,50.0,3.0,5.0,2.0,1957.0,3.0,8.0,2.0,0,...,214,85,21,48,89,1,54,146,12,11700000
2,31,21.0,5.0,9.0,5.0,1962.0,1.0,5.0,3.0,0,...,99,43,8,17,41,1,18,93,5,6200000
3,43,27.0,1.0,5.0,2.0,1960.0,2.0,5.0,2.0,0,...,24,4,0,20,40,0,10,101,7,5950000
4,46,29.0,9.0,9.0,2.0,1969.0,2.0,5.0,3.0,0,...,7,4,0,9,15,0,0,42,3,5900000


In [11]:
X_drop.shape

(4110, 290)

# Check whether any strings are left
### Reference: https://stackoverflow.com/questions/21771133/finding-non-numeric-rows-in-dataframe-in-pandas

In [12]:
# np.isreal is applied to every cell; rows where it is not True for all
# columns are printed. An empty frame means no such rows exist.
print(X_drop[~X_drop.applymap(np.isreal).all(1)])

Empty DataFrame
Columns: [full_sq, life_sq, floor, max_floor, material, build_year, num_room, kitch_sq, state, product_type, sub_area, area_m, raion_popul, green_zone_part, indust_part, children_preschool, preschool_quota, preschool_education_centers_raion, children_school, school_quota, school_education_centers_raion, school_education_centers_top_20_raion, hospital_beds_raion, healthcare_centers_raion, university_top_20_raion, sport_objects_raion, additional_education_raion, culture_objects_top_25, culture_objects_top_25_raion, shopping_centers_raion, office_raion, thermal_power_plant_raion, incineration_raion, oil_chemistry_raion, radiation_raion, railroad_terminal_raion, big_market_raion, nuclear_reactor_raion, detention_facility_raion, full_all, male_f, female_f, young_all, young_male, young_female, work_all, work_male, work_female, ekder_all, ekder_male, ekder_female, 0_6_all, 0_6_male, 0_6_female, 7_14_all, 7_14_male, 7_14_female, 0_17_all, 0_17_male, 0_17_female, 16_29_all, 16_2

# Separate the columns by data type and re-partition the dataframe

In [13]:
# Nominal (unordered) categorical columns
nom_categorical_list =['material', 'product_type', 'sub_area', 'ID_metro', 'ID_railroad_station_walk',
                   'ID_railroad_station_avto', 'ID_big_road1',
                   'ID_big_road2', 'ID_railroad_terminal', 'ID_bus_terminal',
                   'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion',
                   'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion',
                   'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line',
                   'culture_objects_top_25']

# Ordinal (ordered) categorical columns
ord_categorical_list = ['state', 'ecology']

# Target / y column
target = 'price_doc'

# Whatever is neither categorical nor the target is treated as numeric.
filter_list = nom_categorical_list + ord_categorical_list + [target]
numeric_list = list(X_drop.columns)
for item in filter_list:
    numeric_list.remove(item)

# Re-partition the DataFrame: numeric -> ordinal -> nominal -> target.
X_drop = X_drop[numeric_list + ord_categorical_list + nom_categorical_list + [target]]
X_drop.tail()

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,area_m,raion_popul,green_zone_part,...,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,culture_objects_top_25,price_doc
4105,38,19.0,7.0,17.0,2006.0,1.0,7.0,17526510.0,118843,0.133373,...,0,0,0,0,0,0,0,0,0,6800000
4106,54,32.0,8.0,16.0,1979.0,2.0,9.0,24813850.0,174831,0.683844,...,1,0,0,0,0,0,0,0,0,9500000
4107,56,29.0,13.0,14.0,2001.0,2.0,11.0,9249237.0,156377,0.374068,...,0,0,0,0,0,0,0,0,0,12000000
4108,64,32.0,5.0,15.0,2003.0,2.0,11.0,6050065.0,78616,0.167526,...,1,0,0,0,0,0,0,0,0,13500000
4109,43,28.0,1.0,9.0,1968.0,2.0,6.0,4395333.0,94561,0.063755,...,1,0,0,0,0,0,0,0,0,5600000


# Standardize the numerical and ordinal category data

### References on how to handle ordinal category features:
### https://learnandteachstatistics.wordpress.com/2013/07/08/ordinal/
### https://www.ma.utexas.edu/users/mks/statmistakes/ordinal.html
### https://www3.nd.edu/~rwilliam/stats3/OrdinalIndependent.pdf

In [14]:
# Standardize the numeric and ordinal columns into a separate frame.
# X_drop.copy() is used instead of X_drop[:]: the slice is not an
# independent copy, so writing scaled values into it could leak back into
# X_drop and trigger SettingWithCopyWarning.
sc = StandardScaler()
scaled_columns = numeric_list + ord_categorical_list
X_drop_sc = X_drop.copy()
X_drop_sc[scaled_columns] = sc.fit_transform(X_drop_sc[scaled_columns])

# VIF analysis for variables with high correlation

### Reference: http://support.minitab.com/en-us/minitab/17/topic-library/modeling-statistics/regression-and-correlation/model-assumptions/what-is-a-variance-inflation-factor-vif/

In [15]:
def VIF_analysis(X):
    """Return a two-column table with the variance inflation factor of each column of ``X``.

    Column "VIF Factor" holds ``variance_inflation_factor(X.values, i)`` for
    column position ``i``; column "features" holds the matching column name.
    """
    design = X.values
    factors = []
    for position in range(X.shape[1]):
        factors.append(variance_inflation_factor(design, position))
    return pd.DataFrame({"VIF Factor": factors, "features": list(X.columns)},
                        columns = ["VIF Factor", "features"])

In [16]:
# VIF per group of closely related features, computed on the scaled frame.
metro_columns = ['metro_min_avto', 'metro_km_avto', 'metro_km_walk', 'metro_min_walk']
railroad_avto_columns = ['railroad_station_avto_min', 'railroad_station_avto_km']
railroad_walk_columns = ['railroad_station_walk_min', 'railroad_station_walk_km']
population_columns = ['raion_popul', 'children_school', 'children_preschool', 'full_all', 'male_f',
                      'female_f', 'young_all', 'young_male', 'young_female', 'work_all', 'work_male',
                      'work_female', 'ekder_all', 'ekder_male', 'ekder_female', '0_6_all', '0_6_male',
                      '0_6_female', '7_14_all', '7_14_male', '7_14_female', '0_17_all', '0_17_male',
                      '0_17_female', '16_29_all', '16_29_male', '16_29_female', '0_13_all', '0_13_male',
                      '0_13_female']

vif_metro = VIF_analysis(X_drop_sc[metro_columns])
vif_railroad_avto = VIF_analysis(X_drop_sc[railroad_avto_columns])
vif_railroad_walk = VIF_analysis(X_drop_sc[railroad_walk_columns])
vif_population = VIF_analysis(X_drop_sc[population_columns])

In [17]:
vif_metro.head()

Unnamed: 0,VIF Factor,features
0,7.736992,metro_min_avto
1,13.625832,metro_km_avto
2,inf,metro_km_walk
3,inf,metro_min_walk


In [18]:
vif_railroad_avto.head()

Unnamed: 0,VIF Factor,features
0,12.670775,railroad_station_avto_min
1,12.670775,railroad_station_avto_km


In [19]:
vif_railroad_walk.head()

Unnamed: 0,VIF Factor,features
0,inf,railroad_station_walk_min
1,inf,railroad_station_walk_km


In [20]:
# Order the population VIF table by ascending VIF and rebind the name to the sorted table.
vif_population = vif_population.sort_values('VIF Factor')
vif_population

Unnamed: 0,VIF Factor,features
1,2364.661,children_school
0,2450.194,raion_popul
4,10198.48,male_f
7,21233.13,young_male
9,31006.12,work_all
6,64298.74,young_all
8,90593.37,young_female
11,837874.2,work_female
26,inf,16_29_female
19,inf,7_14_male


In [21]:
# Based on the VIF analysis the following features will be dropped:
# metro features with VIF > 10 and population features with VIF > 10**7.
# NOTE(review): vif_railroad_avto / vif_railroad_walk are not consulted
# here, and the two cut-offs differ by six orders of magnitude — confirm
# both are intended.
metro_high_vif = vif_metro['VIF Factor'] > 10
population_high_vif = vif_population['VIF Factor'] > 10**7
vif_delete_list = (vif_metro.loc[metro_high_vif, 'features'].tolist()
                   + vif_population.loc[population_high_vif, 'features'].tolist())

In [22]:
# Drop the high-VIF features from both the standardized and the
# non-standardized frame, and derive the reduced numeric column list.
# Reference: https://stackoverflow.com/questions/28538536/deleting-multiple-columns-in-pandas
#
# The drops rebind the names instead of using inplace=True, and
# errors='ignore' tolerates already-removed columns, so the cell can be
# re-run without raising.
X_drop_sc = X_drop_sc.drop(vif_delete_list, axis = 1, errors = 'ignore')
X_drop = X_drop.drop(vif_delete_list, axis = 1, errors = 'ignore')

# Rebuilt from numeric_list on every run; original column order is kept.
numeric_list_vif = [name for name in numeric_list if name not in vif_delete_list]

In [23]:
X_drop.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,area_m,raion_popul,green_zone_part,...,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,culture_objects_top_25,price_doc
0,11,11.0,2.0,5.0,1907.0,1.0,12.0,10071560.0,102726,0.048791,...,1,0,0,0,0,0,0,0,1,2750000
1,77,50.0,3.0,5.0,1957.0,3.0,8.0,4662813.0,73007,0.109947,...,0,0,0,0,0,0,0,0,0,11700000
2,31,21.0,5.0,9.0,1962.0,1.0,5.0,18800000.0,125111,0.169625,...,1,0,0,0,0,0,0,0,1,6200000
3,43,27.0,1.0,5.0,1960.0,2.0,5.0,5586343.0,83502,0.08622,...,0,0,0,0,0,0,0,0,0,5950000
4,46,29.0,9.0,9.0,1969.0,2.0,5.0,7126815.0,87713,0.125882,...,0,0,0,1,0,0,0,0,0,5900000


# dmatrix format string manipulation

In [24]:
# Compile the right-hand-side formula string from which the design matrix
# is built.

# Numeric and ordinal categorical columns enter the formula as plain terms.
numeric_ord_cat_OLS_string = " + ".join(numeric_list_vif + ord_categorical_list)

# Nominal categorical columns are wrapped in C(), e.g. C(material).
category_OLS_list = ["C({})".format(item) for item in nom_categorical_list]
category_OLS_string = " + ".join(category_OLS_list)

# The complete formula string.
OLS_string = " + ".join([numeric_ord_cat_OLS_string, category_OLS_string])

In [25]:
print(OLS_string)

full_sq + life_sq + floor + max_floor + build_year + num_room + kitch_sq + area_m + raion_popul + green_zone_part + indust_part + preschool_quota + preschool_education_centers_raion + children_school + school_quota + school_education_centers_raion + school_education_centers_top_20_raion + hospital_beds_raion + healthcare_centers_raion + university_top_20_raion + sport_objects_raion + additional_education_raion + culture_objects_top_25_raion + shopping_centers_raion + office_raion + male_f + young_all + young_male + young_female + work_all + work_female + raion_build_count_with_material_info + build_count_block + build_count_wood + build_count_frame + build_count_brick + build_count_monolith + build_count_panel + build_count_foam + build_count_slag + build_count_mix + raion_build_count_with_builddate_info + build_count_before_1920 + build_count_1921_1945 + build_count_1946_1970 + build_count_1971_1995 + build_count_after_1995 + metro_min_avto + kindergarten_km + school_km + park_km + gr

# Constructing first set of variables using dmatrix encoding

In [26]:
# Create a dmatrix for encoding and keep its column names so they can be
# used to pick out the features with low p-values.
# Reference: https://stackoverflow.com/questions/23560104/fetching-names-from-designmatrix-in-patsy
dfX = dmatrix(OLS_string, data = X_drop)
dfX_columns = dfX.design_info.column_names
dfX = pd.DataFrame(dfX, columns = dfX_columns)

# Take the target from the same (non-standardized) frame as the features.
# price_doc is not among the scaled columns, so the values are the same as
# in X_drop_sc; this only removes the mixed-source inconsistency.
# NOTE(review): the design matrix is built from X_drop, not X_drop_sc, so
# the standardized data never reaches the OLS fits — confirm that is intended.
dfY = pd.DataFrame(X_drop['price_doc'])

# First OLS Run

In [27]:
# First OLS run on the full design matrix.
model1 = sm.OLS(endog = dfY, exog = dfX)
result1 = model1.fit()
print(result1.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.715
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                     12.52
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:07   Log-Likelihood:                -67569.
No. Observations:                4110   AIC:                         1.365e+05
Df Residuals:                    3424   BIC:                         1.408e+05
Df Model:                         685                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------

# Extract features whose p-value is below the chosen significance threshold

### Reference: https://stackoverflow.com/questions/37787698/how-to-sort-pandas-dataframe-from-one-column

In [28]:
def p_value_extract(result, confidence):
    """Return the p-values of ``result`` that lie below a threshold, sorted ascending.

    Parameters
    ----------
    result : object exposing a ``pvalues`` attribute (e.g. a fitted OLS result)
    confidence : float
        Cut-off applied to the p-values (a significance level such as 0.01);
        only entries with ``p_value < confidence`` are kept.

    Returns
    -------
    pandas.DataFrame
        One column, 'p_value', indexed like ``result.pvalues``; NaN p-values
        are excluded because they never compare below the cut-off.

    Reference: https://stackoverflow.com/questions/37787698/how-to-sort-pandas-dataframe-from-one-column
    """
    p_values = pd.DataFrame(result.pvalues)
    p_values.columns = ['p_value']
    # Filter rows directly with a boolean mask. Assigning a filtered frame
    # back into the column relied on index realignment and failed when
    # labels were duplicated.
    significant = p_values.loc[p_values['p_value'] < confidence]
    return significant.sort_values('p_value')

In [29]:
# Terms of the first fit with p-value below 0.01; the shape gives how many were kept.
p_values1 = p_value_extract(result1, 0.01)
p_values1.shape

(130, 1)

# Feature extraction from first OLS p values and second run of OLS

In [30]:
# Second pass: keep only the design-matrix columns that passed the first
# p-value cut and add a constant column before refitting.
extracted_feature1 = p_values1.index.tolist()
X_drop_sc_extract1 = dfX[extracted_feature1]
dfX2 = sm.add_constant(X_drop_sc_extract1)
dfX2.tail()

Unnamed: 0,const,full_sq,C(ID_railroad_station_avto)[T.69],C(ID_metro)[T.204],floor,kitch_sq,C(ID_metro)[T.125],C(ID_metro)[T.78],C(sub_area)[T.83],state,...,C(sub_area)[T.1],C(ID_railroad_station_walk)[T.69.0],cafe_count_2000_price_2500,C(ID_metro)[T.181],C(ID_metro)[T.50],C(ID_metro)[T.153],leisure_count_3000,C(ID_metro)[T.193],build_count_slag,C(ID_railroad_station_walk)[T.84.0]
4105,1,38.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4106,1,54.0,0.0,0.0,8.0,9.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
4107,1,56.0,0.0,0.0,13.0,11.0,0.0,0.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4108,1,64.0,0.0,0.0,5.0,11.0,0.0,0.0,0.0,2.0,...,0.0,0.0,9.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4109,1,43.0,0.0,0.0,1.0,6.0,0.0,0.0,0.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0


In [31]:
# Second OLS run, after the initial feature extraction.
model2 = sm.OLS(endog = dfY, exog = dfX2)
result2 = model2.fit()
print(result2.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.624
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     52.43
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:07   Log-Likelihood:                -68137.
No. Observations:                4110   AIC:                         1.365e+05
Df Residuals:                    3983   BIC:                         1.373e+05
Df Model:                         126                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------------------


In [32]:
# Terms of the second fit with p-value below 0.01.
p_values2 = p_value_extract(result2, 0.01)
p_values2.shape

(47, 1)

# Feature extraction from second OLS p values and third run of OLS

In [33]:
# Third pass: restrict the design matrix to the terms kept after the second fit.
extracted_feature2 = p_values2.index.tolist()
X_drop_sc_extract2 = dfX2[extracted_feature2]
dfX3 = X_drop_sc_extract2

# Third OLS run on the reduced design matrix.
model3 = sm.OLS(endog = dfY, exog = dfX3)
result3 = model3.fit()
print(result3.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.604
Model:                            OLS   Adj. R-squared:                  0.600
Method:                 Least Squares   F-statistic:                     141.1
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:07   Log-Likelihood:                -68241.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4065   BIC:                         1.369e+05
Df Model:                          44                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [34]:
# Terms of the third fit with p-value below 0.05.
# NOTE(review): this pass uses a 0.05 cut-off while the other passes use
# 0.01 — confirm this is intentional.
p_values3 = p_value_extract(result3, 0.05)
p_values3.shape

(44, 1)

# Feature extraction from third OLS p values and fourth run of OLS

In [35]:
# Fourth pass: restrict the design matrix to the terms kept after the third fit.
extracted_feature3 = p_values3.index.tolist()
X_drop_sc_extract3 = dfX3[extracted_feature3]
dfX4 = X_drop_sc_extract3

# Fourth OLS run on the reduced design matrix.
model4 = sm.OLS(endog = dfY, exog = dfX4)
result4 = model4.fit()
print(result4.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.603
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     150.8
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:08   Log-Likelihood:                -68248.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4068   BIC:                         1.368e+05
Df Model:                          41                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [36]:
# Terms of the fourth fit with p-value below 0.01.
p_values4 = p_value_extract(result4, 0.01)
p_values4.shape

(38, 1)

# Feature extraction from fourth OLS p values and fifth run of OLS

In [37]:
# Fifth pass: restrict the design matrix to the terms kept after the fourth fit.
extracted_feature4 = p_values4.index.tolist()
X_drop_sc_extract4 = dfX4[extracted_feature4]
dfX5 = X_drop_sc_extract4

# Fifth OLS run on the reduced design matrix.
model5 = sm.OLS(endog = dfY, exog = dfX5)
result5 = model5.fit()
print(result5.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.601
Model:                            OLS   Adj. R-squared:                  0.598
Method:                 Least Squares   F-statistic:                     170.5
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:08   Log-Likelihood:                -68258.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4073   BIC:                         1.368e+05
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [38]:
# Terms of the fifth fit with p-value below 0.01.
p_values5 = p_value_extract(result5, 0.01)
p_values5.shape

(35, 1)

# Feature extraction from 5th OLS p values and 6th run of OLS

In [39]:
# Sixth pass: restrict the design matrix to the terms kept after the fifth fit.
extracted_feature5 = p_values5.index.tolist()
X_drop_sc_extract5 = dfX5[extracted_feature5]
dfX6 = X_drop_sc_extract5

# Sixth OLS run on the reduced design matrix.
model6 = sm.OLS(endog = dfY, exog = dfX6)
result6 = model6.fit()
print(result6.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.596
Method:                 Least Squares   F-statistic:                     184.7
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:08   Log-Likelihood:                -68267.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4076   BIC:                         1.368e+05
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [40]:
# Terms of the sixth fit with p-value below 0.01.
p_values6 = p_value_extract(result6, 0.01)
p_values6.shape

(34, 1)

# Feature extraction from 6th OLS p values and 7th run of OLS

In [41]:
# Seventh pass: restrict the design matrix to the terms kept after the sixth fit.
extracted_feature6 = p_values6.index.tolist()
X_drop_sc_extract6 = dfX6[extracted_feature6]
dfX7 = X_drop_sc_extract6

# Seventh OLS run on the reduced design matrix.
model7 = sm.OLS(endog = dfY, exog = dfX7)
result7 = model7.fit()
print(result7.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.596
Method:                 Least Squares   F-statistic:                     190.2
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:08   Log-Likelihood:                -68269.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4077   BIC:                         1.368e+05
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [42]:
# Terms of the seventh fit with p-value below 0.01.
p_values7 = p_value_extract(result7, 0.01)
p_values7.shape

(34, 1)

# Feature extraction from 7th OLS p values and 8th run of OLS

In [43]:
# Eighth pass: restrict the design matrix to the terms kept after the
# seventh fit. The columns are taken from dfX7, the matrix that fit was run
# on; the original selected from dfX6, a copy-paste slip that only worked
# because dfX6 is a superset of dfX7.
extracted_feature7 = list(p_values7.index.values)
X_drop_sc_extract7 = dfX7.loc[:, extracted_feature7]
dfX8 = X_drop_sc_extract7

# Eighth OLS run on the reduced design matrix.
model8 = sm.OLS(dfY, dfX8)
result8 = model8.fit()
print(result8.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.596
Method:                 Least Squares   F-statistic:                     190.2
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:08   Log-Likelihood:                -68269.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4077   BIC:                         1.368e+05
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [44]:
# Terms of the eighth fit with p-value below 0.01.
p_values8 = p_value_extract(result8, 0.01)
p_values8.shape

(34, 1)

# Feature extraction from 8th OLS p values and 9th run of OLS

In [45]:
# Ninth pass: restrict the design matrix to the terms kept after the eighth fit.
extracted_feature8 = p_values8.index.tolist()
X_drop_sc_extract8 = dfX8[extracted_feature8]
dfX9 = X_drop_sc_extract8

# Ninth OLS run on the reduced design matrix.
model9 = sm.OLS(endog = dfY, exog = dfX9)
result9 = model9.fit()
print(result9.summary())

                            OLS Regression Results                            
Dep. Variable:              price_doc   R-squared:                       0.599
Model:                            OLS   Adj. R-squared:                  0.596
Method:                 Least Squares   F-statistic:                     190.2
Date:                Wed, 25 Oct 2017   Prob (F-statistic):               0.00
Time:                        10:45:08   Log-Likelihood:                -68269.
No. Observations:                4110   AIC:                         1.366e+05
Df Residuals:                    4077   BIC:                         1.368e+05
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------
fu

In [46]:
# Terms of the ninth fit with p-value below 0.01.
p_values9 = p_value_extract(result9, 0.01)
p_values9.shape

(34, 1)