In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [18]:
# Load the input DataFrame for feature selection (presumably the output of the
# previous "step 4" notebook — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
feature_sele_df = pd.read_pickle('step_4_df.pkl')

# Feature Selection

In [29]:

# Build the feature matrix X and the log-transformed target y.
#
# 'log_price' is excluded from X together with 'Price': this cell writes
# 'log_price' back into feature_sele_df below, so on a second run of the cell
# the target would otherwise end up among the features (target leakage).
# errors='ignore' keeps the first run working, when 'log_price' does not exist yet.
# NOTE(review): X still contains 'Listing ID', which looks like an identifier
# rather than a predictor — confirm whether it should be dropped as well.
X = feature_sele_df.drop(columns=['Price', 'log_price'], errors='ignore')  # Features

# NOTE(review): np.log gives -inf / NaN for prices <= 0 — confirm earlier steps
# removed such rows.
feature_sele_df['log_price'] = np.log(feature_sele_df['Price'])

y = feature_sele_df['log_price']  # Target variable
X.shape

(14250, 104)

In [30]:
# Vote-based feature selection: fit six regressors on the training split and
# record, per feature, whether each model keeps it (1) or not (0).

# One seed for the split and for every stochastic estimator, so the vote table
# is reproducible after a kernel restart.
SEED = 42


def nonzero_votes(weights):
    """Return a 0/1 integer array marking the non-zero entries of `weights`.

    Parameters
    ----------
    weights : array-like
        Fitted coefficients (`coef_`) or importances (`feature_importances_`).

    Returns
    -------
    numpy.ndarray
        1 where the absolute value of the weight is greater than zero, else 0.
    """
    return (np.abs(np.asarray(weights)) > 0).astype(int)


# Split data into training and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Fit models and determine if a feature is selected (1) or not (0)
# NOTE(review): only Lasso has a sparsity-inducing penalty. For the other
# models a "non-zero" weight is true for almost every feature (the stored
# threshold table shows all 104 features with Sum >= 4), so their votes carry
# little information. Consider SelectFromModel with an explicit threshold.

# Lasso
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
lasso_selected = nonzero_votes(lasso.coef_)

# SVM
svm = LinearSVR(C=0.01, epsilon=0.1, random_state=SEED).fit(X_train, y_train)
svm_selected = nonzero_votes(svm.coef_)

# Gradient Boosting
gb = GradientBoostingRegressor(random_state=SEED).fit(X_train, y_train)
gb_selected = nonzero_votes(gb.feature_importances_)

# Random Forest
rf = RandomForestRegressor(random_state=SEED).fit(X_train, y_train)
rf_selected = nonzero_votes(rf.feature_importances_)

# Linear Regression
linear_reg = LinearRegression().fit(X_train, y_train)
linear_reg_selected = nonzero_votes(linear_reg.coef_)

# Ridge Regression
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_selected = nonzero_votes(ridge.coef_)

# One row per feature, one 0/1 column per model
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'SVM': svm_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'LinearRegression': linear_reg_selected,
    'Ridge': ridge_selected
})

# Sum the number of selections for each feature (0..6)
vote_columns = ['Lasso', 'SVM', 'GradientBoost', 'RandomForest', 'LinearRegression', 'Ridge']
selection_df['Sum'] = selection_df[vote_columns].sum(axis=1)

# Output the results
print(selection_df)


                     Feature  Lasso  SVM  GradientBoost  RandomForest  \
0                 Listing ID      1    1              1             1   
1         Host Response Rate      0    1              1             1   
2                   Latitude      0    1              1             1   
3                  Longitude      0    1              1             1   
4                Accomodates      1    1              1             1   
..                       ...    ...  ...            ...           ...   
99    Beds Binned_(2.0, 3.0]      0    1              1             1   
100   Beds Binned_(3.5, 4.0]      0    1              1             1   
101  Beds Binned_(4.0, 16.0]      0    1              1             1   
102    neighbourhood_Encoded      1    1              1             1   
103    Property Type_Encoded      1    1              1             1   

     LinearRegression  Ridge  Sum  
0                   1      1    6  
1                   1      1    5  
2              

In [31]:
# Count how many features survive each minimum-vote threshold.
# pandas is already imported as `pd` in the notebook's import cell, so the
# redundant mid-notebook re-import is not repeated here.

# Thresholds to test: one per possible vote total from the six models
thresholds = [1, 2, 3, 4, 5, 6]

# Map threshold -> number of features whose vote total 'Sum' reaches it.
# Summing the boolean mask avoids building a filtered Series just to take its
# length, and leaves no loop temporaries behind in the notebook namespace.
count_dict = {
    threshold: int((selection_df['Sum'] >= threshold).sum())
    for threshold in thresholds
}

# Convert the dictionary to a DataFrame for better visualization
count_df = pd.DataFrame(list(count_dict.items()), columns=['Threshold', 'Feature Count'])

# Print the results
print(count_df)

   Threshold  Feature Count
0          1            104
1          2            104
2          3            104
3          4            104
4          5             64
5          6             24


In [35]:
 #Selecting variables with a sum of selections >= 5
# Preview the full vote rows for features chosen by at least 5 of the 6 models.
# The preview gets its own name: `final_var` is the plain list of feature names
# built in the following cell, and binding a DataFrame to that same name here
# meant that re-running this cell afterwards broke `feature_sele_df[final_var]`.
final_var_preview = selection_df[selection_df['Sum'] >= 5]
final_var_preview

Unnamed: 0,Feature,Lasso,SVM,GradientBoost,RandomForest,LinearRegression,Ridge,Sum
0,Listing ID,1,1,1,1,1,1,6
1,Host Response Rate,0,1,1,1,1,1,5
2,Latitude,0,1,1,1,1,1,5
3,Longitude,0,1,1,1,1,1,5
4,Accomodates,1,1,1,1,1,1,6
...,...,...,...,...,...,...,...,...
99,"Beds Binned_(2.0, 3.0]",0,1,1,1,1,1,5
100,"Beds Binned_(3.5, 4.0]",0,1,1,1,1,1,5
101,"Beds Binned_(4.0, 16.0]",0,1,1,1,1,1,5
102,neighbourhood_Encoded,1,1,1,1,1,1,6


In [37]:

# Names of the features whose vote total 'Sum' is at least 5, as a plain list.
final_var = selection_df.loc[
    selection_df['Sum'].ge(5),
    'Feature',
].tolist()
final_var

['Listing ID',
 'Host Response Rate',
 'Latitude',
 'Longitude',
 'Accomodates',
 'Bathrooms',
 'Bedrooms',
 'Beds',
 'Guests Included',
 'Min Nights',
 'Reviews',
 'Overall Rating',
 'Accuracy Rating',
 'Cleanliness Rating',
 'Communication Rating',
 'Location Rating',
 'Value Rating',
 'Is Superhost_ind',
 'Distance_to_Center',
 'Location Rating_Avg',
 'Attractive_Neighbourhood',
 'Days Since Host Joined',
 'Days Since First Joined',
 'Days Since Last Joined',
 'Review Range (Days)',
 'Reviews per Day Ratio',
 'time_diff_reviews',
 'avg_polarity_all',
 'avg_polarity_top_5',
 'Host Response Time_Missing',
 'Host Response Time_within an hour',
 'Neighborhood Group_Friedrichshain-Kreuzberg',
 'Neighborhood Group_Marzahn - Hellersdorf',
 'Neighborhood Group_Mitte',
 'Neighborhood Group_Reinickendorf',
 'Room Type_Entire home/apt',
 'Room Type_Private room',
 'Room Type_Shared room',
 'Top10Neighbourhood_Mitte',
 'Top10Neighbourhood_Other',
 'Top10Neighbourhood_SchÃ¶neberg',
 'Top10Neighb

In [39]:
# Modelling frame: the selected feature columns followed by our Y, the 'Price' column.
df_model = feature_sele_df[[*final_var, 'Price']].copy()

In [40]:
# Summarise the modelling frame: column names, non-null counts and dtypes.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14250 entries, 0 to 14249
Data columns (total 65 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Listing ID                                   14250 non-null  float64
 1   Host Response Rate                           14250 non-null  float64
 2   Latitude                                     14250 non-null  float64
 3   Longitude                                    14250 non-null  float64
 4   Accomodates                                  14250 non-null  float64
 5   Bathrooms                                    14250 non-null  float64
 6   Bedrooms                                     14250 non-null  float64
 7   Beds                                         14250 non-null  float64
 8   Guests Included                              14250 non-null  float64
 9   Min Nights                                   14250 non-null  float64
 10

In [41]:
# Persist the modelling frame to 'step_5_df.pkl' (presumably the input of the
# next notebook step — confirm).
df_model.to_pickle('step_5_df.pkl')