In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the DataFrame written by the previous pipeline step.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
feature_sele_df = pd.read_pickle('step_4_df.pkl')

# Feature Selection Script

## Overview
This script performs feature selection on a dataset to determine which features are most important for predicting a target variable, `Price`. It utilizes multiple regression models to evaluate the significance of each feature and selects those that meet certain criteria.

## Libraries Used
- **Pandas**: Data manipulation and analysis.
- **NumPy**: Numerical operations.
- **Seaborn**: Data visualization.
- **Scikit-Learn**: Machine learning models and feature selection.

## Script Breakdown

### 1. Data Loading
- Load the dataset from `step_4_df.pkl`.

### 2. Feature Selection
- **Data Preparation**: 
  - Define feature matrix `X` and target variable `y`.
  - Encode string columns using `OrdinalEncoder`.
- **Model Fitting**: 
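  - Fit various regression models (Lasso, linear SVR, Gradient Boosting, Random Forest, Linear Regression, Ridge) on the data.
  - Mark a feature as selected (1) by a model when its coefficient (linear models) or feature importance (tree ensembles) is non-zero, and as not selected (0) otherwise.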
- **Results Compilation**:
  - Create a DataFrame `selection_df` to store feature selection results from all models.
  - Calculate the sum of selections for each feature across all models.

### 3. Feature Count by Threshold
- Define thresholds to test and count the number of features selected for each threshold.

### 4. Final Feature Selection
- Select features with a sum of selections greater than or equal to 4.
- Save the final selected features, together with the target `Price`, into a new DataFrame `df_model` and write it to `step_5_df.pkl`.




In [5]:

# Split the loaded frame into the target and the candidate predictors.
y = feature_sele_df['Price']                  # target variable
X = feature_sele_df.drop(columns=['Price'])   # every other column is a feature

# Show (rows, columns) of the feature matrix as the cell output.
X.shape

(12583, 59)

In [105]:
# Ordinal-encode every pandas "string"-dtype column of X so that the
# estimators fitted later receive purely numeric input.
string_columns = X.select_dtypes(include=['string']).columns

# Fit ONE encoder on all string columns at once. Refitting the same encoder
# inside a per-column loop keeps only the last column's category mapping,
# which makes it impossible to inverse-transform the codes or to encode new
# data consistently later on.
encoder = OrdinalEncoder()

# Guard for re-runs: once the columns have been encoded the selection above is
# empty. Fitting on zero columns may raise depending on the scikit-learn
# version, and there is nothing to do in that case anyway.
if len(string_columns) > 0:
    encoded = encoder.fit_transform(X[string_columns])
    for position, col in enumerate(string_columns):
        # Replace each string column with its float codes; the categories of
        # column i are available afterwards as encoder.categories_[i].
        X[col] = encoded[:, position]

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12583 entries, 0 to 12582
Data columns (total 59 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Listing ID                    12583 non-null  float64
 1   Host Response Time            12583 non-null  float64
 2   Host Response Rate            12583 non-null  float64
 3   neighbourhood                 12583 non-null  float64
 4   Neighborhood Group            12583 non-null  float64
 5   Latitude                      12583 non-null  float64
 6   Longitude                     12583 non-null  float64
 7   Property Type                 12583 non-null  float64
 8   Room Type                     12583 non-null  float64
 9   Accomodates                   12583 non-null  float64
 10  Bathrooms                     12583 non-null  float64
 11  Bedrooms                      12583 non-null  float64
 12  Beds                          12583 non-null  float64
 13  G

In [106]:

# Fit six regressors on the full data and record, per model, whether each
# feature is selected (1) or not (0). A feature counts as selected when its
# coefficient (linear models) or its feature importance (tree ensembles) is
# non-zero.
#
# NOTE(review): ordinary least squares and Ridge do not drive coefficients to
# exactly zero, so their votes are expected to be 1 for almost every feature;
# consider a magnitude threshold (e.g. SelectFromModel) if a stricter vote is
# wanted. The features are also not scaled before fitting — confirm that this
# is intended for the Lasso / LinearSVR votes.

# Fixed seed so the stochastic estimators (LinearSVR, Gradient Boosting,
# Random Forest) produce the same votes on every Restart & Run All.
RANDOM_STATE = 42

# Lasso
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Linear support vector regression
svm = LinearSVR(C=0.01, epsilon=0.1, random_state=RANDOM_STATE).fit(X, y)
svm_selected = (np.abs(svm.coef_) > 0).astype(int)

# Gradient Boosting
gb = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

# Random Forest
rf = RandomForestRegressor(random_state=RANDOM_STATE).fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Linear Regression
linear_reg = LinearRegression().fit(X, y)
linear_reg_selected = (np.abs(linear_reg.coef_) > 0).astype(int)

# Ridge Regression
ridge = Ridge(alpha=1.0).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# One 0/1 vote column per model, keyed by the column name used in the table.
model_votes = {
    'Lasso': lasso_selected,
    'SVM': svm_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'LinearRegression': linear_reg_selected,
    'Ridge': ridge_selected,
}

# Create a DataFrame to store results: one row per feature, one column per model.
selection_df = pd.DataFrame({'Feature': X.columns, **model_votes})

# Sum the number of models that selected each feature (0..6).
selection_df['Sum'] = selection_df[list(model_votes)].sum(axis=1)

# Rich display of the result table (last expression of the cell).
selection_df


                         Feature  Lasso  SVM  GradientBoost  RandomForest  \
0                     Listing ID      1    1              1             1   
1             Host Response Time      0    1              1             1   
2             Host Response Rate      1    1              1             1   
3                  neighbourhood      1    1              1             1   
4             Neighborhood Group      1    1              1             1   
5                       Latitude      1    1              1             1   
6                      Longitude      1    1              1             1   
7                  Property Type      0    0              0             0   
8                      Room Type      1    1              1             1   
9                    Accomodates      1    1              1             1   
10                     Bathrooms      1    1              1             1   
11                      Bedrooms      1    1              1             1   

In [107]:
# Minimum number of model votes ('Sum') to try out.
thresholds = [1, 2, 3, 4, 5, 6]

# For each threshold, count how many features have at least that many votes.
count_dict = {
    min_votes: int((selection_df['Sum'] >= min_votes).sum())
    for min_votes in thresholds
}

# Tabulate threshold -> number of surviving features.
count_df = pd.DataFrame(
    list(count_dict.items()),
    columns=['Threshold', 'Feature Count'],
)

# Print the results
print(count_df)

   Threshold  Feature Count
0          1             59
1          2             57
2          3             57
3          4             57
4          5             54
5          6             41


In [108]:
 #Selecting variables with a sum of selections >= 4
# Preview the full vote table for the features that at least 4 models selected.
# This preview gets its own name: `final_var` is reserved for the plain list of
# feature names built in the following cell, so running cells out of order can
# no longer leave a DataFrame where a list of column names is expected.
final_var_votes = selection_df[selection_df['Sum'] >= 4]
final_var_votes

Unnamed: 0,Feature,Lasso,SVM,GradientBoost,RandomForest,LinearRegression,Ridge,Sum
0,Listing ID,1,1,1,1,1,1,6
1,Host Response Time,0,1,1,1,1,1,5
2,Host Response Rate,1,1,1,1,1,1,6
3,neighbourhood,1,1,1,1,1,1,6
4,Neighborhood Group,1,1,1,1,1,1,6
5,Latitude,1,1,1,1,1,1,6
6,Longitude,1,1,1,1,1,1,6
8,Room Type,1,1,1,1,1,1,6
9,Accomodates,1,1,1,1,1,1,6
10,Bathrooms,1,1,1,1,1,1,6


In [113]:
# Names of the features whose vote count ('Sum') is at least 4.
enough_votes = selection_df['Sum'].ge(4)
final_var = selection_df.loc[enough_votes, 'Feature'].tolist()
final_var

['Listing ID',
 'Host Response Time',
 'Host Response Rate',
 'neighbourhood',
 'Neighborhood Group',
 'Latitude',
 'Longitude',
 'Room Type',
 'Accomodates',
 'Bathrooms',
 'Bedrooms',
 'Beds',
 'Guests Included',
 'Min Nights',
 'Reviews',
 'Overall Rating',
 'Accuracy Rating',
 'Cleanliness Rating',
 'Checkin Rating',
 'Communication Rating',
 'Location Rating',
 'Value Rating',
 'Is Superhost_ind',
 'Is Exact Location_ind',
 'Top10Neighbourhood',
 'Host Response Rate_missing',
 'Checkin Rating_missing',
 'Value Rating_missing',
 'Location Rating_missing',
 'Accuracy Rating_missing',
 'Communication Rating_missing',
 'Cleanliness Rating_missing',
 'Overall Rating_missing',
 'Distance_to_Center',
 'Location Rating_Avg',
 'Attractive_Neighbourhood',
 'Days Since Host Joined',
 'Host Missing Indicator',
 'Days Since First Joined',
 'First Missing Indicator',
 'Days Since Last Joined',
 'Last Missing Indicator',
 'Review Range (Days)',
 'Reviews per Day Ratio',
 'Bedrooms Binned',
 'Gue

In [120]:
# Build the modelling frame from the originally loaded data (not the encoded X):
# the selected feature columns, with the target column 'Price' attached.
df_model = feature_sele_df[final_var].assign(Price=feature_sele_df['Price'])

In [121]:
# Summarise column names, non-null counts and dtypes of the modelling frame.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12583 entries, 0 to 12582
Data columns (total 58 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Listing ID                    12583 non-null  float64
 1   Host Response Time            12583 non-null  string 
 2   Host Response Rate            12583 non-null  float64
 3   neighbourhood                 12583 non-null  string 
 4   Neighborhood Group            12583 non-null  string 
 5   Latitude                      12583 non-null  float64
 6   Longitude                     12583 non-null  float64
 7   Room Type                     12583 non-null  string 
 8   Accomodates                   12583 non-null  float64
 9   Bathrooms                     12583 non-null  float64
 10  Bedrooms                      12583 non-null  float64
 11  Beds                          12583 non-null  float64
 12  Guests Included               12583 non-null  float64
 13  M

In [122]:
# Display the modelling frame as the cell output for a visual sanity check.
df_model

Unnamed: 0,Listing ID,Host Response Time,Host Response Rate,neighbourhood,Neighborhood Group,Latitude,Longitude,Room Type,Accomodates,Bathrooms,...,Accomodates Binned,Beds Binned,Latitude Binned,Longitude Binned,time_diff_reviews,avg_polarity_all,avg_sentiment_all,avg_polarity_top_5,avg_sentiment_top_5,Price
0,13761071.0,within a few hours,1.0,Mitte,Mitte,52.50920,13.41762,Entire home/apt,6.0,1.0,...,"(5.0, 16.0]","(3.5, 12.0]",2,2,8.0,0.346960,1.0,0.310000,1.0,109.0
1,13763834.0,Missing,-2.0,NeukÃ¶lln,NeukÃ¶lln,52.46713,13.42733,Entire home/apt,2.0,1.0,...,"(1.0, 2.0]","(1.0, 2.0]",1,2,28.0,0.368619,1.0,0.194133,1.0,50.0
2,13764102.0,Missing,-2.0,Mitte,Mitte,52.53294,13.40649,Private room,2.0,1.0,...,"(1.0, 2.0]","(0.0, 1.0]",3,2,-2.0,0.000000,0.0,0.000000,0.0,69.0
3,13764166.0,Missing,-2.0,Charlottenburg,Charlottenburg-Wilm.,52.52670,13.30916,Private room,2.0,1.0,...,"(1.0, 2.0]","(0.0, 1.0]",2,1,-2.0,0.000000,0.0,0.000000,0.0,30.0
4,13765505.0,Missing,-2.0,Prenzlauer Berg,Pankow,52.53924,13.43521,Private room,1.0,1.0,...,"(0.499, 1.0]","(0.0, 1.0]",3,2,13.0,0.381355,1.0,0.385260,1.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12578,9991497.0,Missing,-2.0,NiederschÃ¶neweide,Treptow - KÃ¶penick,52.45494,13.51423,Entire home/apt,6.0,1.0,...,"(5.0, 16.0]","(1.0, 2.0]",1,3,0.0,0.000000,0.0,0.000000,0.0,50.0
12579,9993168.0,Missing,-2.0,Rummelsburg,Lichtenberg,52.50099,13.49185,Private room,2.0,1.0,...,"(1.0, 2.0]","(0.0, 1.0]",2,2,6.0,0.295728,1.0,0.334960,1.0,20.0
12580,9994644.0,within an hour,1.0,Kreuzberg,Friedrichshain-Kreuzberg,52.50283,13.37799,Entire home/apt,4.0,1.0,...,"(3.0, 4.0]","(2.0, 3.0]",2,2,3.0,0.392852,1.0,0.536611,1.0,60.0
12581,999465.0,Missing,-2.0,Friedrichshain,Friedrichshain-Kreuzberg,52.51001,13.45366,Entire home/apt,9.0,1.5,...,"(5.0, 16.0]","(3.5, 12.0]",2,2,109.0,0.250186,1.0,0.250186,1.0,150.0


In [119]:
# Persist the modelling frame to 'step_5_df.pkl'.
# NOTE(review): this cell's execution count (119) is lower than that of the
# cells that build df_model (120-122) — re-run the notebook top to bottom
# before relying on the saved file.
df_model.to_pickle('step_5_df.pkl')