Two Sigma Connect: Rental Listing Inquiries 
===
---

 - This classification model predicts the degree of popularity for a rental listing based on listings' attribute such as the number of rooms, location, price, etc.  
 - It predicts whether a given listing would receive "low," "medium," or
   "high" interest with its corresponding probability to a particular listing.

---
**Multiclass Classifier with Probability Estimates**
---
As target variable Interest level has 3 values. This dataset is a good candidate for multiclass classifier. 

**Datasets**
---
NYC rent listing data from the rental website RentHop which is used to find the desired home.
Data is freely available for non commercial use at See https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/data

In [1]:
# import libraries

import collections
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn import preprocessing
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
# import data

data_path = r"C:\Users\kdeodhar\twoSigma\input"
data_path

'C:\\Users\\kdeodhar\\twoSigma\\input'

In [3]:
# seperate split training dataframe in X and y
# X has all the fetures
# y has target variable

train_file = data_path + r"\train.json"
train = pd.read_json(train_file)
train['interest_level'] = train['interest_level'].astype('category')
train['interest_level'] = train['interest_level'].cat.codes
X = train.drop('interest_level', axis=1)
y = train['interest_level']
y.columns = ['interest_level']
print("Training data loaded")

Training data loaded


In [4]:
# preprocess X (training data)
# I am keeping only numerical variables for this run

X['created'] = pd.to_datetime(X['created'])
X['year'] = X['created'].dt.year
X['month'] = X['created'].dt.month
X['day'] = X['created'].dt.day

num_photos = []
for photo in X['photos']:
    num_photos.append(len(photo))
num_photos_df = pd.DataFrame(num_photos, index=X.index, columns=['num_photos'])
X = X.assign(num_photos=num_photos_df)

len_description = []
for description in X['description']:
    len_description.append(len(description))
len_description_df = pd.DataFrame(len_description, index=X.index, columns=['len_description'])
X = X.assign(len_description=len_description_df)

num_features = []
for feature in X['features']:
    num_features.append(len(feature))
num_features_df = pd.DataFrame(num_features, index=X.index, columns=['num_features'])
X = X.assign(num_features=num_features_df)

X = X.drop('building_id', axis=1)
X = X.drop('created', axis=1)
X = X.drop('description', axis=1)
X = X.drop('display_address', axis=1)
X = X.drop('features', axis=1)
X = X.drop('manager_id', axis=1)
X = X.drop('photos', axis=1)
X = X.drop('street_address', axis=1)
X = X.drop('listing_id', axis=1)


In [5]:
# split the dataset 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [7]:
# convert dataset to arrays.

X_train = np.asarray(X_train, dtype='int')
y_train = np.asarray(y_train, dtype='int')
X_test = np.asarray(X_test, dtype='int')
y_test = np.asarray(y_test, dtype='int')

In [8]:
# Feature Scaling

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Fitting Random Forest Classification to the Training set

classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [10]:
# Predicting the Test set results

y_pred = classifier.predict(X_test)

In [11]:
# Evaluating the Algorithm

confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))
accuracy_score(y_test, y_pred)

array([[ 173,  514,  254],
       [  83, 8001,  524],
       [ 115, 2109,  565]], dtype=int64)

             precision    recall  f1-score   support

          0       0.47      0.18      0.26       941
          1       0.75      0.93      0.83      8608
          2       0.42      0.20      0.27      2789

avg / total       0.66      0.71      0.66     12338



0.7082995623277679

In [12]:
# tune the hyperparameters

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [1, 223, 445, 667, 889, 1111, 1333, 1555, 1777, 2000]}


In [13]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 10 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 10, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train, y_train);

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.5min finished


In [14]:
# find the best parameters

rf_random.best_params_

{'n_estimators': 1111,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [15]:
# apply best parameters to training set and rerun

classifier = RandomForestClassifier(n_estimators = 1111, min_samples_split = 2, min_samples_leaf = 4, max_features= 'auto',
                                    max_depth = 100, bootstrap = True,
                                    criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluating the Algorithm
confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))
accuracy_score(y_test, y_pred)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1111, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

array([[ 140,  610,  191],
       [  57, 8258,  293],
       [  80, 2328,  381]], dtype=int64)

             precision    recall  f1-score   support

          0       0.51      0.15      0.23       941
          1       0.74      0.96      0.83      8608
          2       0.44      0.14      0.21      2789

avg / total       0.65      0.71      0.65     12338



0.7115415788620522

By using best fit parameter we improved the model by 1%

In [16]:
#apply model to Real test Set

test_file = data_path + r"\test.json"
X_test2 = pd.read_json(test_file)
print("Test dataset loaded")

type(X_test2)

Test dataset loaded


pandas.core.frame.DataFrame

In [17]:
# preprocess Real test data
X_test2['created'] = pd.to_datetime(X_test2['created'])
X_test2['year'] = X_test2['created'].dt.year
X_test2['month'] = X_test2['created'].dt.month
X_test2['day'] = X_test2['created'].dt.day

num_photos = []
for photo in X_test2['photos']:
    num_photos.append(len(photo))
num_photos_df = pd.DataFrame(num_photos, index=X_test2.index, columns=['num_photos'])
X_test2 = X_test2.assign(num_photos=num_photos_df)

len_description = []
for description in X_test2['description']:
    len_description.append(len(description))
len_description_df = pd.DataFrame(len_description, index=X_test2.index, columns=['len_description'])
X_test2 = X_test2.assign(len_description=len_description_df)

num_features = []
for feature in X_test2['features']:
    num_features.append(len(feature))
num_features_df = pd.DataFrame(num_features, index=X_test2.index, columns=['num_features'])
X_test2 = X_test2.assign(num_features=num_features_df)

X_test2 = X_test2.drop('building_id', axis=1)
X_test2 = X_test2.drop('created', axis=1)
X_test2 = X_test2.drop('description', axis=1)
X_test2 = X_test2.drop('display_address', axis=1)
X_test2 = X_test2.drop('features', axis=1)
X_test2 = X_test2.drop('manager_id', axis=1)
X_test2 = X_test2.drop('photos', axis=1)
X_test2 = X_test2.drop('street_address', axis=1)
X_test2 = X_test2.drop('listing_id', axis=1)

In [18]:
# convert data into array

X_test2 = np.asarray(X_test2, dtype='int')
X_test2 = scaler.transform(X_test2)

In [20]:
# here are predictions 

y_pred = classifier.predict(X_test2)