# Training SVM Model

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
trainCleaned = "dataset/two-sigma-connect-rental-listing-inquiries/trainTextExtract.json.zip"
trainData = pd.read_json(trainCleaned, convert_dates=['created'])
trainData.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,desc_word_count,desc_char_count,desc_avg_word_length,desc_special_char_count,desc_num_count,desc_upper_count,num_features,year,month,day
0,2.0,3,b66d3340ba269e6f184bfad550a6f05f,2016-06-24 07:55:11,welcome new home extravagantly outfitted 3 bed...,president street,"[Dining Room, Laundry in Building, High Speed ...",low,40.6678,7211226,...,183,1276,6.044199,66,3,2,8,2016,6,24
1,2.0,3,b66d3340ba269e6f184bfad550a6f05f,2016-04-29 03:29:11,cdata3 bedrooms 3100 bedford stuyvesantbushwic...,president street,[],low,40.6678,6941286,...,44,372,7.477273,86,1,3,0,2016,4,29
2,1.0,1,7e9fd0dea8ad2c8ac0a1022a1f0d997b,2016-06-17 01:22:28,newly renovated queen sized 1 bed 1 bath unit ...,east 12th street,"[Pre-War, Dogs Allowed, Cats Allowed]",low,40.7287,7174518,...,32,224,6.225806,30,2,0,3,2016,6,17
3,1.0,3,7e9fd0dea8ad2c8ac0a1022a1f0d997b,2016-06-08 01:12:34,prime east village 3 bedroom 1 bath apartment ...,east 12th street,"[Pre-War, Dogs Allowed, Cats Allowed]",low,40.7287,7122279,...,62,390,5.393443,15,2,1,3,2016,6,8
4,1.0,1,7e9fd0dea8ad2c8ac0a1022a1f0d997b,2016-06-11 01:27:45,welcome next new homethis 1br1ba unit located ...,east 12th street,"[Pre-War, Dogs Allowed, Cats Allowed]",low,40.7287,7138528,...,134,756,4.684211,22,2,2,3,2016,6,11


In [3]:
testDataDir = "dataset/two-sigma-connect-rental-listing-inquiries/testTextExtract.json.zip"
testData = pd.read_json(testDataDir, convert_dates=['created'])
testData.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,desc_word_count,desc_char_count,desc_avg_word_length,desc_special_char_count,desc_num_count,desc_upper_count,num_features,year,month,day
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,large awesome terraceaccessible via bedroom li...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,...,78,587,6.710526,50,0,4,6,2016,6,11
1,1.0,2,0,2016-06-24 06:36:34,prime soho bleecker houston newly renovated st...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,...,35,245,6.205882,16,0,1,3,2016,6,24
2,1.0,0,0,2016-06-17 01:23:39,spacious studio prime location cleanbuilding h...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.726,7174566,-74.0026,...,39,268,6.052632,7,0,0,3,2016,6,17
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,immediate access call bryanbr bond new york re...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,...,22,146,6.25,12,0,0,3,2016,6,21
4,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,beautiful true 1 bedroom luxury building finan...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,...,85,564,5.783133,41,1,5,10,2016,6,16


## Loading train data

In [4]:
X = trainData[['bathrooms', 'bedrooms', 'price', 'created', 'desc_word_count', 'desc_avg_word_length', 'features', 'latitude', 'longitude']]

X['num_features'] = X['features'].apply(lambda x: len(x))
X = X.drop('features', axis=1)

X['year'] = X['created'].dt.year
X['month'] = X['created'].dt.month
X['day'] = X['created'].dt.day
X = X.drop('created', axis=1)

X.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,bathrooms,bedrooms,price,desc_word_count,desc_avg_word_length,latitude,longitude,num_features,year,month,day
0,2.0,3,3050,183,6.044199,40.6678,-73.9398,8,2016,6,24
1,2.0,3,3100,44,7.477273,40.6678,-73.9398,0,2016,4,29
2,1.0,1,2850,32,6.225806,40.7287,-73.981,3,2016,6,17
3,1.0,3,4200,62,5.393443,40.7287,-73.981,3,2016,6,8
4,1.0,1,2850,134,4.684211,40.7287,-73.981,3,2016,6,11


## Loading test data

In [5]:
X_test = testData[['bathrooms', 'bedrooms', 'price', 'desc_word_count', 'desc_avg_word_length', 'latitude', 'longitude', 'num_features', 'year', 'month', 'day']]
X_test.head(5)

Unnamed: 0,bathrooms,bedrooms,price,desc_word_count,desc_avg_word_length,latitude,longitude,num_features,year,month,day
0,1.0,1,2950,78,6.710526,40.7185,-73.9865,6,2016,6,11
1,1.0,2,2850,35,6.205882,40.7278,-74.0,3,2016,6,24
2,1.0,0,2295,39,6.052632,40.726,-74.0026,3,2016,6,17
3,1.0,2,2900,22,6.25,40.7321,-74.0028,3,2016,6,21
4,1.0,1,3254,85,5.783133,40.7054,-74.0095,10,2016,6,16


## Encode label for target variable

In [14]:
le=LabelEncoder()
target = trainData[['interest_level']]
y = le.fit_transform(target['interest_level'])
y

array([1, 1, 1, ..., 2, 2, 0])

## Training simple SVC model

In [7]:
model = SVC(kernel='linear', max_iter=1000, probability=True)
model.fit(X, y)

prob = model.predict_proba(X_test)
df = pd.DataFrame(prob)
df.columns = le.classes_
df["listing_id"] = testData.listing_id.values
df.to_csv("SVM.csv", index=False)
df

#total log loss = 0.99 ~ 1.00



Unnamed: 0,high,low,medium,listing_id
0,0.039778,0.692201,0.268021,7142618
1,0.040445,0.691461,0.268094,7210040
2,0.039909,0.691143,0.268947,7174566
3,0.040244,0.691751,0.268004,7191391
4,0.042586,0.690302,0.267112,7171695
...,...,...,...,...
74654,0.042717,0.687626,0.269657,6928108
74655,0.128858,0.727508,0.143634,6906674
74656,0.040294,0.690830,0.268876,6897967
74657,0.103656,0.777193,0.119151,6842183


In [8]:
model.score(X, y)

0.36100545346226115

## Training modified SVC model

In [9]:
model = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(feature_range=(0,1)),
    SVC(kernel='linear', C=2.0, max_iter=1000, decision_function_shape='ovo', probability=True)
)

model.fit(X, y)

#total log loss = 0.78 ~ 0.79



Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('svc',
                 SVC(C=2.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovo', degree=3,
                     gamma='auto_deprecated', kernel='linear', max_iter=1000,
                     probability=True, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [10]:
prob = model.predict_proba(X_test)
df = pd.DataFrame(prob)
df.columns = le.classes_
df["listing_id"] = testData.listing_id.values
df.to_csv("SVM.csv", index=False)
df

Unnamed: 0,high,low,medium,listing_id
0,0.033390,0.705162,0.261448,7142618
1,0.035862,0.702102,0.262036,7210040
2,0.032552,0.704904,0.262544,7174566
3,0.035391,0.702786,0.261823,7191391
4,0.033936,0.704038,0.262025,7171695
...,...,...,...,...
74654,0.035825,0.701861,0.262314,6928108
74655,0.031741,0.705865,0.262394,6906674
74656,0.032392,0.704940,0.262668,6897967
74657,0.028311,0.710548,0.261142,6842183


In [11]:
model.score(X, y)

0.43089225974978235

## Cross validation score

In [17]:
scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
scores



array([0.26554423, 0.33829772, 0.31754282, 0.34028862, 0.30677708])

In [19]:
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
scores



array([0.33619702, 0.3610952 , 0.37809349, 0.44282768, 0.3782514 ])