# Project Demo

This demo is designed to provide some general tips and tricks for the ITDS Fall 2019 project. For full details on the project, please refer to [the project requirements](https://grantmlong.com/teaching/fall2019/project/Project-ITDS-Fall-2019.pdf).

***
This demo uses `scikit-learn`, but by no means are you required to use this or any other particular package.



In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [16]:
import helpers

In [2]:
train_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_train.csv', index_col=0)
test_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test1.csv', index_col=0)
submit1_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test2.csv', index_col=0)
submit2_df = pd.read_csv('https://grantmlong.com/data/SE_rents2018_test3.csv', index_col=0)

In [3]:
train_df.sample(5).transpose()

rental_id,7153300,7321309,7165357,7196293,7403512
addr_unit,#3B,#1,PENTHOUSE,#1C,#3R
building_id,745084,518503,20701,330241,380359
bedrooms,2,2,3,1,2
bathrooms,1,2,3,1,1
size_sqft,1078,900,3000,500,0
created_at,2018-05-07 12:43:54,2018-06-29 19:17:43,2018-05-10 13:47:30,2018-05-21 15:52:37,2018-07-25 14:25:48
addr_street,1255 EAST 19 STREET,518 MIDWOOD STREET,149 SPRING STREET,431 CLASSON AVENUE,16 JUDGE STREET
addr_city,Brooklyn,Brooklyn,New York,Brooklyn,Brooklyn
addr_zip,11230,11225,10012,11238,11211
addr_lat,40.6196,40.6603,40.7245,40.6861,40.7134


#### Build training data

In [4]:
# extract usable features
feature_cols = [
    'bedrooms', 'year_built', 'bathrooms', 'min_to_subway', 
    'size_sqft', 'no_fee', 'has_doorman'
]
train_features = train_df[feature_cols] 

# impute missing values with medians
train_features = train_features.fillna(train_features.median(), axis=0)

# construct target vector
train_target = train_df['rent']

#### Fit model

In [11]:
lreg = LinearRegression()
lreg.fit(train_features, train_target)

rf = RandomForestRegressor()
rf.fit(train_features, train_target)




RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#### Predict and Measure Using Test 1

In [12]:
test_features = test_df[feature_cols] 

# impute missing values with medians
test_features = test_features.fillna(train_features.median(), axis=0)

# construct predictions 
test_df['predicted'] = rf.predict(test_features)

mean_squared_error(test_df['rent'], test_df['predicted'])

2728956.446064181

#### Combine Data, Predict Values for Test 2

In [13]:
master_df = train_df.append(test_df, sort=False)

master_features = master_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
master_target = master_df['rent']

rf.fit(master_features, master_target)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#### Create Submission File for `test2`

In [15]:
submit1_features = submit1_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
submit1_df['predictions'] = rf.predict(submit1_features)
submit1_df['predictions'].to_csv('sample_submission1.csv', header=True)

submit1_df['fake_rent'] = np.ones(submit1_df['predictions'].shape) * master_target.median()
mean_squared_error(submit1_df['predictions'], submit1_df['fake_rent'])

6987991.73172372

#### Create Submission File for `test3`

In [14]:
submit2_features = submit2_df[feature_cols].fillna(master_df[feature_cols].median(), axis=0)
submit2_df['predictions'] = rf.predict(submit2_features)
submit2_df['predictions'].to_csv('sample_submission2.csv', header=True)

submit2_df['fake_rent'] = np.ones(submit2_df['predictions'].shape) * master_target.median()
mean_squared_error(submit2_df['predictions'], submit2_df['fake_rent'])

5986625.776536009

In [17]:
helpers.get_squared(4)

16