In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

#### **Utility Functions**

In [2]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

#### **Data Import**

In [3]:
usecols = [
            'neighbourhood_group',
            'room_type',
            'latitude',
            'longitude',
            'price',
            'minimum_nights',
            'number_of_reviews',
            'reviews_per_month',
            'calculated_host_listings_count',
            'availability_365'
]

In [4]:
data = 'AB_NYC_2019.csv'

df = pd.read_csv(data, usecols=usecols)

df = df.fillna(value=0)

#### **Question 1**

What is the most frequent observation (mode) for the column 'neighbourhood_group'?

* Bronx
* Brooklyn
* Manhattan
* Queens

In [5]:
df.neighbourhood_group.describe()

count         48895
unique            5
top       Manhattan
freq          21661
Name: neighbourhood_group, dtype: object

In [6]:
df.neighbourhood_group.value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

#### **Split the data**

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value ('price') is not in your dataframe.

In [7]:
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=42)

In [8]:
len(df_train)/len(df), len(df_val)/len(df), len(df_test)/len(df)

(0.6, 0.2, 0.2)

In [9]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [10]:
y_train = df_train['price'].values
y_val = df_val['price'].values
y_test = df_test['price'].values

In [11]:
del df_train['price']
del df_val['price']
del df_test['price']

#### **Question 2**

* Create the correlation matrix for the numerical features of your train dataset.
* In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

**What are the two features that have the biggest correlation in this dataset?**

In [12]:
categorical = list(df_train.dtypes[df_train.dtypes == 'object'].index)
numerical = list(df_train.dtypes[df_train.dtypes != 'object'].index)

In [13]:
categorical

['neighbourhood_group', 'room_type']

In [14]:
numerical

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [15]:
corr_df_train = df_train[numerical].corr()
corr_df_train

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [16]:
pairs = [
    ['calculated_host_listings_count', 'availability_365'],
    ['number_of_reviews', 'availability_365'],
    ['number_of_reviews', 'reviews_per_month'],
    ['minimum_nights', 'calculated_host_listings_count']
]

In [17]:
for x, y in pairs:
    print('corr', df_train[x].corr(df_train[y]), x, y)

corr 0.22591308547640682 calculated_host_listings_count availability_365
corr 0.1744771171658888 number_of_reviews availability_365
corr 0.5903739015971664 number_of_reviews reviews_per_month
corr 0.11864675413775481 minimum_nights calculated_host_listings_count


In [18]:
corr_df_train.abs().unstack().sort_values(ascending=False).drop_duplicates()

latitude                        latitude                          1.000000
number_of_reviews               reviews_per_month                 0.590374
availability_365                calculated_host_listings_count    0.225913
                                number_of_reviews                 0.174477
                                reviews_per_month                 0.165376
                                minimum_nights                    0.138901
longitude                       reviews_per_month                 0.134642
minimum_nights                  reviews_per_month                 0.120703
                                calculated_host_listings_count    0.118647
calculated_host_listings_count  longitude                         0.117041
availability_365                longitude                         0.083666
longitude                       latitude                          0.080301
number_of_reviews               minimum_nights                    0.076020
calculated_host_listings_

#### **Make price binary**

* We need to turn the price variable from numeric into binary.
* Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

In [19]:
df_train_val_binary, df_test_binary = train_test_split(df, test_size=0.2, random_state=42)
df_train_binary, df_val_binary = train_test_split(df_train_val_binary, test_size=0.25, random_state=42)

In [20]:
df_train_binary = df_train_binary.reset_index(drop=True)
df_val_binary = df_val_binary.reset_index(drop=True)
df_test_binary = df_test_binary.reset_index(drop=True)

In [21]:
df_train_binary['above_average'] = (df_train_binary['price'] >= 152).astype(int)
df_val_binary['above_average'] = (df_val_binary['price'] >= 152).astype(int)
df_test_binary['above_average'] = (df_test_binary['price'] >= 152).astype(int)

In [22]:
y_train_binary = df_train_binary['above_average'].values
y_val_binary = df_val_binary['above_average'].values
y_test_binary = df_test_binary['above_average'].values

#### **Question 3**

* Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
* Which of these two variables has bigger score?
* Round it to 2 decimal digits using round(score, 2)

In [23]:
df_train_binary[['neighbourhood_group', 'room_type', 'above_average']].head()

Unnamed: 0,neighbourhood_group,room_type,above_average
0,Brooklyn,Entire home/apt,0
1,Manhattan,Private room,0
2,Bronx,Entire home/apt,0
3,Brooklyn,Entire home/apt,0
4,Manhattan,Private room,0


In [24]:
round( mutual_info_score(df_train_binary['above_average'], df_train_binary['neighbourhood_group']), 2)

0.05

In [25]:
round( mutual_info_score(df_train_binary['above_average'], df_train_binary['room_type']), 2)

0.14

#### **Question 4**

Now let's train a logistic regression

Remember that we have two categorical variables in the data. Include them using one-hot encoding.

Fit the model on the training dataset. To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters: model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.

In [26]:
usecols = [
            'neighbourhood_group',
            'room_type',
            'latitude',
            'longitude',
            'price',
            'minimum_nights',
            'number_of_reviews',
            'reviews_per_month',
            'calculated_host_listings_count',
            'availability_365'
]

In [27]:
data = 'AB_NYC_2019.csv'

df = pd.read_csv(data, usecols=usecols)

df = df.fillna(value=0)

In [28]:
df['above_average'] = (df['price'] >= 152).astype(int)

In [29]:
df['above_average'].value_counts()

0    33992
1    14903
Name: above_average, dtype: int64

In [30]:
del df['price']

In [31]:
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=42)

In [32]:
len(df_train)/len(df), len(df_val)/len(df), len(df_test)/len(df)

(0.6, 0.2, 0.2)

In [33]:
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

In [34]:
del df['above_average']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [35]:
list(df_train.columns)

['neighbourhood_group',
 'latitude',
 'longitude',
 'room_type',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [36]:
categorical = list(df_train.dtypes[df_train.dtypes == 'object'].index)
categorical

['neighbourhood_group', 'room_type']

In [37]:
numerical = list(df_train.dtypes[df_train.dtypes != 'object'].index)
numerical

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [38]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=2000)

model.fit(X_train, y_train)

LogisticRegression(max_iter=2000, random_state=42)

In [39]:
model.coef_.shape, model.intercept_.shape

((1, 15), (1,))

In [40]:
model.coef_[0]

array([ 3.04061394e-03,  3.58497110e-03, -5.81251250e+00, -3.16273800e+00,
       -1.13452732e-02, -1.58959693e-01,  1.59153318e-01,  1.60900299e+00,
        8.89348001e-03, -1.67633362e+00, -3.23996041e-03, -4.20888010e-02,
        1.96379985e+00, -8.09282798e-01, -1.21276058e+00])

In [41]:
model.intercept_[0]

-0.061572348908082344

In [42]:
model.predict(X_val)

array([0, 1, 0, ..., 0, 0, 1])

In [43]:
model.predict_proba(X_val).shape

(9779, 2)

In [44]:
y_pred = model.predict_proba(X_val)[:,1]

In [45]:
decision_threshold = (y_pred >= 0.5).astype(int)

In [46]:
accuracy_full = (y_val == decision_threshold).mean()
print( round(accuracy_full, 2) )

0.79


#### **Question 5**  
* We have 9 features: 7 numerical features and 2 categorical.
* Let's find the least useful one using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
* Which of following feature has the smallest difference?
      * neighbourhood_group
      * room_type
      * number_of_reviews
      * reviews_per_month

note: the difference doesn't have to be positive

In [47]:
usecols = [
            'neighbourhood_group',
            'room_type',
            'latitude',
            'longitude',
            'price',
            'minimum_nights',
            'number_of_reviews',
            'reviews_per_month',
            'calculated_host_listings_count',
            'availability_365'
]

In [48]:
data = 'AB_NYC_2019.csv'

df = pd.read_csv(data, usecols=usecols)

df = df.fillna(value=0)

df['above_average'] = (df['price'] >= 152).astype(int)

del df['price']

df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

del df['above_average']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [49]:
cols = list(df.columns)
cols

['neighbourhood_group',
 'latitude',
 'longitude',
 'room_type',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [50]:
# accuracy_full_model = 0.7906738930360978
accuracy_full = 0.7906738930360978

In [51]:
for i in range(len(cols)):
    exclude = cols[i]
    features = cols.copy()
    del features[i]
    print('exclude: ', exclude)

    dv = DictVectorizer(sparse=False)

    train_dict = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=2000)

    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:,1]

    # decision_threshold = (y_pred >= 0.5)
    decision_threshold = (y_pred >= 0.5).astype(int)

    accuracy_elim = (y_val == decision_threshold).mean()
    print('accuracy_full:', accuracy_full)
    print('accuracy_elim:', accuracy_elim)
    print('diffence:', abs(accuracy_elim - accuracy_full))
    print()

exclude:  neighbourhood_group
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7509970344616014
diffence: 0.03967685857449643

exclude:  latitude
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7867880151344718
diffence: 0.003885877901625978

exclude:  longitude
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7868902750792515
diffence: 0.0037836179568463413

exclude:  room_type
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7289088863892014
diffence: 0.061765006646896436

exclude:  minimum_nights
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7904693731465385
diffence: 0.0002045198895592737

exclude:  number_of_reviews
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7913897126495552
diffence: 0.0007158196134573469

exclude:  reviews_per_month
accuracy_full: 0.7906738930360978
accuracy_elim: 0.79006033336742
diffence: 0.0006135596686778211

exclude:  calculated_host_listings_count
accuracy_full: 0.7906738930360978
accuracy_elim: 0.7894467736987422
diffence: 0.00122711

#### **Question 6**

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data.
* This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest alpha.

In [52]:
usecols = [
            'neighbourhood_group',
            'room_type',
            'latitude',
            'longitude',
            'price',
            'minimum_nights',
            'number_of_reviews',
            'reviews_per_month',
            'calculated_host_listings_count',
            'availability_365'
]

In [53]:
data = 'AB_NYC_2019.csv'

df = pd.read_csv(data, usecols=usecols)

df = df.fillna(value=0)

df.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,Manhattan,40.80902,-73.9419,Private room,150,3,0,0.0,1,365
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [54]:
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

In [55]:
for alpha in [0, 0.01, 0.1, 1, 10]:

    rr = Ridge(alpha=alpha)

    dv = DictVectorizer(sparse=False)

    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    rr.fit(X_train, y_train)

    y_pred = rr.predict(X_val)

    val_score = round(rmse(y_pred, y_val), 3)
    print(rmse(y_pred, y_val),'score:', val_score, 'alpha:', alpha)

0.4971049691895412 score: 0.497 alpha: 0
0.4971173046190633 score: 0.497 alpha: 0.01
0.49711832446944026 score: 0.497 alpha: 0.1
0.49713953633200514 score: 0.497 alpha: 1
0.49788660158765535 score: 0.498 alpha: 10
