# ML Zoomcamp - Assignment : 3. Machine Learning for Classification

# Importing Python packages

In [1]:
import os
import sys
import wget               # to retrieve data from web server

#import libraries
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

## 3.1 Getting Data Set - New York City Airbnb Open Data

In [2]:
# Data file paths 
DATA_FILE_LOCAL_PATH = "./AB_NYC_2019.csv"
DATA_FILE_DOWNLOAD_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv"

In [3]:
# get current platfrom info
def get_platform():
    platforms = {
        'linux1' : 'Linux',
        'linux2' : 'Linux',
        'darwin' : 'OS X',
        'win32' : 'Windows'
    }
    if sys.platform not in platforms:
        return sys.platform
    
    return platforms[sys.platform]

In [4]:
def download_data_set(download_url, local_file):
    os_in_use  = get_platform()
    if os_in_use == 'Windows':
        if os.path.isfile(DATA_FILE_LOCAL_PATH):
            print("Data file: {} , exists locally. skipping download".format(DATA_FILE_LOCAL_PATH) )
        else:
            print("Data file: {} , doesn't exists locally.".format(DATA_FILE_LOCAL_PATH ))
            print("Downloading from location: {} " .format( DATA_FILE_DOWNLOAD_URL))
            wget.download(DATA_FILE_DOWNLOAD_URL, DATA_FILE_LOCAL_PATH)
            print("\nDownload Complete")
    else:
        print("non windows platfrom testing not done")
    return

In [5]:
download_data_set(DATA_FILE_DOWNLOAD_URL, DATA_FILE_LOCAL_PATH)

Data file: ./AB_NYC_2019.csv , exists locally. skipping download


## 3.2 Data Preparation

Download the data, read it with pandas, Look at the data ,Make column names and values look uniform, Check if all the columns read correctly, Check if the any variable needs any preparation

In [6]:
df = pd.read_csv('AB_NYC_2019.csv')
len(df)

48895

In [7]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


### Filter only selective columns

In [8]:
col_categorical = [
                'neighbourhood_group',
                'room_type',
              ]
col_numerical = [ 
                'latitude',
                'longitude',
                'minimum_nights',
                'number_of_reviews',
                'reviews_per_month',
                'calculated_host_listings_count',
                'availability_365'
              ]

col_predict =  [ 
                'price',
              ]

In [9]:
df_selective = df.copy()
df_selective = df_selective[col_categorical+col_numerical+col_predict]
df_selective.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
0,Brooklyn,Private room,40.64749,-73.97237,1,9,0.21,6,365,149
1,Manhattan,Entire home/apt,40.75362,-73.98377,1,45,0.38,2,355,225
2,Manhattan,Private room,40.80902,-73.9419,3,0,,1,365,150
3,Brooklyn,Entire home/apt,40.68514,-73.95976,1,270,4.64,1,194,89
4,Manhattan,Entire home/apt,40.79851,-73.94399,10,9,0.1,1,0,80


### Handle missing values

In [10]:
df_selective.isna().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
price                                 0
dtype: int64

In [11]:
df_selective['reviews_per_month'] = df_selective['reviews_per_month'].replace(np.nan, 0)

### Question 1 - What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [12]:
df_selective['neighbourhood_group'].mode()

0    Manhattan
dtype: object

In [13]:
df_selective['neighbourhood_group'].value_counts(ascending=False)

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

### <font color='Blue'> Question 1 -> Answer : most frequent observation (mode) for the column 'neighbourhood_group' is = Manhattan </font>

## 3.3 Setting up the data for train and validation 

Perform the train/validation/test split with Scikit-Learn

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df_full_train, df_test = train_test_split(df_selective, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [17]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [19]:
df_train.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Entire home/apt,40.7276,-73.94495,3,29,0.7,13,50
1,Manhattan,Private room,40.70847,-74.00498,1,0,0.0,1,7
2,Bronx,Entire home/apt,40.83149,-73.92766,40,0,0.0,1,0
3,Brooklyn,Entire home/apt,40.66448,-73.99407,2,3,0.08,1,0
4,Manhattan,Private room,40.74118,-74.00012,1,48,1.8,2,67


In [20]:
df_val.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.70239,-73.92931,1,35,1.8,1,52
1,Brooklyn,Entire home/apt,40.68498,-73.96618,14,4,0.11,2,343
2,Brooklyn,Entire home/apt,40.66911,-73.94824,3,153,2.64,1,260
3,Manhattan,Private room,40.79767,-73.96114,3,2,0.02,3,0
4,Manhattan,Entire home/apt,40.76075,-73.99893,30,0,0.0,18,365


In [21]:
df_test.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Entire home/apt,40.64354,-73.97777,3,62,0.71,1,189
1,Queens,Private room,40.70666,-73.90779,21,0,0.0,1,73
2,Manhattan,Private room,40.76116,-73.99016,2,17,0.43,1,0
3,Manhattan,Entire home/apt,40.70763,-74.0105,2,5,1.88,327,272
4,Manhattan,Entire home/apt,40.79658,-73.93287,2,30,0.8,1,30


In [22]:
df_train.shape , df_val.shape, df_test.shape, len(y_train), len(y_val), len(y_test)

((29337, 9), (9779, 9), (9779, 9), 29337, 9779, 9779)

## 3.4 EDA

Check missing values
Look at the target variable (price)
Look at numerical and categorical variables

In [23]:
df_full_train = df_full_train.reset_index(drop=True)

In [24]:
df_full_train.isna().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
price                             0
dtype: int64

In [25]:
df_full_train.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
price                             0
dtype: int64

In [26]:
def find_correlation_matrix(data_frame, col_exclude):
    df_temp = data_frame.copy()
    matrix = df_temp.drop(col_exclude, axis=1).corr()
    #print(matrix)
    return matrix

In [27]:
col_exclude = ['neighbourhood_group', 'room_type', 'price']
cor_matrix = find_correlation_matrix(df_full_train, col_exclude).corr()
print("Correlation matrix is : ")
cor_matrix                           

Correlation matrix is : 


Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,-0.034831,-0.126195,-0.317978,-0.323578,-0.163631,-0.375647
longitude,-0.034831,1.0,-0.363418,-0.06253,0.045793,-0.476585,-0.178392
minimum_nights,-0.126195,-0.363418,1.0,-0.480405,-0.557511,0.141933,0.010868
number_of_reviews,-0.317978,-0.06253,-0.480405,1.0,0.822455,-0.439972,0.005475
reviews_per_month,-0.323578,0.045793,-0.557511,0.822455,1.0,-0.428652,-0.018693
calculated_host_listings_count,-0.163631,-0.476585,0.141933,-0.439972,-0.428652,1.0,0.18614
availability_365,-0.375647,-0.178392,0.010868,0.005475,-0.018693,0.18614,1.0


### <font color='Blue'>Question 2 -> Answer : Features with biggest correlation : number_of_reviews and reviews_per_month </font>

### Make price binary

In [108]:
def get_brnary_price(y_price, thresold=152):
    y_price_bin = y_price.copy()
    y_price_bin = pd.DataFrame(y_price_bin, columns = ['price'])
    y_price_bin['price'] = y_price_bin.price.apply(lambda x: 1 if x >= thresold else 0)
    y_price_bin = y_price_bin.price.values
    return y_price_bin

In [109]:
y_train_bin = get_brnary_price(y_train, thresold=152)
y_train_bin

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [110]:
from sklearn.metrics import mutual_info_score

In [112]:
mi = mutual_info_score( df_train.neighbourhood_group, y_train_bin)
round(mi,2)

0.05

In [113]:
mi = mutual_info_score(df_train.room_type, y_train_bin)
round(mi,2)

0.14

### <font color='Blue'>Question 3 -> Answer : highest mutual information score for  room_type </font>

In [166]:
from sklearn.feature_extraction import DictVectorizer

In [167]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[col_categorical + col_numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[col_categorical + col_numerical].to_dict(orient='records')
X_val = dv.fit_transform(val_dict)

In [168]:
X_train[0]

array([ 50.     ,  13.     ,  40.7276 , -73.94495,   3.     ,   0.     ,
         1.     ,   0.     ,   0.     ,   0.     ,  29.     ,   0.7    ,
         1.     ,   0.     ,   0.     ])

In [169]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [170]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [171]:
len(y_train) , len(y_val), len(y_test)

(29337, 9779, 9779)

In [172]:
y_train_bin = get_brnary_price(y_train, thresold=152)
y_val_bin = get_brnary_price(y_val, thresold=152)
y_test_bin = get_brnary_price(y_test, thresold=152)

In [173]:
len(y_train_bin) , len(y_val_bin), len(y_test_bin)

(29337, 9779, 9779)

In [174]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
model.fit(X_train, y_train_bin)

LogisticRegression(random_state=42, solver='liblinear')

In [175]:
y_val_bin_pred = model.predict(X_val)
y_val_bin_pred

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [177]:
score = model.score(X_val, y_val_bin)
score = round(score,2)
print(score)

0.7907761529808774


### <font color='Blue'>Question 4 -> Answer : accuracy on the validation dataset : 0.79 </font>

In [156]:
def apply_one_hot_encoding(df_data,  columns):
    df_data_cp = df_data.copy()
    dv = DictVectorizer(sparse=False)

    df_data_dict = df_data_cp[columns].to_dict(orient='records')
    X_data = dv.fit_transform(df_data_dict)
    
    return X_data

In [163]:
def model_build_log_reg(X_train_data,y_train_bin_data, X_val_data, y_val_bin_data):
    
    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train_data, y_train_bin_data)  
    
    y_val_bin_pred = model.predict(X_val_data)
    sub_score = model.score(X_val_data, y_val_bin_data)
    #sub_score = round(sub_score,2)
    return sub_score

In [164]:
#X_train = apply_one_hot_encoding(df_train,col_categorical + col_numerical)

In [225]:
acc_score = []
all_columns =  col_categorical + col_numerical

# Get accuracy score for all columns
X_train = apply_one_hot_encoding(df_train, all_columns) 
X_val = apply_one_hot_encoding(df_val, all_columns)
score = model_build_log_reg(X_train,y_train_bin, X_val, y_val_bin)


for column in all_columns:
    sub_columns = all_columns.copy()
    sub_columns.remove(column)
    #print(sub_columns)
    X_train_data = apply_one_hot_encoding(df_train, sub_columns) 
    X_val_data = apply_one_hot_encoding(df_val, sub_columns)
    sub_score = model_create_fit_predict_score(X_train_data,y_train_bin, X_val_data, y_val_bin)
    acc_score.append(sub_score)
#print(all_columns)

In [183]:
print(acc_score)

[0.7498721750690255, 0.7289088863892014, 0.7867880151344718, 0.7869925350240311, 0.7914919725943348, 0.7911851927599959, 0.7907761529808774, 0.7895490336435218, 0.781470498005931]


In [209]:
df_scores = pd.DataFrame(acc_score, all_columns )
df_scores.columns = ['scores']
df_scores['Diff'] = score - df_scores['scores']
df_scores['Abs_Diff'] = abs(df_scores['Diff'] )
df_scores

Unnamed: 0,scores,Diff,Abs_Diff
neighbourhood_group,0.749872,0.040904,0.040904
room_type,0.728909,0.061867,0.061867
latitude,0.786788,0.003988,0.003988
longitude,0.786993,0.003784,0.003784
minimum_nights,0.791492,-0.000716,0.000716
number_of_reviews,0.791185,-0.000409,0.000409
reviews_per_month,0.790776,0.0,0.0
calculated_host_listings_count,0.789549,0.001227,0.001227
availability_365,0.78147,0.009306,0.009306


In [214]:
df_scores['Abs_Diff'].min()

0.0

In [215]:
df_scores[['Abs_Diff']].idxmin()

Abs_Diff    reviews_per_month
dtype: object

### <font color='Blue'>Question 5 -> Answer : feature has the smallest difference : reviews_per_month </font>

In [285]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error 

In [289]:
def model_build_lin_reg(X_train_data,y_train_data, X_val_data, y_val_data, alpha=1.0):
    
    model = Ridge(alpha)
    model.fit(X_train_data, y_train_data)  
    
    y_val_pred = model.predict(X_val_data)
    sub_score = model.score(X_val_data, y_val_data)
    rmse = mean_squared_error(y_val_data, y_val_pred, squared=False)

    return sub_score, rmse

In [290]:
X_train = apply_one_hot_encoding(df_train, all_columns) 
X_val = apply_one_hot_encoding(df_val, all_columns)

y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [291]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    score,rsme = model_build_lin_reg(X_train,y_train_log, X_val, y_val_log,alpha)
    print("alpha : {}   , rsme : {},  round rsme : {}" .format( alpha ,    rsme, round(rsme,3)))


alpha : 0   , rsme : 0.4972204402832728,  round rsme : 0.497
alpha : 0.01   , rsme : 0.49711730461906295,  round rsme : 0.497
alpha : 0.1   , rsme : 0.49711832446943977,  round rsme : 0.497
alpha : 1   , rsme : 0.4971395363320048,  round rsme : 0.497
alpha : 10   , rsme : 0.49788660158765563,  round rsme : 0.498


### <font color='Blue'> Question 6 -> Answer : Which of these alphas leads to the best RMSE on the validation set : 0.01 </font>

In [270]:
y_train_log , y_val_log

(array([4.60517019, 4.06044301, 4.26267988, ..., 5.70378247, 4.18965474,
        4.53259949]),
 array([4.18965474, 4.49980967, 5.30330491, ..., 3.68887945, 4.26267988,
        4.35670883]))