In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

In [81]:
data = pd.read_csv('New_York_City_Airbnb_Open_Data.csv')

In [82]:
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [83]:
data.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [84]:
data.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [85]:
data.reviews_per_month.fillna(data.reviews_per_month.mean(), inplace=True)

In [86]:
data.neighbourhood_group.value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [87]:
y = (data.price.values > 152).astype(int)
X= data.drop(['price','id','host_name', 'name','host_id','neighbourhood', 'last_review'], axis=1)

In [88]:
y[:10]

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0])

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.4)
X_valid, X_test, y_valid, y_test = train_test_split(X,y,random_state=42,test_size=0.5)

In [90]:
X_train.corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.087732,0.027252,-0.01008,-0.004373,0.019442,-0.005975
longitude,0.087732,1.0,-0.067251,0.058775,0.122718,-0.116669,0.080776
minimum_nights,0.027252,-0.067251,1.0,-0.085092,-0.093061,0.12406,0.141089
number_of_reviews,-0.01008,0.058775,-0.085092,1.0,0.520963,-0.072687,0.176481
reviews_per_month,-0.004373,0.122718,-0.093061,0.520963,1.0,-0.00457,0.165443
calculated_host_listings_count,0.019442,-0.116669,0.12406,-0.072687,-0.00457,1.0,0.222986
availability_365,-0.005975,0.080776,0.141089,0.176481,0.165443,0.222986,1.0


In [91]:

col=[  'neighbourhood_group', 'room_type']
for i in col:
    
    m_score = round( mutual_info_score(X_train[i],y_train), 2)
    print(f'{i}:{m_score}')


neighbourhood_group:0.05
room_type:0.14


In [100]:
X_train.columns

Index(['neighbourhood_group', 'latitude', 'longitude', 'room_type',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [93]:
#[['neighbourhood_group','room_type']]
X_train_dict = X_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train_nw = dv.fit_transform(X_train_dict)

X_valid_dict = X_valid.to_dict(orient='records')
X_valid_nw = dv.transform(X_valid_dict)

In [94]:
type(X_valid)

pandas.core.frame.DataFrame

In [95]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [96]:
lr= LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=1000)
lr.fit(X_train_nw,y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [106]:
y_valid_predicted=lr.predict(X_valid_nw)
round(accuracy_score(y_valid_predicted,y_valid),2)

0.79

In [110]:
data_features =['neighbourhood_group', 'latitude', 'longitude', 'room_type',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']
for i in data_features:
    X_train_sample= X_train.drop(i, axis=1)
    X_valid_sample= X_valid.drop(i, axis=1)
    
    X_train_dict_sample = X_train_sample.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train_nw2 = dv.fit_transform(X_train_dict_sample)

    X_valid_dict_sample = X_valid.to_dict(orient='records')
    X_valid_nw2 = dv.transform(X_valid_dict_sample)
    
    model= LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
    model.fit(X_train_nw2,y_train)
    
    acc=round(accuracy_score(model.predict(X_valid_nw2),y_valid),4)
    
    print(f'model accuracy: {acc} [{round(acc-0.7948,4)}] after {i} feature was removed, ')

model accuracy: 0.7466 [-0.0482] after neighbourhood_group feature was removed, 
model accuracy: 0.7874 [-0.0074] after latitude feature was removed, 
model accuracy: 0.7874 [-0.0074] after longitude feature was removed, 
model accuracy: 0.7326 [-0.0622] after room_type feature was removed, 
model accuracy: 0.7913 [-0.0035] after minimum_nights feature was removed, 
model accuracy: 0.7953 [0.0005] after number_of_reviews feature was removed, 
model accuracy: 0.7956 [0.0008] after reviews_per_month feature was removed, 
model accuracy: 0.794 [-0.0008] after calculated_host_listings_count feature was removed, 
model accuracy: 0.7821 [-0.0127] after availability_365 feature was removed, 


In [116]:
ylog=np.log1p(data.price)
Xl_train,Xl_test,ylog_train,ylog_test= train_test_split(X,ylog, test_size=0.4, random_state=42)
Xl_valid,Xl_test,ylog_valid,ylog_test= train_test_split(Xl_test,ylog_test, test_size=0.5, random_state=42)

Xl_train_dict = Xl_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
Xl_train = dv.fit_transform(Xl_train_dict)

Xl_valid_dict = Xl_valid.to_dict(orient='records')
Xl_valid = dv.transform(Xl_valid_dict)

In [121]:
alpha =  [0, 0.01, 0.1, 1, 10]
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
for i in alpha:
    model_r = Ridge(alpha=i)
    model_r.fit(Xl_train,ylog_train)
    y_pred = model_r.predict(Xl_valid)
    rmse = round(mean_squared_error(y_pred, ylog_valid),3)
    print(f'RMSE for alpha {i}:    {rmse}')

RMSE for alpha 0:    0.236
RMSE for alpha 0.01:    0.236
RMSE for alpha 0.1:    0.236
RMSE for alpha 1:    0.236
RMSE for alpha 10:    0.237


  return linalg.solve(A, Xy, sym_pos=True,
