# Predict whether income exceeds $50K/yr based on census data. 
Also known as "Census Income" dataset.


Import some basic libraries.
* Pandas - provided data frames
* matplotlib.pyplot - plotting support

Use Magic %matplotlib to display graphics inline instead of in a popup window.


In [259]:
import pandas as pd     
import numpy as np            # pandas is a dataframe library
import matplotlib.pyplot as plt  
from sklearn.linear_model import LinearRegression    

%matplotlib inline

## Loading and Reviewing the Data

In [260]:
df = pd.read_csv("/Users/fedorcvetkov/Desktop/Проекты/ML flat price/berlin-airbnb-prices/berlin_airbnb_train.csv")
df1 = pd.read_csv('/Users/fedorcvetkov/Desktop/Проекты/ML flat price/berlin-airbnb-prices/berlin_airbnb_test.csv')
sample_submission = pd.read_csv('/Users/fedorcvetkov/Desktop/Проекты/ML flat price/submit1.csv')

In [261]:
df.shape

(16468, 22)

In [262]:
df.head(5)

Unnamed: 0,accommodates,bathrooms,bedrooms,price,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,...,bed_type,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy,Laptop_friendly_workspace,TV,Family_kid_friendly,Host_greets_you,Smoking_allowed
0,4,1.0,2.0,52.0,20.0,300.0,10.0,2,8.492239,20.0,...,Real Bed,2,f,f,strict_14_with_grace_period,False,False,True,False,False
1,2,1.0,1.0,125.0,45.0,200.0,0.0,1,3.951948,200.0,...,Real Bed,5,t,f,strict_14_with_grace_period,False,True,False,False,False
2,2,1.0,1.0,30.0,20.0,0.0,0.0,1,3.748695,45.749963,...,Real Bed,5,t,f,moderate,True,False,False,True,False
3,2,1.0,1.0,25.0,0.0,0.0,10.0,1,5.062518,18.0,...,Real Bed,14,f,f,moderate,True,False,True,False,False
4,2,1.0,1.0,54.0,0.0,0.0,9.0,1,3.184217,55.055173,...,Real Bed,1,f,f,flexible,False,True,False,True,False


In [263]:
df.tail(5)

Unnamed: 0,accommodates,bathrooms,bedrooms,price,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,...,bed_type,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy,Laptop_friendly_workspace,TV,Family_kid_friendly,Host_greets_you,Smoking_allowed
16463,2,1.0,1.0,29.0,10.0,0.0,12.0,1,3.016456,50.069809,...,Real Bed,1,f,f,moderate,True,False,True,True,False
16464,2,1.0,1.0,80.0,60.0,200.0,20.0,2,5.81109,46.0,...,Real Bed,3,t,f,moderate,True,True,False,True,False
16465,4,1.5,1.0,40.0,30.0,100.0,5.0,1,6.467802,15.0,...,Real Bed,14,f,f,moderate,False,False,True,True,False
16466,2,1.0,1.0,35.0,20.0,150.0,15.0,2,5.625139,53.715891,...,Real Bed,2,f,f,moderate,True,False,False,False,False
16467,3,1.5,1.0,69.0,25.0,0.0,0.0,1,3.61382,97.0,...,Real Bed,4,f,f,strict_14_with_grace_period,False,False,True,False,False


## Check for null values

# Deleting non-specific and dummy data:
host_has_profile_pic - because all of them are true

is_business_travel_ready - because most of them are fasle.

In [264]:
del df['host_has_profile_pic']
del df['is_business_travel_ready']

In [265]:
del df1['host_has_profile_pic']
del df1['is_business_travel_ready']

# Data Types

Inspect data types to see if there are any issues.  Data should be numeric.

## Converting string to num:
For each column we make a map to convert from string to number.

In [266]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16468 entries, 0 to 16467
Data columns (total 20 columns):
accommodates                 16468 non-null int64
bathrooms                    16468 non-null float64
bedrooms                     16468 non-null float64
price                        16468 non-null float64
cleaning_fee                 16468 non-null float64
security_deposit             16468 non-null float64
extra_people                 16468 non-null float64
guests_included              16468 non-null int64
distance                     16468 non-null float64
size                         16468 non-null float64
room_type                    16468 non-null object
bed_type                     16468 non-null object
minimum_nights               16468 non-null int64
instant_bookable             16468 non-null object
cancellation_policy          16468 non-null object
Laptop_friendly_workspace    16468 non-null bool
TV                           16468 non-null bool
Family_kid_friendly    

In [267]:
df['room_type'].unique()


array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [268]:
df['bed_type'].unique()


array(['Real Bed', 'Pull-out Sofa', 'Futon', 'Airbed', 'Couch'],
      dtype=object)

In [269]:
df['instant_bookable'].unique()


array(['f', 't'], dtype=object)

In [270]:
df['cancellation_policy'].unique()


array(['strict_14_with_grace_period', 'moderate', 'flexible',
       'super_strict_30', 'super_strict_60'], dtype=object)

In [271]:
df['Laptop_friendly_workspace'].unique()


array([False,  True])

In [272]:
df['TV'].unique()

array([False,  True])

In [273]:
df['Family_kid_friendly'].unique()

array([ True, False])

In [274]:
df['Host_greets_you'].unique()

array([False,  True])

In [275]:
df['Smoking_allowed'].unique()

array([False,  True])

In [276]:
room_type_map = {'Shared room' : 0.0, 'Private room' : 1.0, 'Entire home/apt' : 2.0, 'Airbed' : 3.0, 'Couch' : 4.0}
df['room_type'] = df['room_type'].map(room_type_map)

bed_type_map = {'Pull-out Sofa' : 0.0, 'Futon' : 1.0, 'Real Bed' : 2.0, 'Airbed' : 3.0, 'Couch' : 4.0}
df['bed_type'] = df['bed_type'].map(bed_type_map)

instant_bookable_map = {'t' : 1.0, 'f' : 0.0}
df['instant_bookable'] = df['instant_bookable'].map(instant_bookable_map)

cancellation_policy_map = {'flexible' : 0.0, 'moderate' : 1.0, 'strict_14_with_grace_period' : 2.0, 'super_strict_30' : 3.0, 'super_strict_60' : 4.0}
df['cancellation_policy'] = df['cancellation_policy'].map(cancellation_policy_map)

#Laptop_friendly_workspace_map = {'False' : 0, 'True' : 1}
#df['Laptop_friendly_workspace'] = df['Laptop_friendly_workspace'].map(Laptop_friendly_workspace_map)
df['Laptop_friendly_workspace']= df['Laptop_friendly_workspace'].astype(np.float64)

df['TV']= df['TV'].astype(np.float64)

df['Family_kid_friendly']= df['Family_kid_friendly'].astype(np.float64)

df['Host_greets_you']= df['Host_greets_you'].astype(np.float64)

df['Smoking_allowed']= df['Smoking_allowed'].astype(np.float64)
#TV_map = {'FALSE' : 0, 'TRUE' : 1}
#df['TV'] = df['TV'].map(TV_map)

#Family_kid_friendly_map = {'False' : 0, ' True' : 1}
#df['Family_kid_friendly'] = df['Family_kid_friendly'].map(Family_kid_friendly_map)

#Host_greets_you_map = {'FALSE' : 0, 'TRUE' : 1}
#df['Host_greets_you'] = df['Host_greets_you'].map(Host_greets_you_map)

#Smoking_allowed_map = {'False' : 0, 'True' : 1}
#df['Smoking_allowed'] = df['Smoking_allowed'].map(Smoking_allowed_map)



In [277]:
room_type_map = {'Shared room' : 0.0, 'Private room' : 1.0, 'Entire home/apt' : 2.0, 'Airbed' : 3.0, 'Couch' : 4.0}
df1['room_type'] = df1['room_type'].map(room_type_map)

bed_type_map = {'Pull-out Sofa' : 0.0, 'Futon' : 1.0, 'Real Bed' : 2.0, 'Airbed' : 3.0, 'Couch' : 4.0}
df1['bed_type'] = df1['bed_type'].map(bed_type_map)

instant_bookable_map = {'t' : 1.0, 'f' : 0.0}
df1['instant_bookable'] = df1['instant_bookable'].map(instant_bookable_map)

cancellation_policy_map = {'flexible' : 0.0, 'moderate' : 1.0, 'strict_14_with_grace_period' : 2.0, 'super_strict_30' : 3.0, 'super_strict_60' : 4.0}
df1['cancellation_policy'] = df1['cancellation_policy'].map(cancellation_policy_map)

#Laptop_friendly_workspace_map = {'False' : 0, 'True' : 1}
#df['Laptop_friendly_workspace'] = df['Laptop_friendly_workspace'].map(Laptop_friendly_workspace_map)
df1['Laptop_friendly_workspace']= df1['Laptop_friendly_workspace'].astype(np.float64)

df1['TV']= df1['TV'].astype(np.float64)

df1['Family_kid_friendly']= df1['Family_kid_friendly'].astype(np.float64)

df1['Host_greets_you']= df1['Host_greets_you'].astype(np.float64)

df1['Smoking_allowed']= df1['Smoking_allowed'].astype(np.float64)

In [278]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16468 entries, 0 to 16467
Data columns (total 20 columns):
accommodates                 16468 non-null int64
bathrooms                    16468 non-null float64
bedrooms                     16468 non-null float64
price                        16468 non-null float64
cleaning_fee                 16468 non-null float64
security_deposit             16468 non-null float64
extra_people                 16468 non-null float64
guests_included              16468 non-null int64
distance                     16468 non-null float64
size                         16468 non-null float64
room_type                    16468 non-null float64
bed_type                     16468 non-null float64
minimum_nights               16468 non-null int64
instant_bookable             16468 non-null float64
cancellation_policy          16468 non-null float64
Laptop_friendly_workspace    16468 non-null float64
TV                           16468 non-null float64
Family_kid_fr

Just some text to have space between two parts



In [279]:
df.corr()

Unnamed: 0,accommodates,bathrooms,bedrooms,price,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,Laptop_friendly_workspace,TV,Family_kid_friendly,Host_greets_you,Smoking_allowed
accommodates,1.0,0.244119,0.624151,0.572825,0.345674,0.119223,0.238968,0.507672,0.058999,0.324393,0.393998,0.052977,0.002155,0.074814,0.200218,0.068465,0.238482,0.34994,0.023572,-0.105068
bathrooms,0.244119,1.0,0.299919,0.252917,0.141415,0.05814,0.064329,0.167568,0.022311,0.208128,0.006432,0.018181,-0.002394,0.01325,0.038757,0.03058,0.082128,0.084417,0.004309,-0.008691
bedrooms,0.624151,0.299919,1.0,0.471137,0.272044,0.11695,0.13892,0.3716,0.048489,0.261931,0.238154,0.038845,0.004455,-0.012103,0.120816,0.052212,0.148449,0.260328,0.019485,-0.092371
price,0.572825,0.252917,0.471137,1.0,0.400418,0.17701,0.189393,0.417486,-0.064085,0.419729,0.452589,0.044827,0.030003,0.042686,0.20341,0.084699,0.265729,0.234975,0.044367,-0.150515
cleaning_fee,0.345674,0.141415,0.272044,0.400418,1.0,0.426571,0.252798,0.286336,0.004211,0.288628,0.360535,0.025058,0.108245,0.01787,0.308902,0.12056,0.236943,0.190704,0.114128,-0.153712
security_deposit,0.119223,0.05814,0.11695,0.17701,0.426571,1.0,0.123865,0.103002,-0.007139,0.152926,0.195302,0.006762,0.129478,-0.052223,0.190306,0.045322,0.117211,0.084702,0.056503,-0.089796
extra_people,0.238968,0.064329,0.13892,0.189393,0.252798,0.123865,1.0,0.322429,0.003816,0.182756,0.055658,0.003447,-0.006922,0.020086,0.217387,0.087636,0.086148,0.136759,0.138789,-0.035623
guests_included,0.507672,0.167568,0.3716,0.417486,0.286336,0.103002,0.322429,1.0,0.014502,0.219842,0.262869,0.021305,0.011363,0.040728,0.190085,0.051326,0.151793,0.226091,0.041528,-0.081339
distance,0.058999,0.022311,0.048489,-0.064085,0.004211,-0.007139,0.003816,0.014502,1.0,-0.098646,0.019858,-0.001815,0.005327,0.056481,-0.02988,0.0194,0.130416,0.039971,0.050364,-0.048867
size,0.324393,0.208128,0.261931,0.419729,0.288628,0.152926,0.182756,0.219842,-0.098646,1.0,0.224364,0.024537,0.011177,0.014969,0.144067,0.067237,0.156684,0.147691,0.025013,-0.084157


In [280]:
df.head(5)

Unnamed: 0,accommodates,bathrooms,bedrooms,price,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,Laptop_friendly_workspace,TV,Family_kid_friendly,Host_greets_you,Smoking_allowed
0,4,1.0,2.0,52.0,20.0,300.0,10.0,2,8.492239,20.0,1.0,2.0,2,0.0,2.0,0.0,0.0,1.0,0.0,0.0
1,2,1.0,1.0,125.0,45.0,200.0,0.0,1,3.951948,200.0,2.0,2.0,5,1.0,2.0,0.0,1.0,0.0,0.0,0.0
2,2,1.0,1.0,30.0,20.0,0.0,0.0,1,3.748695,45.749963,1.0,2.0,5,1.0,1.0,1.0,0.0,0.0,1.0,0.0
3,2,1.0,1.0,25.0,0.0,0.0,10.0,1,5.062518,18.0,1.0,2.0,14,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,2,1.0,1.0,54.0,0.0,0.0,9.0,1,3.184217,55.055173,1.0,2.0,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [281]:
df1.head(5)

Unnamed: 0,accommodates,bathrooms,bedrooms,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,Laptop_friendly_workspace,TV,Family_kid_friendly,Host_greets_you,Smoking_allowed
0,2,1.0,0.0,0.0,0.0,0.0,1,3.655893,49.29849,2.0,2.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,1.5,2.0,39.0,200.0,10.0,2,4.043631,65.019556,2.0,2.0,6,0.0,2.0,1.0,0.0,0.0,0.0,0.0
2,2,1.0,1.0,40.0,0.0,50.0,1,3.225601,88.24157,1.0,2.0,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,3,1.0,0.0,0.0,0.0,7.0,1,2.860649,10.0,2.0,2.0,28,0.0,2.0,0.0,1.0,0.0,1.0,0.0
4,2,1.0,1.0,20.0,100.0,15.0,1,4.58386,75.0,1.0,2.0,3,0.0,2.0,1.0,0.0,1.0,0.0,0.0


The correlations look good.  There appear to be no coorelated columns.

### Spliting the data 

70% for training, 30% for testing

In [282]:
from sklearn.model_selection import train_test_split

feature_col_names = ['accommodates', 'bathrooms', 'bedrooms', 'cleaning_fee', 'security_deposit', 'distance', 'size', 'room_type', 'bed_type', 'instant_bookable', 'cancellation_policy', 'Laptop_friendly_workspace', 'TV', 'Family_kid_friendly', 'Smoking_allowed'] #,   'minimum_nights', , 'extra_people', , 'guests_included' ,  , 'Host_greets_you'
predicted_class_names = ['price']

X = df[feature_col_names].values     # predictor feature columns (8 X m)
y = df[predicted_class_names].values # predicted class (1=true, 0=false) column (1 X m)
split_test_size = 0.17

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42) 
                            # test_size = 0.2 is 20%, 42 is the answer to everything

In [283]:
from sklearn.model_selection import train_test_split

feature_col_names = ['accommodates', 'bathrooms', 'bedrooms', 'cleaning_fee', 'security_deposit', 'distance', 'size', 'room_type', 'bed_type', 'instant_bookable', 'cancellation_policy', 'Laptop_friendly_workspace', 'TV', 'Family_kid_friendly', 'Smoking_allowed'] #,   'minimum_nights', , 'extra_people', , 'guests_included' ,  , 'Host_greets_you'
predicted_class_names = ['price']

X1 = df1[feature_col_names].values     # predictor feature columns (8 X m)
#y = df1[predicted_class_names].values # predicted class (1=true, 0=false) column (1 X m)
split_test_size = 1.0


                            # test_size = 0.2 is 20%, 42 is the answer to everything

We check to ensure we have the the desired 70% train, 30% test split of the data

In [284]:
print("{0:0.2f}% in training set".format((len(X_train)/len(df.index)) * 100))
print("{0:0.2f}% in test set".format((len(X_test)/len(df.index)) * 100))

83.00% in training set
17.00% in test set


## Training Initial Algorithm - Naive Bayes

In [285]:
from sklearn.naive_bayes import GaussianNB

# create Gaussian Naive Bayes model object and train it with the data
nb_model = GaussianNB()

nb_model.fit(X_train, y_train.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

### Performance on Training Data

In [286]:
# predict values using the training data
nb_predict_train = nb_model.predict(X_train)

# import the performance metrics library
from sklearn import metrics

# Accuracy
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))
print()

Accuracy: 0.0190



### Performance on Testing Data

In [287]:
# predict values using the testing data
nb_predict_test = nb_model.predict(X_test)

from sklearn import metrics

# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_test)))


Accuracy: 0.0046


Trying to make Linear Regression

#### Metrics

In [288]:
#print("Confusion Matrix")
# Note the use of labels for set 1=True to upper left and 0=False to lower right
#print(f"{0}".format(metrics.confusion_matrix(y_test, nb_predict_test, labels=[1, 0])))
#print("")

#print("Classification Report")
#print(metrics.classification_report(y_test, nb_predict_test, labels=[1,0]))


In [289]:
scaled_features = df.copy()

In [290]:
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import MinMaxScaler
col_names = ['size'] #distance',
features = scaled_features[col_names]
scaler = MinMaxScaler().fit(features.values)
features = scaler.transform(features.values)
df[col_names] = features
col_names = ['distance', 'size'] #distance',
features = scaled_features[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)
df[col_names] = features
col_names = ['bedrooms', 'bathrooms']
features = scaled_features[col_names]
scaler = MinMaxScaler().fit(features.values)
features = scaler.transform(features.values)
df[col_names] = features
#col_names = ['bedrooms', 'bathrooms']
#features = scaled_features[col_names]
#scaler = StandardScaler().fit(features.values)
#features = scaler.transform(features.values)
#df[col_names] = features

## Random Forest

In [291]:
#from sklearn.preprocessing import StandardScaler 
#scale_features_std = StandardScaler() 
#X_train = scale_features_std.fit_transform(X_train) 
#X_test = scale_features_std.transform(X_test) 
#print(n_features)

In [292]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import math
#n_features = 12
#rf_model =  RandomForestRegressor(n_estimators=300)
#rf_model = RandomForestClassifier(n_estimators=500,bootstrap=True,max_features=n_features)      # Create random forest object rf_model = RandomForestClassifier(random_state=42) 
#rf_model.fit(X_train, y_train.ravel()) 

regressor = RandomForestRegressor(n_estimators=10000, bootstrap=True, oob_score=True, random_state=0)
regressor.fit(X_train, y_train.ravel())
y_pred = regressor.predict(X_test)

### Predict Training Data

In [293]:
from sklearn import metrics
from sklearn.metrics import r2_score

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print(r2_score(y_test, y_pred, multioutput='variance_weighted'))

Mean Absolute Error: 14.722822214285713
Mean Squared Error: 650.6809186983143
Root Mean Squared Error: 25.508447986859455
0.6322828368479144


In [294]:
predictions = regressor.predict(X1)

In [295]:
sample_submission.tail()

Unnamed: 0,ID,price
5485,9537,37.545465
5486,264,49.757493
5487,21268,51.421611
5488,1932,50.611679
5489,18537,48.430016


In [296]:
sample_submission.head()

Unnamed: 0,ID,price
0,21787,36.210829
1,3199,67.251542
2,9990,60.19942
3,13052,40.990933
4,6004,52.299018


In [297]:
sample_submission['price'] = predictions

In [298]:
sample_submission.to_csv('my_submission.csv', index=False)

In [299]:
rf_predict_train = rf_model.predict(X_train)
# training metrics
#print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))
#print("Accuracy: {0:.4f}".format(rf_model.score(y_train, rf_predict_train))) #regressor.score(X_train, y_train)

NameError: name 'rf_model' is not defined

### Predict Test Data

In [0]:
rf_predict_test = rf_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(rf_model.score(y_test, rf_predict_test)))

In [0]:
#print(metrics.confusion_matrix(y_test, rf_predict_test, labels=[1, 0]) )
#print("")
#print("Classification Report")
#print(metrics.classification_report(y_test, rf_predict_test, labels=[1,0]))