In [1]:
import pandas as pd
import numpy as np
from numpy import math
from datetime import datetime
import re

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

import chart_studio.plotly as py 
import seaborn as sns
import plotly.express as px
import cufflinks as cf
import plotly.graph_objects as go #this is used for making more customizable graphs 

# if you want plotly to work in your jupiter notbook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()

#if y ou want everything to show run the command below
%matplotlib inline




# Load the data

In [2]:

url1 = 'https://raw.githubusercontent.com/haganjonathan42/ReDI_project/main/airnb.csv'

In [3]:
df1 = pd.read_csv(url1)

In [4]:
df1.head(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed
0,"Chalet in Skykomish, Washington, US",Sky Haus - A-Frame Cabin,Jun 11 - 16,306.0,229.0,4.85 (531),4 beds
1,"Cabin in Hancock, New York, US",The Catskill A-Frame - Mid-Century Modern Cabin,Jun 6 - 11,485.0,170.0,4.77 (146),4 beds
2,"Cabin in West Farmington, Ohio, US",The Triangle: A-Frame Cabin for your city retreat,Jul 9 - 14,119.0,522.0,4.91 (515),4 beds
3,"Home in Blue Ridge, Georgia, US",*Summer Sizzle* 5 Min to Blue Ridge* Pets* Hot...,Jun 11 - 16,192.0,348.0,4.94 (88),5 beds
4,"Treehouse in Grandview, Texas, US",Luxury Treehouse Couples Getaway w/ Peaceful V...,Jun 4 - 9,232.0,196.0,4.99 (222),1 queen bed


In [5]:
#To get an over view of the data
# The data types and the quantity of missing values in the data

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Title                   953 non-null    object
 1   Detail                  953 non-null    object
 2   Date                    953 non-null    object
 3   Price(in dollar)        953 non-null    object
 4   Offer price(in dollar)  166 non-null    object
 5   Review and rating       953 non-null    object
 6   Number of bed           953 non-null    object
dtypes: object(7)
memory usage: 52.2+ KB


# Preprocessing and cleaning the data.

### Review and Rating

In [6]:
# Create a two new coloums one for Ratinngs and another for Num. Reviews from the Reviews and ratings column

# Split the Review and rating on space and table the index of 0 for ratings.
# Do the same for the second column but take the index of 1 and strip the brackets.
df1['Rating'] = df1['Review and rating'].str.split(' ', expand=True)[0]
df1['Number_of_reviews'] = df1['Review and rating'].str.split(' ', expand=True)[1].str.strip('()')

df1.head(5)


Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Rating,Number_of_reviews
0,"Chalet in Skykomish, Washington, US",Sky Haus - A-Frame Cabin,Jun 11 - 16,306.0,229.0,4.85 (531),4 beds,4.85,531
1,"Cabin in Hancock, New York, US",The Catskill A-Frame - Mid-Century Modern Cabin,Jun 6 - 11,485.0,170.0,4.77 (146),4 beds,4.77,146
2,"Cabin in West Farmington, Ohio, US",The Triangle: A-Frame Cabin for your city retreat,Jul 9 - 14,119.0,522.0,4.91 (515),4 beds,4.91,515
3,"Home in Blue Ridge, Georgia, US",*Summer Sizzle* 5 Min to Blue Ridge* Pets* Hot...,Jun 11 - 16,192.0,348.0,4.94 (88),5 beds,4.94,88
4,"Treehouse in Grandview, Texas, US",Luxury Treehouse Couples Getaway w/ Peaceful V...,Jun 4 - 9,232.0,196.0,4.99 (222),1 queen bed,4.99,222


In [7]:
# Dealing with missing values in the column Number_of_reviews

missing_num_reviews = df1[df1['Number_of_reviews'].isnull()]
missing_num_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22 entries, 18 to 804
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Title                   22 non-null     object
 1   Detail                  22 non-null     object
 2   Date                    22 non-null     object
 3   Price(in dollar)        22 non-null     object
 4   Offer price(in dollar)  5 non-null      object
 5   Review and rating       22 non-null     object
 6   Number of bed           22 non-null     object
 7   Rating                  22 non-null     object
 8   Number_of_reviews       0 non-null      object
dtypes: object(9)
memory usage: 1.7+ KB


In [8]:
# Here wehre number of reviews is missing it was replaced with 0

df1['Number_of_reviews'] = df1['Number_of_reviews'].fillna(0)
df1['Number_of_reviews'].isnull().unique()

array([False])

In [9]:
# Dealing with missing dat in the rating column 

# Here we replace New with nan so that we are not confused by people rating 0 in rows where there is no rating
df1.loc[df1['Rating'] == 'New', 'Rating'] = np.nan
df1.sample(5)



Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Rating,Number_of_reviews
376,"Tiny home in Muldenhammer, Germany",Hascherle Hitt,Jun 5 - 10,86.0,,5.0 (57),2 beds,5.0,57
211,"Boat in Stuart, Florida, US",House Boat Sunset Bay Marina Stuart Fl,Sep 10 - 15,127.0,,4.92 (279),2 beds,4.92,279
163,"Condo in Ocean City, Maryland, US",OceanfrontStudio-Slps4-Pool-Pickleball-HBOMax-...,Jun 9 - 16,291.0,191.0,4.94 (80),2 beds,4.94,80
690,"Apartment in Tambon Patong, Thailand",1 BR Phuket Modern Luxury Living,Jun 6 - 11,44.0,,4.68 (202),1 king bed,4.68,202
470,"Earthen home in Mueang Chiang Mai District, Th...",Leafy Greens Chiangmai : Mushroom M2,Jun 13 - 20,22.0,,4.87 (210),1 bunk bed,4.87,210


In [10]:
missing_rating = df1[df1['Rating'].isnull()]
missing_rating.head(5)

#this was fpound to be true because where there is no rating the number of reviews was 0

Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Rating,Number_of_reviews
18,"Treehouse in Travelers Rest, South Carolina, US",The Forestry House - A modern luxury treehouse.,Aug 20 - 25,282.0,417.0,New,1 bed,,0
98,"Guesthouse in Destin, Florida, US",New Carriage House * 1 Block to Beach * King Bed!,Aug 13 - 18,231.0,,New,1 king bed,,0
100,"Home in Bellingham, Washington, US",Bellingham A-Frame - Hot tub & Firepit,Jun 11 - 16,232.0,,New,2 beds,,0
101,"Apartment in San Juan, Puerto Rico","Villa Bohème 3, 1BR with Patio",Jul 8 - 13,115.0,,New,1 bed,,0
111,"Treehouse in Wardensville, West Virginia, US",Eagles Nest Treehouse w/Hot tub!,Jun 4 - 10,244.0,63.0,New,3 beds,,0


In [11]:
# Making sure that the column was in the right data type

df1['Rating'] = df1['Rating'].replace('None', np.nan).astype(float)
df1[['Number_of_reviews']] = df1[['Number_of_reviews']].astype(int)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   953 non-null    object 
 1   Detail                  953 non-null    object 
 2   Date                    953 non-null    object 
 3   Price(in dollar)        953 non-null    object 
 4   Offer price(in dollar)  166 non-null    object 
 5   Review and rating       953 non-null    object 
 6   Number of bed           953 non-null    object 
 7   Rating                  931 non-null    float64
 8   Number_of_reviews       953 non-null    int32  
dtypes: float64(1), int32(1), object(7)
memory usage: 63.4+ KB


### Date 

In [12]:
df1[['Start Date', 'End Date']] = df1['Date'].str.split(' - ', expand=True)
df1['Month'] = df1['Start Date'].str.split().str[0]
df1['New End Date'] = df1['Month'] + ' ' + df1['End Date']
df1.sample(10)



Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Rating,Number_of_reviews,Start Date,End Date,Month,New End Date
42,"Chalet in Cajicá, Colombia",Casa Anís,Jun 1 - 6,40.0,,4.96 (202),3 beds,4.96,202,Jun 1,6,Jun,Jun 6
84,"Loft in The Blue Mountains, Canada",Chateau Ridge Chalet w/ Mountain View,Jun 5 - 12,154.0,,4.84 (95),4 beds,4.84,95,Jun 5,12,Jun,Jun 12
246,"Home in Oceanside, Oregon, US",Movie Theater + Kid Friendly | Book the Dome!,Jun 4 - 9,200.0,,4.97 (64),1 double bed,4.97,64,Jun 4,9,Jun,Jun 9
950,Guesthouse in San Antonio,Casa Alexander,May 1 - 6,73.0,,4.92 (282),5 beds,4.92,282,May 1,6,May,May 6
462,"Apartment in Phuket, Thailand, Thailand",White Breeze Pool 1BD Apartment,Jun 4 - 10,64.0,,4.69 (112),1 queen bed,4.69,112,Jun 4,10,Jun,Jun 10
429,"Campsite in Rif, Iceland",Washitsu.Casa camper Parking place w/house access,Jun 1 - 6,16.0,,4.91 (107),1 double bed,4.91,107,Jun 1,6,Jun,Jun 6
173,"Cottage in Oceanside, California, US","Beach Bungalow on the Sand, Steps to the Ocean",Jun 3 - 8,483.0,,4.93 (213),2 beds,4.93,213,Jun 3,8,Jun,Jun 8
202,"Cabin in Puerto Viejo de Talamanca, Costa Rica",Tiny Beach front house,Jul 24 - 29,89.0,,4.73 (140),2 beds,4.73,140,Jul 24,29,Jul,Jul 29
639,"Apartment in Tambon Kamala, Thailand",(A)Charming Pool Access One Bedroom Apartment,Jun 3 - 8,65.0,,4.82 (68),1 double bed,4.82,68,Jun 3,8,Jun,Jun 8
177,"Farm stay in Saint Marys, Canada",Cozy Farm Stay Getaway,Jun 11 - 16,99.0,,4.86 (183),2 queen beds,4.86,183,Jun 11,16,Jun,Jun 16


In [13]:
# The new New End Date column has two months (Jul Aug 5)
df1.iloc[607]

Title                     Villa in General Luna, Philippines
Detail                                 Oceanfront Pool Villa
Date                                          Jul 30 - Aug 5
Price(in dollar)                                      163.00
Offer price(in dollar)                                   NaN
Review and rating                                  4.96 (23)
Number of bed                                     1 king bed
Rating                                                  4.96
Number_of_reviews                                         23
Start Date                                            Jul 30
End Date                                               Aug 5
Month                                                    Jul
New End Date                                       Jul Aug 5
Name: 607, dtype: object

In [14]:
# Dealing with the issue of double mont if the month in the rogib=nal date spands more than on month 

for index, row in df1.iterrows():
    end_date = row['New End Date']
    if len(end_date.split()) > 2:
        end_date = ' '.join(end_date.split()[1:])
    # Update the 'New End Date' column in the DataFrame
    df1.at[index, 'New End Date'] = end_date

In [15]:

df1.iloc[607]

Title                     Villa in General Luna, Philippines
Detail                                 Oceanfront Pool Villa
Date                                          Jul 30 - Aug 5
Price(in dollar)                                      163.00
Offer price(in dollar)                                   NaN
Review and rating                                  4.96 (23)
Number of bed                                     1 king bed
Rating                                                  4.96
Number_of_reviews                                         23
Start Date                                            Jul 30
End Date                                               Aug 5
Month                                                    Jul
New End Date                                           Aug 5
Name: 607, dtype: object

In [16]:
# convert the dates into the right data formats and date type 
# In the dataset the date didi noit have a year so we set it to 2018
# Calculate the data difference 

df1['Start Date'] = pd.to_datetime(df1['Start Date'] + ' 2018', format='%b %d %Y')
df1['New End Date'] = pd.to_datetime(df1['New End Date'] + ' 2018', format='%b %d %Y')
df1['Date Difference'] = (df1['New End Date'] - df1['Start Date']).dt.days



In [17]:
df1.head(2)


Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Rating,Number_of_reviews,Start Date,End Date,Month,New End Date,Date Difference
0,"Chalet in Skykomish, Washington, US",Sky Haus - A-Frame Cabin,Jun 11 - 16,306.0,229.0,4.85 (531),4 beds,4.85,531,2018-06-11,16,Jun,2018-06-16,5
1,"Cabin in Hancock, New York, US",The Catskill A-Frame - Mid-Century Modern Cabin,Jun 6 - 11,485.0,170.0,4.77 (146),4 beds,4.77,146,2018-06-06,11,Jun,2018-06-11,5


In [18]:
df1.drop(['Month', 'End Date', 'Review and rating', 'Offer price(in dollar)'], axis=1, inplace=True)

In [19]:
df1.sample(2)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference
402,"Home in Atouguia da Baleia, Portugal","Spectacular view of Supertubos beach, Peniche",Oct 1 - 6,126.0,3 beds,4.82,34,2018-10-01,2018-10-06,5
390,"Tiny home in Muldenhammer, Germany",Hascherle Hitt,Jun 5 - 10,86.0,1 bed,5.0,57,2018-06-05,2018-06-10,5


### Dealing with the City 

In [20]:
# Split the 'Title' column into 'Facility' and others column 
# the column of interest here is the facility 
df1[['Facility', 'others']] = df1['Title'].str.split(' in ', n= 2, expand=True)
df1.sample(2)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,others
487,"Home in Kuta, Indonesia",Villa Seminyak 1 Bedroom 10 Mins to Beach,Jun 11 - 16,69.0,1 king bed,4.67,340,2018-06-11,2018-06-16,5,Home,"Kuta, Indonesia"
870,Guest suite in Los Angeles,Echo Park Retreat with Views,May 1 - 6,116.0,4 double beds,4.94,580,2018-05-01,2018-05-06,5,Guest suite,Los Angeles


In [21]:
# from the othjers column extract the city 
df1['City'] = df1['others'].str.split(',').str[0]
df1.sample(2)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,others,City
499,"Villa in Langkawi, Malaysia",Alamanda Tropical Wooden Villa - Pool View,Jun 15 - 20,97.0,2 beds,4.87,196,2018-06-15,2018-06-20,5,Villa,"Langkawi, Malaysia",Langkawi
775,Cabin in Newry,Cozy Cabin on Sunday River ~,May 1 - 6,202.0,2 beds,4.93,41,2018-05-01,2018-05-06,5,Cabin,Newry,Newry


In [22]:
# Country

df1['Country'] = df1['others'].str.split(',').apply(lambda x: x[-1].strip() if len(x) > 1 else None)
df1.sample(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,others,City,Country
403,"Home in Bad Sankt Leonhard im Lavanttal, Austria",1A chalet resting crickets + hiking with indoo...,Jun 22 - 27,123.0,3 beds,4.67,110,2018-06-22,2018-06-27,5,Home,"Bad Sankt Leonhard im Lavanttal, Austria",Bad Sankt Leonhard im Lavanttal,Austria
531,"Home in Kecamatan Tampaksiring, Indonesia",Six-Bamboo Villa in Eco Six Bali Resort,Sep 30 - Oct 6,458.0,1 bed,5.0,5,2018-09-30,2018-10-06,6,Home,"Kecamatan Tampaksiring, Indonesia",Kecamatan Tampaksiring,Indonesia
242,"Treehouse in Zona Hotelera, Mexico","Tree House - Beach Area, KS bed, 3 PPL",Jul 10 - 15,157.0,1 double bed,4.75,165,2018-07-10,2018-07-15,5,Treehouse,"Zona Hotelera, Mexico",Zona Hotelera,Mexico
938,Loft in Brooklyn,Authentic Luxury Designed Loft,May 1 - 6,178.0,3 beds,4.88,253,2018-05-01,2018-05-06,5,Loft,Brooklyn,Brooklyn,
76,"Tiny home in Maynooth, Canada",Cozy Cabin #2 - private cabin in the wilderness!,Jun 4 - 9,71.0,2 beds,4.89,90,2018-06-04,2018-06-09,5,Tiny home,"Maynooth, Canada",Maynooth,Canada


In [23]:
missing_country = df1[df1['Country'].isnull()]
missing_country.head(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,others,City,Country
702,Tiny home in Millersburg,Forest Haven - Otium,May 1 - 6,224.0,1 queen bed,4.85,218,2018-05-01,2018-05-06,5,Tiny home,Millersburg,Millersburg,
703,Treehouse in Crane Hill,WANDERLUST TREEHOUSE Book an experience!,May 1 - 6,402.0,1 queen bed,4.99,271,2018-05-01,2018-05-06,5,Treehouse,Crane Hill,Crane Hill,
704,Home in Telluride,Architectural Masterpiece | Best View in Tellu...,May 1 - 6,732.0,5 beds,4.97,76,2018-05-01,2018-05-06,5,Home,Telluride,Telluride,
705,Cabin in West Farmington,The Triangle: A-Frame Cabin for your city retreat,May 1 - 6,123.0,4 beds,4.91,515,2018-05-01,2018-05-06,5,Cabin,West Farmington,West Farmington,
706,Cabin in Crosby,Cozy Louise Cabin - Direct Cuyuna MTB Access/S...,May 1 - 6,225.0,2 queen beds,5.0,18,2018-05-01,2018-05-06,5,Cabin,Crosby,Crosby,


In [24]:
# State 

df1['State'] = df1['others'].apply(lambda x: x.split(',')[-2].strip() if ',' in x else None)
df1.sample(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,others,City,Country,State
906,Condo in Tybee Island,Direct Waterfront.Keyless entry.Dolphin watch.,May 1 - 6,220.0,2 beds,4.97,279,2018-05-01,2018-05-06,5,Condo,Tybee Island,Tybee Island,,
379,"Guesthouse in Perranwell Station, UK",The Folly - Cornwall,Jun 18 - 23,111.0,1 bed,5.0,11,2018-06-18,2018-06-23,5,Guesthouse,"Perranwell Station, UK",Perranwell Station,UK,Perranwell Station
839,Apartment in Galveston,Beach Front Condo In Casa Del Mar,May 1 - 6,122.0,2 beds,4.72,389,2018-05-01,2018-05-06,5,Apartment,Galveston,Galveston,,
827,Home in Kissimmee,House by the Mouse: Something for Everyone!,May 19 - 24,250.0,6 beds,4.89,157,2018-05-19,2018-05-24,5,Home,Kissimmee,Kissimmee,,
547,"Place to stay in Kecamatan Kuta Selatan, Indon...",Gypsea Bali Bungalow with private bathroom and AC,Sep 11 - 16,155.0,1 king bed,4.92,78,2018-09-11,2018-09-16,5,Place to stay,"Kecamatan Kuta Selatan, Indonesia",Kecamatan Kuta Selatan,Indonesia,Kecamatan Kuta Selatan


In [25]:

# Check if 'State' and 'City' columns are the same
same_location = df1['State'] == df1['City']

# Replace 'State' column with 'None' for matching locations
df1.loc[same_location, 'State'] = 'None'

df1.sample(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,others,City,Country,State
691,"Apartment in Shah Alam, Malaysia",*SETIA ALAM* Big Luxury Comfort Homestay #Trefoil,Jun 7 - 12,30.0,1 king bed,4.87,171,2018-06-07,2018-06-12,5,Apartment,"Shah Alam, Malaysia",Shah Alam,Malaysia,
107,"Hut in Uvero Alto - Bavaro - Punta Cana, Domin...",Hut #2 Romantic Luxury on the sand,Aug 1 - 6,186.0,2 beds,4.94,226,2018-08-01,2018-08-06,5,Hut,"Uvero Alto - Bavaro - Punta Cana, Dominican Re...",Uvero Alto - Bavaro - Punta Cana,Dominican Republic,
656,"Apartment in Tambon Patong, Thailand",SUNSET SEAVIEW DELUXE PATONG - POOL+TERRACE,Jun 25 - 30,64.0,2 beds,4.8,159,2018-06-25,2018-06-30,5,Apartment,"Tambon Patong, Thailand",Tambon Patong,Thailand,
206,"Tiny home in Key Largo, Florida, US","Oceanfront Lookout Point w/two Kayaks,Beach & ...",Aug 12 - 17,254.0,1 queen bed,4.86,90,2018-08-12,2018-08-17,5,Tiny home,"Key Largo, Florida, US",Key Largo,US,Florida
357,"Room in Borgo Valbelluna, Italy",Bavarian room in the Dolomites,Jun 1 - 6,84.0,1 sofa bed,4.8,5,2018-06-01,2018-06-06,5,Room,"Borgo Valbelluna, Italy",Borgo Valbelluna,Italy,


In [26]:
df1 = df1.drop(["others"], axis=1)

In [27]:
# The country column were there are missing data were from the US so that was fixed

mask = df1['Country'].isna()
df1.loc[ mask,'Country'] = 'US'


### Ocupancy 

In [28]:
df1['Ocupancy'] = df1['Number of bed'].str.split().str[0]
df1['Bed type'] = df1['Number of bed'].str.split().str[1]
df1.sample(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type
610,"Room in Phuket, Thailand",Grand Seaview Pool Suite •,Jun 1 - 6,111.0,2 beds,4.85,26,2018-06-01,2018-06-06,5,Room,Phuket,Thailand,,2,beds
938,Loft in Brooklyn,Authentic Luxury Designed Loft,May 1 - 6,178.0,3 beds,4.88,253,2018-05-01,2018-05-06,5,Loft,Brooklyn,US,,3,beds
231,"Home in Avalon, California, US","Tropical Island Escape w/ Deck, Walk to Avalon...",Jun 19 - 24,336.0,1 queen bed,4.34,80,2018-06-19,2018-06-24,5,Home,Avalon,US,California,1,queen
431,"Home in Jumilhac-le-Grand, France",Bettyjems Stage Refuge,Jun 21 - 28,20.0,4 beds,,0,2018-06-21,2018-06-28,7,Home,Jumilhac-le-Grand,France,,4,beds
723,Place to stay in Shippensburg,A- Frame Cabin,May 1 - 6,273.0,2 queen beds,5.0,168,2018-05-01,2018-05-06,5,Place to stay,Shippensburg,US,,2,queen


In [29]:
# check the data where where bed type is beds we replace it with bed 
df1.loc[df1['Bed type'] == 'beds', 'Bed type'] = 'bed'
df1.sample(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type
610,"Room in Phuket, Thailand",Grand Seaview Pool Suite •,Jun 1 - 6,111.0,2 beds,4.85,26,2018-06-01,2018-06-06,5,Room,Phuket,Thailand,,2,bed
614,"Home in Lembang, Indonesia",Minimax House by wiandra,Jun 3 - 8,96.0,4 beds,4.84,81,2018-06-03,2018-06-08,5,Home,Lembang,Indonesia,,4,bed
84,"Loft in The Blue Mountains, Canada",Chateau Ridge Chalet w/ Mountain View,Jun 5 - 12,154.0,4 beds,4.84,95,2018-06-05,2018-06-12,7,Loft,The Blue Mountains,Canada,,4,bed
908,Home in Netarts,Retro Retreat by the Bay,May 1 - 6,136.0,4 beds,4.9,201,2018-05-01,2018-05-06,5,Home,Netarts,US,,4,bed
727,Yurt in Rising Fawn,Cherry Blossom Yurt on Lookout Mountain,May 1 - 6,157.0,1 king bed,4.97,666,2018-05-01,2018-05-06,5,Yurt,Rising Fawn,US,,1,king


In [30]:
df1['Bed type'].unique()

array(['bed', 'queen', 'double', 'king', 'sofa', 'single', 'bunk'],
      dtype=object)

In [31]:
df1.sort_values('Price(in dollar)', ascending=False).head(20)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type
191,"Farm stay in Saint Marys, Canada",Cozy Farm Stay Getaway,Jun 11 - 16,99.0,2 double beds,4.86,183,2018-06-11,2018-06-16,5,Farm stay,Saint Marys,Canada,,2,double
897,Condo in Fort Walton Beach,"PERFECT VIEW!! Waterfront, Marina, and 2 Pools!",May 4 - 9,99.0,2 queen beds,4.98,125,2018-05-04,2018-05-09,5,Condo,Fort Walton Beach,US,,2,queen
215,"Guesthouse in Naalehu, Hawaii, US",The Ohana,Jun 23 - 28,99.0,1 queen bed,4.86,182,2018-06-23,2018-06-28,5,Guesthouse,Naalehu,US,Hawaii,1,queen
219,"Guesthouse in Naalehu, Hawaii, US",The Ohana,Jun 23 - 28,99.0,1 queen bed,4.86,182,2018-06-23,2018-06-28,5,Guesthouse,Naalehu,US,Hawaii,1,queen
177,"Farm stay in Saint Marys, Canada",Cozy Farm Stay Getaway,Jun 11 - 16,99.0,2 queen beds,4.86,183,2018-06-11,2018-06-16,5,Farm stay,Saint Marys,Canada,,2,queen
496,"Villa in Koh Yao Noi, Thailand","Eagles Nest, Luxury Villa, Koh Yao Noi",Jun 1 - 6,986.0,22 beds,4.95,43,2018-06-01,2018-06-06,5,Villa,Koh Yao Noi,Thailand,,22,bed
432,"Cabin in Rosenthal-Bielatal, Germany",Mountain hut in the Elbe sandstone with log ca...,Jun 4 - 9,98.0,4 beds,4.78,169,2018-06-04,2018-06-09,5,Cabin,Rosenthal-Bielatal,Germany,,4,bed
421,"Treehouse in Verson, France",2 Cabins perched above a pond!,Jun 4 - 10,98.0,4 double beds,4.94,530,2018-06-04,2018-06-10,6,Treehouse,Verson,France,,4,double
765,Home in Summertown,Water side Cozy cabin,May 1 - 6,98.0,4 beds,4.91,318,2018-05-01,2018-05-06,5,Home,Summertown,US,,4,bed
319,"Treehouse in Fountana, Greece",Nasos' Treehouse,Jun 1 - 6,98.0,2 single beds,4.81,98,2018-06-01,2018-06-06,5,Treehouse,Fountana,Greece,,2,single


In [32]:
df1['Ocupancy'] = df1['Ocupancy'].astype(int)

In [33]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Title              953 non-null    object        
 1   Detail             953 non-null    object        
 2   Date               953 non-null    object        
 3   Price(in dollar)   953 non-null    object        
 4   Number of bed      953 non-null    object        
 5   Rating             931 non-null    float64       
 6   Number_of_reviews  953 non-null    int32         
 7   Start Date         953 non-null    datetime64[ns]
 8   New End Date       953 non-null    datetime64[ns]
 9   Date Difference    953 non-null    int64         
 10  Facility           953 non-null    object        
 11  City               953 non-null    object        
 12  Country            953 non-null    object        
 13  State              702 non-null    object        
 14  Ocupancy  

In [34]:
df1['Bed type'].unique()

array(['bed', 'queen', 'double', 'king', 'sofa', 'single', 'bunk'],
      dtype=object)

### Converting the Price to the right data dytpe 

In [35]:
df1['Price(in dollar)'] = df1['Price(in dollar)'].str.replace(',','').astype(float)

In [36]:
df1.isnull().sum()

Title                  0
Detail                 0
Date                   0
Price(in dollar)       0
Number of bed          0
Rating                22
Number_of_reviews      0
Start Date             0
New End Date           0
Date Difference        0
Facility               0
City                   0
Country                0
State                251
Ocupancy               0
Bed type               0
dtype: int64

In [37]:
missing_state = df1[df1['State'].isnull()]
missing_state.head(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type
702,Tiny home in Millersburg,Forest Haven - Otium,May 1 - 6,224.0,1 queen bed,4.85,218,2018-05-01,2018-05-06,5,Tiny home,Millersburg,US,,1,queen
703,Treehouse in Crane Hill,WANDERLUST TREEHOUSE Book an experience!,May 1 - 6,402.0,1 queen bed,4.99,271,2018-05-01,2018-05-06,5,Treehouse,Crane Hill,US,,1,queen
704,Home in Telluride,Architectural Masterpiece | Best View in Tellu...,May 1 - 6,732.0,5 beds,4.97,76,2018-05-01,2018-05-06,5,Home,Telluride,US,,5,bed
705,Cabin in West Farmington,The Triangle: A-Frame Cabin for your city retreat,May 1 - 6,123.0,4 beds,4.91,515,2018-05-01,2018-05-06,5,Cabin,West Farmington,US,,4,bed
706,Cabin in Crosby,Cozy Louise Cabin - Direct Cuyuna MTB Access/S...,May 1 - 6,225.0,2 queen beds,5.0,18,2018-05-01,2018-05-06,5,Cabin,Crosby,US,,2,queen


In [38]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Title              953 non-null    object        
 1   Detail             953 non-null    object        
 2   Date               953 non-null    object        
 3   Price(in dollar)   953 non-null    float64       
 4   Number of bed      953 non-null    object        
 5   Rating             931 non-null    float64       
 6   Number_of_reviews  953 non-null    int32         
 7   Start Date         953 non-null    datetime64[ns]
 8   New End Date       953 non-null    datetime64[ns]
 9   Date Difference    953 non-null    int64         
 10  Facility           953 non-null    object        
 11  City               953 non-null    object        
 12  Country            953 non-null    object        
 13  State              702 non-null    object        
 14  Ocupancy  

# Eploratory Analysis 

In [39]:
# Country with the most visited facility
top10_country = df1['Country'].value_counts().head(10)
top10_country

US             366
Indonesia      110
Thailand        69
Canada          44
Mexico          40
Philippines     37
Italy           34
UK              34
Malaysia        26
France          25
Name: Country, dtype: int64

In [40]:

top5_country = df1['Country'].value_counts().head(5)
list_top5 = list(top5_country.index)


In [41]:
dftop5_country = df1[df1['Country'].isin(list_top5)]
dftop5_country.sample(2)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type
247,"Room in Wilmington, New York, US",My Adirondack Home - Very Scenic,Jun 5 - 10,107.0,1 double bed,4.96,520,2018-06-05,2018-06-10,5,Room,Wilmington,US,New York,1,double
3,"Home in Blue Ridge, Georgia, US",*Summer Sizzle* 5 Min to Blue Ridge* Pets* Hot...,Jun 11 - 16,192.0,5 beds,4.94,88,2018-06-11,2018-06-16,5,Home,Blue Ridge,US,Georgia,5,bed


In [42]:
# Here i created a new column called top5 with dummy variables.

df1['Top5_countries'] = df1['Country'].apply(lambda x: 1 if x in list_top5 else 0)
df1.sample(5)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type,Top5_countries
550,"Place to stay in Tabanan, Indonesia",Ocean View and Private Pool at Balian Beach,Jul 23 - 28,116.0,1 queen bed,4.82,122,2018-07-23,2018-07-28,5,Place to stay,Tabanan,Indonesia,,1,queen,1
805,Condo in Panama City Beach,300 STEPS TO THE BEACH-BEST 1 BEDROOM DEAL IN ...,May 1 - 6,169.0,2 beds,4.68,57,2018-05-01,2018-05-06,5,Condo,Panama City Beach,US,,2,bed,1
274,"Hut in Oxfordshire, UK",Rectory Farm Hideaway,Aug 28 - Sep 2,178.0,2 double beds,4.93,535,2018-08-28,2018-09-02,5,Hut,Oxfordshire,UK,,2,double,0
485,"Boutique hotel in Ko Samui, Thailand",Pool Villa by Humble,Jun 15 - 20,63.0,1 king bed,4.93,105,2018-06-15,2018-06-20,5,Boutique hotel,Ko Samui,Thailand,,1,king,1
12,"Dome in Joshua Tree, California, US",Dome in the Desert in Joshua Tree,Jun 1 - 6,268.0,1 king bed,4.94,944,2018-06-01,2018-06-06,5,Dome,Joshua Tree,US,California,1,king,1


In [43]:
premium_qualities = ['view', 'sky', 'ocean', 'pool', 'river', 'private', 'island', 'beach', 'luxury', 'villa', 'beachfront', 'oasis', 'water', 'lake']
df1['Premium qualities'] = (df1['Detail'].str.lower().str.contains('|'.join(premium_qualities), case=False, regex=True) | df1['Title'].str.lower().str.contains('|'.join(premium_qualities), case=False, regex=True)).astype(int)


In [44]:
df1.sample(20)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type,Top5_countries,Premium qualities
230,"Condo in Luquillo, Puerto Rico",AHHH-MAZING VIEWS! Beach Front Apt.,Sep 6 - 12,121.0,2 beds,4.88,490,2018-09-06,2018-09-12,6,Condo,Luquillo,Puerto Rico,,2,bed,0,1
773,Home in Orlando,The Orlando Retreat.,May 1 - 6,230.0,4 beds,4.95,78,2018-05-01,2018-05-06,5,Home,Orlando,US,,4,bed,1,0
560,"Home in Kecamatan Selat, Indonesia",The Nude House,Jul 10 - 15,305.0,2 beds,4.68,19,2018-07-10,2018-07-15,5,Home,Kecamatan Selat,Indonesia,,2,bed,1,0
325,"Cave in Mesaria, Greece",Luxury Triple cave room,Jun 12 - 17,129.0,3 beds,4.72,32,2018-06-12,2018-06-17,5,Cave,Mesaria,Greece,,3,bed,0,1
168,"Apartment in Gordonville, Pennsylvania, US",“What a wonderful world” - Modern Farmhouse,Jun 4 - 9,165.0,1 queen bed,4.87,478,2018-06-04,2018-06-09,5,Apartment,Gordonville,US,Pennsylvania,1,queen,1,0
852,Guesthouse in Los Angeles,Private Loft In the Hollywood Hills (by Univer...,May 1 - 6,132.0,1 queen bed,4.87,981,2018-05-01,2018-05-06,5,Guesthouse,Los Angeles,US,,1,queen,1,1
373,"Guesthouse in Jonstorp, Sweden",Gäststuga Knorrasjön,Jun 12 - 17,95.0,2 double beds,,0,2018-06-12,2018-06-17,5,Guesthouse,Jonstorp,Sweden,,2,double,0,0
547,"Place to stay in Kecamatan Kuta Selatan, Indon...",Gypsea Bali Bungalow with private bathroom and AC,Sep 11 - 16,155.0,1 king bed,4.92,78,2018-09-11,2018-09-16,5,Place to stay,Kecamatan Kuta Selatan,Indonesia,,1,king,1,1
408,"Cabin in County Wicklow, Ireland",House with breathtaking views!,Jul 16 - 21,222.0,3 beds,4.97,104,2018-07-16,2018-07-21,5,Cabin,County Wicklow,Ireland,,3,bed,0,1
832,Cottage in Islamorada,1 BD PRIVATE HOUSEBOAT/ COTTAGE TARPON,May 1 - 6,229.0,3 beds,4.85,74,2018-05-01,2018-05-06,5,Cottage,Islamorada,US,,3,bed,1,1


In [45]:
premium_bed = ['queen', 'double', 'king']
df1['luxury bed'] = df1['Bed type'].isin(premium_bed).astype(int)
df1.sample(20)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type,Top5_countries,Premium qualities,luxury bed
195,"Cabin in Woodford, Vermont, US",400+ Airbnb Visits: Incredible Mountain Cabin,Jun 4 - 9,137.0,2 double beds,4.85,325,2018-06-04,2018-06-09,5,Cabin,Woodford,US,Vermont,2,double,1,0,1
519,"Hotel in Kecamatan Ubud, Indonesia",Stunning Room Near Ubud Market - Outdoor Pool!!,Sep 14 - 20,44.0,1 bed,4.78,358,2018-09-14,2018-09-20,6,Hotel,Kecamatan Ubud,Indonesia,,1,bed,1,1,0
856,Cottage in Rockaway Beach,Storybook Beach Cottage Twin Rocks,May 1 - 6,144.0,4 beds,4.88,259,2018-05-01,2018-05-06,5,Cottage,Rockaway Beach,US,,4,bed,1,1,0
20,"Treehouse in San Carlos, Costa Rica",Rainforest Tree House with Hot Springs,Aug 31 - Sep 5,132.0,3 beds,4.84,763,2018-08-31,2018-09-05,5,Treehouse,San Carlos,Costa Rica,,3,bed,0,0,0
155,"Room in Acapulco, Mexico",Beach crab.,Jun 1 - 6,43.0,3 beds,4.74,270,2018-06-01,2018-06-06,5,Room,Acapulco,Mexico,,3,bed,1,1,0
69,"Guesthouse in Peachland, Canada",Romantic lakefront estate with lake view & deck,Jun 2 - 7,271.0,2 beds,5.0,311,2018-06-02,2018-06-07,5,Guesthouse,Peachland,Canada,,2,bed,1,1,0
765,Home in Summertown,Water side Cozy cabin,May 1 - 6,98.0,4 beds,4.91,318,2018-05-01,2018-05-06,5,Home,Summertown,US,,4,bed,1,1,0
150,"Farm stay in Siler City, North Carolina, US",Lovely farm cabin experience,Jun 11 - 16,114.0,2 beds,5.0,5,2018-06-11,2018-06-16,5,Farm stay,Siler City,US,North Carolina,2,bed,1,0,0
441,"Vacation home in Rocchetta Nervina, Italy",The Loggia,Jun 2 - 7,169.0,1 double bed,4.85,61,2018-06-02,2018-06-07,5,Vacation home,Rocchetta Nervina,Italy,,1,double,0,0,1
406,"Home in Vaiano, Italy",Casale La Quercia - Tuscany country house,Jun 21 - 26,111.0,2 beds,4.94,530,2018-06-21,2018-06-26,5,Home,Vaiano,Italy,,2,bed,0,0,0


In [46]:
# This was to create a dummy variable for most expensive countries

top_expensive_country = [ 'italy' , 'australia', 'uk', 'us' ,'bhutan', 'japan','singapore' ,'sweden' ,'denmark', 'iceland', 'norway', 'switzerland' ,'canada']

df1['expensive_country'] = df1['Country'].str.lower().isin(top_expensive_country).astype(int)
df1.sample(20)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type,Top5_countries,Premium qualities,luxury bed,expensive_country
690,"Apartment in Tambon Patong, Thailand",1 BR Phuket Modern Luxury Living,Jun 6 - 11,44.0,1 king bed,4.68,202,2018-06-06,2018-06-11,5,Apartment,Tambon Patong,Thailand,,1,king,1,1,1,0
731,Home in Austin,Enjoy the Heated Pool at a Beautifully Designe...,May 1 - 6,213.0,1 king bed,4.96,853,2018-05-01,2018-05-06,5,Home,Austin,US,,1,king,1,1,1,1
250,"Room in Cancún, Mexico","Surf, family & friends! Tropical Room in downtown",Jun 3 - 8,51.0,1 double bed,4.95,86,2018-06-03,2018-06-08,5,Room,Cancún,Mexico,,1,double,1,0,1,0
238,"Room in Kissimmee, Florida, US",Disney/Universal Home Away from Home!,Aug 20 - 25,162.0,1 king bed,4.95,221,2018-08-20,2018-08-25,5,Room,Kissimmee,US,Florida,1,king,1,0,1,1
757,Cabin in Moab,Basic Cabin #1 - Sleeps 4 - OK-C1,May 6 - 11,125.0,3 beds,4.86,233,2018-05-06,2018-05-11,5,Cabin,Moab,US,,3,bed,1,0,0,1
234,"Apartment in Miami Beach, Florida, US","Modern 1/1, Half a Block from The Ocean",Jun 12 - 17,140.0,2 beds,4.75,308,2018-06-12,2018-06-17,5,Apartment,Miami Beach,US,Florida,2,bed,1,1,0,1
834,Hotel in Kissimmee,Modern room in boutique hotel near amusement p...,May 1 - 6,64.0,2 beds,4.81,54,2018-05-01,2018-05-06,5,Hotel,Kissimmee,US,,2,bed,1,0,0,1
798,Apartment in Santa Monica,Venice Canals Sanctuary,May 1 - 6,374.0,2 beds,4.93,107,2018-05-01,2018-05-06,5,Apartment,Santa Monica,US,,2,bed,1,0,0,1
732,Cabin in Livingston,10-Acre Yellowstone Cabin w/Stunning Mtn View,May 1 - 6,222.0,4 beds,4.93,101,2018-05-01,2018-05-06,5,Cabin,Livingston,US,,4,bed,1,1,0,1
820,Apartment in Sarasota,Private Bay area home !,May 1 - 6,117.0,3 beds,4.67,3,2018-05-01,2018-05-06,5,Apartment,Sarasota,US,,3,bed,1,1,0,1


# Machine learning 

In [47]:
df1.head(1)

Unnamed: 0,Title,Detail,Date,Price(in dollar),Number of bed,Rating,Number_of_reviews,Start Date,New End Date,Date Difference,Facility,City,Country,State,Ocupancy,Bed type,Top5_countries,Premium qualities,luxury bed,expensive_country
0,"Chalet in Skykomish, Washington, US",Sky Haus - A-Frame Cabin,Jun 11 - 16,306.0,4 beds,4.85,531,2018-06-11,2018-06-16,5,Chalet,Skykomish,US,Washington,4,bed,1,1,0,1


In [48]:
df2 = df1.drop(['Title', 'State', 'Detail', 'Date', 'Number of bed', 'Start Date', 'New End Date', 'Facility', 'City', 'Country', 'Bed type'], axis=1)
df2.sample (10)

Unnamed: 0,Price(in dollar),Rating,Number_of_reviews,Date Difference,Ocupancy,Top5_countries,Premium qualities,luxury bed,expensive_country
244,115.0,4.52,92,5,1,1,1,1,0
938,178.0,4.88,253,5,3,1,1,0,1
853,260.0,5.0,19,5,2,1,0,0,1
783,246.0,4.78,9,5,1,1,1,0,1
591,26.0,4.27,56,5,3,0,1,0,0
431,20.0,,0,7,4,0,0,0,0
219,99.0,4.86,182,5,1,1,0,1,1
312,69.0,4.94,84,5,3,0,0,0,1
781,82.0,4.2,148,5,2,1,0,0,1
478,122.0,4.57,115,5,2,1,1,1,0


In [49]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Price(in dollar)   953 non-null    float64
 1   Rating             931 non-null    float64
 2   Number_of_reviews  953 non-null    int32  
 3   Date Difference    953 non-null    int64  
 4   Ocupancy           953 non-null    int32  
 5   Top5_countries     953 non-null    int64  
 6   Premium qualities  953 non-null    int32  
 7   luxury bed         953 non-null    int32  
 8   expensive_country  953 non-null    int32  
dtypes: float64(2), int32(5), int64(2)
memory usage: 48.5 KB


In [50]:
dependent_variable = 'Price(in dollar)'

In [51]:
independent_variable = df2.columns.tolist()

In [52]:
independent_variable.remove(dependent_variable)

In [53]:
independent_variable

['Rating',
 'Number_of_reviews',
 'Date Difference',
 'Ocupancy',
 'Top5_countries',
 'Premium qualities',
 'luxury bed',
 'expensive_country']

### Creating the data set for X and Y

In [54]:
X = df2[independent_variable].values
y = df2[dependent_variable].values

In [55]:
# spliting the data set into training and test 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [56]:
# Transforming the data 
imputer = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()

X_train = imputer.fit_transform(X_train)
X_train = scaler.fit_transform(X_train)

X_test = imputer.transform(X_test)
X_test = scaler.transform(X_test)

In [57]:
# This is to see the forst 10 rows of the training data

X_train[0:10]

array([[0.93984962, 0.36077482, 0.        , 0.04761905, 0.        ,
        0.        , 0.        , 0.        ],
       [0.93984962, 0.35270379, 0.        , 0.23809524, 0.        ,
        1.        , 0.        , 0.        ],
       [0.94736842, 0.02421308, 0.        , 0.        , 1.        ,
        1.        , 0.        , 1.        ],
       [0.7443609 , 0.05488297, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.79699248, 0.20096852, 0.        , 0.0952381 , 1.        ,
        1.        , 0.        , 1.        ],
       [1.        , 0.07909605, 0.        , 0.        , 1.        ,
        1.        , 1.        , 0.        ],
       [0.94736842, 0.01129944, 0.        , 0.0952381 , 1.        ,
        1.        , 0.        , 1.        ],
       [0.58646617, 0.15092817, 0.        , 0.0952381 , 0.        ,
        1.        , 0.        , 0.        ],
       [0.92481203, 0.13397902, 1.        , 0.04761905, 0.        ,
        0.        , 1.      

In [58]:
# Fitting the mult. linear regression to the Training data

regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [59]:
y_pred = regressor.predict(X_test)

In [60]:
math.sqrt(mean_squared_error(y_test, y_pred))

153.19209776723582

In [61]:
r2_score(y_test, y_pred)

0.16839627678152502

In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Create a Random Forest regressor object
rf_regressor = RandomForestRegressor()

# Fit the model to the training data
rf_regressor.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_regressor.predict(X_test)

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred)

print("R-squared score:", r2)

R-squared score: 0.21515338635478365


In [63]:
from sklearn.model_selection import train_test_split

features_list = ['Top5_countries', 'Ocupancy', 'Top5_countries', 'Premium qualities', 'luxury bed', 'expensive_country']
outcome = 'Price(in dollar)'

y = df1[outcome]
X = df1[features_list]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

from sklearn.linear_model import LinearRegression
reg = LinearRegression()

reg.fit(X, y)

reg.score(X_test, y_test)

0.15522931015720776