# Project 3: Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Dataframe preparation

In [2]:
hotels = pd.read_csv('data/hotels.csv')
hotels.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643
3,216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,34,9/22/2015,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,No Negative,0,607,Friendly staff quiet comfortable room spotles...,11,8,10.0,"[' Leisure trip ', ' Solo traveler ', ' Standa...",681 day,48.888697,2.39454
4,Molenwerf 1 1014 AG Amsterdam Netherlands,914,3/5/2016,8.5,Golden Tulip Amsterdam West,Poland,Torn sheets,4,7586,The staff was very friendly and helpful Break...,20,10,9.6,"[' Business trip ', ' Couple ', ' Standard Dou...",516 day,52.385601,4.84706


In [3]:
# X - features for training, y - target (hotel rating) 
X = hotels.drop(['reviewer_score'], axis = 1)
y = hotels['reviewer_score']

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 16 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                             386803 non-null  object 
 

## Feature engineering

### Hotel Address

Extract information about city

In [5]:
X['city'] = X['hotel_address'].apply(lambda x: x.split()[-5] if 'United Kingdom' in x \
    else x.split()[-2])
X['city'].value_counts(True) * 100

city
London       50.871891
Barcelona    11.667955
Paris        11.589879
Amsterdam    11.118321
Vienna        7.543375
Milan         7.208579
Name: proportion, dtype: float64

Encoding

In [6]:
import category_encoders as ce

In [7]:
encoder = ce.OneHotEncoder(cols=['city'], use_cat_names=True)
city_bin = encoder.fit_transform(X['city'])
X = pd.concat([X, city_bin], axis=1)

### Review date

Convert type of data to datetime

In [8]:
X['review_date'] = pd.to_datetime(X['review_date'])
X['review_date'].head()

0   2016-02-19
1   2017-01-12
2   2016-10-18
3   2015-09-22
4   2016-03-05
Name: review_date, dtype: datetime64[ns]

Extract information about year, month, day of week, day of month

In [9]:
X['review_year'] = X['review_date'].dt.year
X['review_month'] = X['review_date'].dt.month
X['review_day_week'] = X['review_date'].dt.day_of_week
X['review_day_month'] = X['review_date'].dt.day
X[['review_year', 'review_month', 'review_day_week', 'review_day_month']].head()

Unnamed: 0,review_year,review_month,review_day_week,review_day_month
0,2016,2,4,19
1,2017,1,3,12
2,2016,10,1,18
3,2015,9,1,22
4,2016,3,5,5


### Reviewer nationality

In [10]:
X['reviewer_nationality'].nunique()

225

In [11]:
X['reviewer_nationality'].value_counts(True) * 100

reviewer_nationality
 United Kingdom               47.577966
 United States of America      6.861632
 Australia                     4.198261
 Ireland                       2.876400
 United Arab Emirates          1.969219
                                ...    
 Cook Islands                  0.000259
 Guinea                        0.000259
 Comoros                       0.000259
 Anguilla                      0.000259
 Grenada                       0.000259
Name: proportion, Length: 225, dtype: float64

Create new features representing most common nationalities

In [12]:
X['reviewer_from_UK'] = X['reviewer_nationality'].apply(lambda x: 1 if x == 'United Kingdom' else 0)

### Hotel name

In [13]:
X['hotel_name'].nunique()

1492

Hotel names are completely unique, not much information can be extracted here.

### Negative review

In [14]:
X['negative_review'].nunique()

248828

Some preprocessing operations

In [15]:
# Delete redundant spaces and convert text to lowercase
X['negative_review'] = X['negative_review'].apply(lambda x: x.strip(' ').lower())

In [16]:
# Deal with 's and 't wrong interpretation
X['negative_review'] = X['negative_review'].apply(lambda x: x.replace(' t ', 't '))
X['negative_review'] = X['negative_review'].apply(lambda x: x.replace(' s ', 's '))

In [17]:
# Word counter in negative reviews
X['neg_review_wc'] = X['negative_review'].apply(lambda x: len(re.findall(r"[\w']+", x)))
X[['negative_review', 'neg_review_wc']].head()

Unnamed: 0,negative_review,neg_review_wc
0,leaving,1
1,poor breakfast,2
2,no kettle in room,4
3,no negative,2
4,torn sheets,2


In [18]:
fig_neg_wc = px.histogram(
    X['neg_review_wc'],
    height=600,
    width=1000,
    title="Number of words in negative reviews"
)
fig_neg_wc.update_layout(showlegend=False)
fig_neg_wc.update_yaxes(minor_showgrid=True)
fig_neg_wc.layout.xaxis.title.text = 'Number of words'
fig_neg_wc.layout.yaxis.title.text = 'Frequency'
fig_neg_wc.write_html('images/neg_wc.html')
fig_neg_wc.write_image('images/neg_wc.png')
#fig_neg_wc.show()

Overwhelming number of reviews consists of very short number of words. Such reviews usually contain no negative content at all. Those of reviews which have long duration typically describe some features displeasing the reviewer.

In [19]:
neg_review_list = X['negative_review'].apply(lambda x: x.split())

The main idea is to match 20 most popular words in negative reviews that can affect the hotel rating.

In [20]:
neg_review_list.head()

0                 [leaving]
1         [poor, breakfast]
2    [no, kettle, in, room]
3            [no, negative]
4            [torn, sheets]
Name: negative_review, dtype: object

In [21]:
neg_words_list = neg_review_list.explode()

In [22]:
neg_words_list.head()

0      leaving
1         poor
1    breakfast
2           no
2       kettle
Name: negative_review, dtype: object

In [23]:
neg_words_cnt = pd.value_counts(neg_words_list)
neg_words_cnt.head(20)

negative_review
the         398192
was         177297
a           172589
to          171717
and         164233
no          148333
room        131762
in          126317
negative     97092
not          94356
i            91828
of           90435
for          88102
we           74440
it           73800
is           61684
very         60354
but          56657
hotel        55303
on           51545
Name: count, dtype: int64

In [24]:
neg_words_cnt.drop(['the', 'was', 'a', 'to', 'and', 'no', 'in', 'negative', 'not', 'i', 'of', 'for', 'it', 'we', 'is', \
    'very', 'but', 't'], inplace=True)

In [25]:
neg_words_cnt.head(20)

negative_review
room         131762
hotel         55303
on            51545
were          46211
at            45899
had           45187
breakfast     43514
that          42439
have          41054
with          38967
small         37369
be            33941
there         33833
as            33012
they          31853
you           30422
from          30079
this          29789
staff         29605
so            29575
Name: count, dtype: int64

In [26]:
neg_words_cnt.drop(['hotel', 'on', 'were', 'at', 'had', 'that', 'have', 'with', 'there', 'be', 'as', 'they', \
    'you', 'from', 'this', 'so'], inplace=True)

In [27]:
neg_words_cnt.head(20)

negative_review
room         131762
breakfast     43514
small         37369
staff         29605
my            29134
nothing       29011
rooms         26424
our           25498
would         24315
could         24015
when          23213
bed           22270
are           21973
all           21761
too           21585
didnt         21438
only          21034
one           20993
bit           20583
out           20232
Name: count, dtype: int64

In [28]:
neg_words_cnt.drop(['my', 'rooms', 'our', 'would', 'could', 'when', 'are', 'all', 'too', 'one', 'only', 'bit', 'out'], inplace=True)

In [29]:
neg_words_cnt.head(20)

negative_review
room         131762
breakfast     43514
small         37369
staff         29605
nothing       29011
bed           22270
didnt         21438
bathroom      19976
which         19549
night         17828
like          16799
little        16779
or            16696
if            16059
shower        15956
good          15577
an            15566
been          15328
us            15321
more          15236
Name: count, dtype: int64

In [30]:
neg_words_cnt.drop(['didnt', 'which', 'night', 'or', 'if', 'an', 'been', 'us', 'more'], inplace=True)

In [31]:
neg_words_cnt.head(20)

negative_review
room         131762
breakfast     43514
small         37369
staff         29605
nothing       29011
bed           22270
bathroom      19976
like          16799
little        16779
shower        15956
good          15577
did           15097
get           14707
up            14690
service       14513
bar           14414
me            13811
stay          13158
time          13125
expensive     12510
Name: count, dtype: int64

In [32]:
neg_words_cnt.drop(['did', 'get', 'up', 'me', 'stay', 'time'], inplace=True)

In [33]:
neg_words_cnt.head(20)

negative_review
room         131762
breakfast     43514
small         37369
staff         29605
nothing       29011
bed           22270
bathroom      19976
like          16799
little        16779
shower        15956
good          15577
service       14513
bar           14414
expensive     12510
reception     12509
really        12378
also          12373
just          12282
some          12275
poor          12103
Name: count, dtype: int64

In [34]:
neg_words_cnt.drop(['really', 'also', 'just', 'some'], inplace=True)

In [35]:
neg_words_cnt.head(20)

negative_review
room         131762
breakfast     43514
small         37369
staff         29605
nothing       29011
bed           22270
bathroom      19976
like          16799
little        16779
shower        15956
good          15577
service       14513
bar           14414
expensive     12510
reception     12509
poor          12103
even          11580
price         11560
by            11540
check         11495
Name: count, dtype: int64

In [36]:
neg_words_cnt.drop(['even', 'by', 'check'], inplace=True)

In [37]:
neg_words_cnt.head(20)

negative_review
room         131762
breakfast     43514
small         37369
staff         29605
nothing       29011
bed           22270
bathroom      19976
like          16799
little        16779
shower        15956
good          15577
service       14513
bar           14414
expensive     12510
reception     12509
poor          12103
price         11560
day           11303
wasnt         11285
floor         11190
Name: count, dtype: int64

In [38]:
neg_words_cnt.drop(['can'], inplace=True)

In [39]:
negative_tags = list(neg_words_cnt[:20].index)

Generate some new features that marks presence or absence of word in review.

In [40]:
for tag in negative_tags:
    X['neg_is_' + tag] = X['negative_review'].apply(lambda x: 1 if tag in x else 0)

### Positive review

In [41]:
X['positive_review'].nunique()

311737

Preprocessing

In [42]:
X['positive_review'] = X['positive_review'].apply(lambda x: x.strip(' ').lower())

In [43]:
X['positive_review'] = X['positive_review'].apply(lambda x: x.replace(' t ', 't '))
X['positive_review'] = X['positive_review'].apply(lambda x: x.replace(' s ', 's '))

Word counter

In [44]:
X['pos_review_wc'] = X['positive_review'].apply(lambda x: len(re.findall(r"[\w']+", x)))
X[['positive_review', 'pos_review_wc']].head()

Unnamed: 0,positive_review,pos_review_wc
0,staff were amazing,3
1,location,1
2,no positive,2
3,friendly staff quiet comfortable room spotless...,9
4,the staff was very friendly and helpful breakf...,18


In [45]:
fig_pos_wc = px.histogram(
    X['pos_review_wc'],
    height=600,
    width=1000,
    title="Number of words in positive reviews"
)
fig_pos_wc.update_layout(showlegend=False)
fig_pos_wc.update_yaxes(minor_showgrid=True)
fig_pos_wc.layout.xaxis.title.text = 'Number of words'
fig_pos_wc.layout.yaxis.title.text = 'Frequency'
fig_pos_wc.write_html('images/pos_wc.html')
fig_pos_wc.write_image('images/pos_wc.png')
#fig_pos_wc.show()

The dependance is more smooth than that of negative reviews. Again most of reviews has very short duration, most of them admits quality of service. Long reviews tend to mark some insignificant flaws that can be fixed further.

In [46]:
pos_review_list = X['positive_review'].apply(lambda x: x.split())

In [47]:
pos_words_list = pos_review_list.explode()

Same concept as in negative reviews.

In [48]:
pos_words_cnt = pd.value_counts(pos_words_list)
pos_words_cnt.head(20)

positive_review
the          385768
and          315786
was          177057
staff        145804
very         144568
location     144555
to           140842
a            123577
room         105632
hotel         93507
in            85660
good          84479
of            79837
great         79156
is            76764
for           68502
were          68308
friendly      63869
breakfast     63297
helpful       57200
Name: count, dtype: int64

In [49]:
pos_words_cnt.drop(['the', 'and', 'was', 'very', 'to', 'a', 'in', 'of', 'is', 'for', 'were', 'we', 'with'], inplace=True)

In [50]:
pos_words_cnt.head(20)

positive_review
staff          145804
location       144555
room           105632
hotel           93507
good            84479
great           79156
friendly        63869
breakfast       63297
helpful         57200
nice            52026
clean           50249
excellent       46758
i               45769
comfortable     44843
it              37819
bed             37446
no              33827
from            32746
rooms           30345
at              27753
Name: count, dtype: int64

In [51]:
pos_words_cnt.drop(['i', 'it', 'no', 'from', 'rooms', 'at', 'on', 'all', 'you', 'are', 'stay', 'our', \
    'but', 'really', 'had', 'this', 'so', 'as', 'everything'], inplace=True)

In [52]:
pos_words_cnt.head(20)

positive_review
staff          145804
location       144555
room           105632
hotel           93507
good            84479
great           79156
friendly        63869
breakfast       63297
helpful         57200
nice            52026
clean           50249
excellent       46758
comfortable     44843
bed             37446
positive        27288
lovely          26269
close           23217
station         21952
perfect         19397
service         19392
Name: count, dtype: int64

In [53]:
pos_words_cnt.rename({'comfortable': 'comf'}, inplace=True)

In [54]:
positive_tags = list(pos_words_cnt[:20].index)

Generate some features marking presence of absence of word in positive review.

In [55]:
for tag in positive_tags:
    X['pos_is_' + tag] = X['positive_review'].apply(lambda x: 1 if tag in x else 0)

### Days since review

Match number of days from the feature.

In [56]:
X['reviewed_days_ago'] = X['days_since_review'].apply(lambda x: int(x.split()[0]))
X['reviewed_days_ago'].value_counts()

reviewed_days_ago
1      1911
322    1738
120    1702
338    1462
534    1451
       ... 
122     154
243     154
615     146
124     127
123     111
Name: count, Length: 731, dtype: int64

### Tags

Some preprocessing operations

In [57]:
# Delete square brackets and single quote then split the string into list of words
X['tags_list'] = X['tags'].apply(lambda x: re.sub('[\[\]\']', '', x).split(', '))
X['tags_list'].head()

0    [ Leisure trip ,  Couple ,  Studio Suite ,  St...
1    [ Business trip ,  Couple ,  Standard Double R...
2    [ Leisure trip ,  Solo traveler ,  Modern Doub...
3    [ Leisure trip ,  Solo traveler ,  Standard Ro...
4    [ Business trip ,  Couple ,  Standard Double o...
Name: tags_list, dtype: object

In [58]:
tags_flat = X['tags_list'].explode()

In [59]:
tag_list = list(tags_flat.value_counts().index)
tag_list

[' Leisure trip ',
 ' Submitted from a mobile device ',
 ' Couple ',
 ' Stayed 1 night ',
 ' Stayed 2 nights ',
 ' Solo traveler ',
 ' Stayed 3 nights ',
 ' Business trip ',
 ' Group ',
 ' Family with young children ',
 ' Stayed 4 nights ',
 ' Double Room ',
 ' Standard Double Room ',
 ' Superior Double Room ',
 ' Family with older children ',
 ' Deluxe Double Room ',
 ' Double or Twin Room ',
 ' Stayed 5 nights ',
 ' Standard Double or Twin Room ',
 ' Classic Double Room ',
 ' Superior Double or Twin Room ',
 ' 2 rooms ',
 ' Stayed 6 nights ',
 ' Standard Twin Room ',
 ' Single Room ',
 ' Twin Room ',
 ' Stayed 7 nights ',
 ' Executive Double Room ',
 ' Classic Double or Twin Room ',
 ' Superior Twin Room ',
 ' Club Double Room ',
 ' Deluxe Double or Twin Room ',
 ' Queen Room ',
 ' Deluxe King Room ',
 ' Superior Queen Room ',
 ' Standard Single Room ',
 ' Junior Suite ',
 ' Triple Room ',
 ' Classic Room ',
 ' Superior Room ',
 ' Superior King Room ',
 ' Standard Room ',
 ' Deluxe R

In [60]:
# Filter tags containing 'night'
night_flat = [item for item in tags_flat if ' night' in item]
pd.value_counts(night_flat)

 Stayed 1 night                                 145373
 Stayed 2 nights                                100263
 Stayed 3 nights                                 72000
 Stayed 4 nights                                 35748
 Stayed 5 nights                                 15611
 Stayed 6 nights                                  7399
 Stayed 7 nights                                  5549
 Stayed 8 nights                                  1910
 Stayed 9 nights                                   966
 Stayed 10 nights                                  663
 Stayed 11 nights                                  306
 Stayed 12 nights                                  217
 Stayed 14 nights                                  184
 Stayed 13 nights                                  174
 Stayed 15 nights                                   87
 Stayed 16 nights                                   38
 Stayed 17 nights                                   27
 Stayed 18 nights                                   24
 Stayed 19

In [61]:
# Filter 'night tags' containing digits
night_flat = list(filter(lambda x: bool(re.search(r'\d', x)), night_flat))
pd.value_counts(night_flat)

 Stayed 1 night                                 145373
 Stayed 2 nights                                100263
 Stayed 3 nights                                 72000
 Stayed 4 nights                                 35748
 Stayed 5 nights                                 15611
 Stayed 6 nights                                  7399
 Stayed 7 nights                                  5549
 Stayed 8 nights                                  1910
 Stayed 9 nights                                   966
 Stayed 10 nights                                  663
 Stayed 11 nights                                  306
 Stayed 12 nights                                  217
 Stayed 14 nights                                  184
 Stayed 13 nights                                  174
 Stayed 15 nights                                   87
 Stayed 16 nights                                   38
 Stayed 17 nights                                   27
 Stayed 18 nights                                   24
 Stayed 19

Number of reviews who hasn't tag indicating number of nights stayed:

In [62]:
X['tags'].shape[0] - pd.value_counts(night_flat).sum()

140

Percent of reviews who hasn't tag indicating number of nights stayed:

In [63]:
round((1 - pd.value_counts(night_flat).sum() / X['tags'].shape[0]) * 100, 2)

0.04

Since we have only 0.04\% of reviews that doesn't indicate number of nights stayed, we can fill missing values.

In [64]:
night_list = list(pd.value_counts(night_flat).index)

In [65]:
def nights_num(feat):
    """Extract number of nights stayed or return np.nan otherwise

    Args:
        feat (pandas.Series): 'nights_stayed' feature

    Returns:
        stay_string (float): number of nights stayed or np.nan
    """
    
    for tag in feat:
        stay_string = np.nan
        
        if tag in night_list:
            stay_string = int(''.join(re.findall(r'\d+', tag)))
            break
    
    return stay_string

In [66]:
X['nights_stayed'] = X['tags_list'].apply(nights_num)
X['nights_stayed'].isna().value_counts()

nights_stayed
False    386657
True        146
Name: count, dtype: int64

In [67]:
fig_nights = px.histogram(
    X['nights_stayed'].dropna().astype(int),
    height=600,
    width=1000,
    title="Nights stayed histogram"
)
fig_nights.update_layout(showlegend=False)
fig_nights.update_yaxes(minor_showgrid=True)
fig_nights.layout.xaxis.title.text = 'Nights stayed'
fig_nights.layout.yaxis.title.text = 'Number of reviews'
fig_nights.write_html('images/nights.html')
fig_nights.write_image('images/nights.png')
#fig_nights.show()

The dependance is obviously far from normal, so missing values can be filled with median values.

In [68]:
X['nights_stayed'] = X['nights_stayed'].fillna(X['nights_stayed'].dropna().median()).astype(int)
X['nights_stayed'].head()

0    2
1    1
2    3
3    1
4    6
Name: nights_stayed, dtype: int64

Same concept as in negative reviews. Match 20 most popular tags.

In [69]:
pop_tags_list = [item for item in tag_list if ' night' not in item][:20]
display(pop_tags_list)

[' Leisure trip ',
 ' Submitted from a mobile device ',
 ' Couple ',
 ' Solo traveler ',
 ' Business trip ',
 ' Group ',
 ' Family with young children ',
 ' Double Room ',
 ' Standard Double Room ',
 ' Superior Double Room ',
 ' Family with older children ',
 ' Deluxe Double Room ',
 ' Double or Twin Room ',
 ' Standard Double or Twin Room ',
 ' Classic Double Room ',
 ' Superior Double or Twin Room ',
 ' 2 rooms ',
 ' Standard Twin Room ',
 ' Single Room ',
 ' Twin Room ']

Generate some features marking presence of absence of tag in tag list of review.

In [70]:
for tag in pop_tags_list:
    X['is' + tag.rstrip()] = X['tags_list'].apply(lambda x: 1 if tag in x else 0)

### Coordinates

In [71]:
from geopy.geocoders import Nominatim

Array of addresses that lack coordinates

In [72]:
coordinates_na = X[X['lng'].isna() | X['lat'].isna()]['hotel_address'].unique()
coordinates_na

array(['Savoyenstra e 2 16 Ottakring 1160 Vienna Austria',
       '23 Rue Damr mont 18th arr 75018 Paris France',
       'Josefst dter Stra e 10 12 08 Josefstadt 1080 Vienna Austria',
       'W hringer Stra e 33 35 09 Alsergrund 1090 Vienna Austria',
       '4 rue de la P pini re 8th arr 75008 Paris France',
       'Sieveringer Stra e 4 19 D bling 1190 Vienna Austria',
       'Taborstra e 8 A 02 Leopoldstadt 1020 Vienna Austria',
       'Bail n 4 6 Eixample 08010 Barcelona Spain',
       'Gr nentorgasse 30 09 Alsergrund 1090 Vienna Austria',
       'Landstra er G rtel 5 03 Landstra e 1030 Vienna Austria',
       'Paragonstra e 1 11 Simmering 1110 Vienna Austria',
       'W hringer Stra e 12 09 Alsergrund 1090 Vienna Austria',
       '20 Rue De La Ga t 14th arr 75014 Paris France',
       'Hasenauerstra e 12 19 D bling 1190 Vienna Austria',
       'Sep lveda 180 Eixample 08011 Barcelona Spain',
       'Pau Clar s 122 Eixample 08009 Barcelona Spain',
       'Josefst dter Stra e 22 08 Jos

Correct some mistakes

In [73]:
X['hotel_address'].replace({
    'Savoyenstra e 2 16 Ottakring 1160 Vienna Austria': 'Savoyenstraße 2, 16. Ottakring, 1160 Vienna, Austria',
    '23 Rue Damr mont 18th arr 75018 Paris France': '23, Rue Damrémont, 18e arrondissement, 75018 Paris, France',
    'Josefst dter Stra e 10 12 08 Josefstadt 1080 Vienna Austria': 'Josefstädter Straße 10-12. Josefstadt, 1080 Vienna, Austria',
    'W hringer Stra e 33 35 09 Alsergrund 1090 Vienna Austria': 'Währinger Straße 33-35. Alsergrund, 1090 Vienna, Austria',
    '4 rue de la P pini re 8th arr 75008 Paris France': '4, rue de la Pépinière, 8e arrondissement, 75008 Paris, France',
    'Sieveringer Stra e 4 19 D bling 1190 Vienna Austria': 'Sieveringer Straße 4, 19. Döbling, 1190 Vienna, Austria',
    'Taborstra e 8 A 02 Leopoldstadt 1020 Vienna Austria': 'Taborstraße 8A. Leopoldstadt, 1020 Vienna, Austria',
    'Bail n 4 6 Eixample 08010 Barcelona Spain': 'Bailén, 4-6, Eixample, 08010 Barcelona, Spain',
    'Gr nentorgasse 30 09 Alsergrund 1090 Vienna Austria': 'Grünentorgasse, 30. Alsergrund, 1090 Vienna, Austria',
    'Landstra er G rtel 5 03 Landstra e 1030 Vienna Austria': 'Landstraßer Gürtel 5. Landstraße, 1030 Vienna, Austria',
    'Paragonstra e 1 11 Simmering 1110 Vienna Austria': 'Paragonstraße 1, 11. Simmering, 1110 Vienna, Austria',
    'W hringer Stra e 12 09 Alsergrund 1090 Vienna Austria': 'Währinger Straße 12. Alsergrund, 1090 Vienna, Austria',
    '20 Rue De La Ga t 14th arr 75014 Paris France': '20, Rue De La Gaîté, 14e arrondissement, 75014 Paris, France',
    'Hasenauerstra e 12 19 D bling 1190 Vienna Austria': 'Hasenauerstraße 12, 19. Döbling, 1190 Vienna, Austria',
    'Sep lveda 180 Eixample 08011 Barcelona Spain': 'Sepúlveda 180, Eixample, 08011 Barcelona, Spain',
    'Pau Clar s 122 Eixample 08009 Barcelona Spain': 'Pau Clarís 122, Eixample, 08009 Barcelona, Spain',
    'Josefst dter Stra e 22 08 Josefstadt 1080 Vienna Austria': 'Josefstädter Straße 22. Josefstadt, 1080 Vienna, Austria'
}, inplace=True)

In [74]:
coordinates_na = list(X[X['lng'].isna() | X['lat'].isna()]['hotel_address'].unique())

Generate dictionary with keys as addresses and values as coordinates:

In [75]:
geolocator = Nominatim(user_agent="hotel_locator")

locations_dict = dict()

for item in coordinates_na:
    location = geolocator.geocode(item)
    locations_dict[item] = [location.latitude, location.longitude]

Fill missing coordinates:

In [76]:
for idx in X['hotel_address'].index:
    if X['hotel_address'].iloc[idx] in coordinates_na:
        X['lat'].iloc[idx] = locations_dict[X['hotel_address'].iloc[idx]][0]
        X['lng'].iloc[idx] = locations_dict[X['hotel_address'].iloc[idx]][1]

## Correlation

Remove categorical features:

In [77]:
object_feats = X.select_dtypes(exclude=[np.number]).columns.values
display(object_feats)

array(['hotel_address', 'review_date', 'hotel_name',
       'reviewer_nationality', 'negative_review', 'positive_review',
       'tags', 'days_since_review', 'city', 'tags_list'], dtype=object)

In [78]:
X.drop(object_feats, axis=1, inplace=True)

In [79]:
X_corr = round(X.corr(), 2)

Features with strong correlation:

In [80]:
X_corr_sorted = X_corr.abs().unstack().sort_values(kind='quicksort', ascending=False)
X_corr_sorted[(X_corr_sorted < 1) & (X_corr_sorted >= 0.7)]

review_year                   reviewed_days_ago               0.92
reviewed_days_ago             review_year                     0.92
is Leisure trip               is Business trip                0.90
is Business trip              is Leisure trip                 0.90
lat                           city_Barcelona                  0.84
city_Barcelona                lat                             0.84
additional_number_of_scoring  total_number_of_reviews         0.82
city_Vienna                   lng                             0.82
total_number_of_reviews       additional_number_of_scoring    0.82
lng                           city_Vienna                     0.82
dtype: float64

In [81]:
X.drop(['review_year', 'is Business trip', 'total_number_of_reviews', 'city_Barcelona', \
               'city_Vienna'], axis=1, inplace=True)

## Feature selection

In [82]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [83]:
# Number of features to transfer into the model
feat_num = 30

Most important features:

In [84]:
importance = pd.Series(f_classif(X, y)[0], index=X.columns)
importance.sort_values(inplace=True, ascending=False)

fig_kbest = px.bar(
    importance[:feat_num],
    orientation='v',
    height=600,
    width=1000,
    title='Feature importance'
)
fig_kbest.update_yaxes(minor_showgrid=True)
fig_kbest.update(layout_showlegend=False)
fig_kbest.layout.xaxis.title.text = 'Feature name'
fig_kbest.layout.yaxis.title.text = 'f-statistics value'
fig_kbest.write_html('images/kbest.html')
fig_kbest.write_image('images/kbest.png')
#fig_kbest.show()


Features [14] are constant.


invalid value encountered in divide



Generate dataframe containing most important features:

In [85]:
kbest_classifier = SelectKBest(score_func=f_classif, k=feat_num)
X_kbest = kbest_classifier.fit_transform(X, y)
mask = kbest_classifier.get_support()
kbest_feats = X.columns[mask]
kbest_df = pd.DataFrame(X_kbest, columns=kbest_feats)
kbest_df.head()


Features [14] are constant.


invalid value encountered in divide



Unnamed: 0,average_score,review_total_negative_word_counts,review_total_positive_word_counts,neg_review_wc,neg_is_room,neg_is_breakfast,neg_is_small,neg_is_staff,neg_is_nothing,neg_is_bed,...,pos_is_hotel,pos_is_great,pos_is_friendly,pos_is_helpful,pos_is_excellent,pos_is_comf,pos_is_positive,pos_is_lovely,pos_is_perfect,is Leisure trip
0,8.4,3.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8.3,3.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.9,6.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,7.5,0.0,11.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,8.5,4.0,20.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
from sklearn.model_selection import train_test_split

In [87]:
# Split the data on test and train samples 
X_train, X_test, y_train, y_test = train_test_split(kbest_df, y, test_size=0.25, random_state=42)

In [88]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [89]:
# Model creation 
regr = RandomForestRegressor(n_estimators=100)

In [90]:
# Learn the model on test sample
regr.fit(X_train, y_train)  

In [91]:
# Hotel rating prediction
y_pred = regr.predict(X_test)
# MAPE value
print(f'MAPE: {metrics.mean_absolute_percentage_error(y_test, y_pred) * 100:.3f} %')

MAPE: 13.232 %
