# 0 Begin chapter

## 0.1. install

In [7]:
!python -m pip install pandas pyarrow nltk ratelimit




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip





## 0.2. Import

In [2]:
import pandas as pd
import numpy as np
import sys
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler
from ratelimit import limits
from config import api_key_mapquests
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output
from tqdm.notebook import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [4]:
sys.version
# '3.11.6 (tags/v3.11.6:8b6ee5b, Oct  2 2023, 14:57:12) [MSC v.1935 64 bit (AMD64)]'

'3.11.6 (tags/v3.11.6:8b6ee5b, Oct  2 2023, 14:57:12) [MSC v.1935 64 bit (AMD64)]'

## 0.3. convert csv to parquet

In [5]:
hotels = pd.read_csv('data/hotels.zip')
# hotels.to_parquet('data/hotels_parquet.gzip', engine='pyarrow', compression='gzip')

## 0.4. Functions

In [6]:
hotels = hotels.convert_dtypes()

In [6]:
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  string 
 1   additional_number_of_scoring                386803 non-null  Int64  
 2   review_date                                 386803 non-null  string 
 3   average_score                               386803 non-null  Float64
 4   hotel_name                                  386803 non-null  string 
 5   reviewer_nationality                        386803 non-null  string 
 6   negative_review                             386803 non-null  string 
 7   review_total_negative_word_counts           386803 non-null  Int64  
 8   total_number_of_reviews                     386803 non-null  Int64  
 9   positive_review                             386803 non-null  string 
 

## 0.5. INFO
---
- hotel_address — адрес отеля;
- review_date — дата, когда рецензент разместил соответствующий отзыв;
- average_score — средний балл отеля, рассчитанный на основе последнего комментария за последний год;
- hotel_name — название отеля;
- reviewer_nationality — страна рецензента;
- negative_review — отрицательный отзыв, который рецензент дал отелю;
- review_total_negative_word_counts — общее количество слов в отрицательном отзыв;
- positive_review — положительный отзыв, который рецензент дал отелю;
- review_total_positive_word_counts — общее количество слов в положительном отзыве.
- reviewer_score — оценка, которую рецензент поставил отелю на основе своего опыта;
- total_number_of_reviews_reviewer_has_given — количество отзывов, которые рецензенты дали в прошлом;
- total_number_of_reviews — общее количество действительных отзывов об отеле;
- tags — теги, которые рецензент дал отелю;
- days_since_review — количество дней между датой проверки и датой очистки;
- additional_number_of_scoring — есть также некоторые гости, которые просто поставили оценку сервису, но не оставили отзыв. Это число указывает, сколько там действительных оценок без проверки.
- lat — географическая широта отеля;
- lng — географическая долгота отеля.
---

# 1 Исследование данных

## 1.1. Отделение Категориальных признаков

In [8]:
resolution = hotels.shape[0]
tmp_cols = (
            hotels
            .nunique()
            .sort_values(ascending=False)
            .to_frame(name='count')
            .assign(frec=lambda x: round(100*x['count']/resolution).astype('UInt8'))
            )

cat_cols = (
            tmp_cols
            .query('frec < 20')
            .index
            .to_list()
            )

other_cols = (
                tmp_cols
                .query('frec >= 20')
                .index
                .to_list()
                )

print(f'cat_cols: {len(cat_cols)}; other_cols: {len(other_cols)}')


cat_cols: 15; other_cols: 2


## 1.2. Преобразование Категориальных признаков

### 1.2.1. Преобразование типов Object

In [9]:
object_cols = hotels[cat_cols].select_dtypes(include='string').columns.to_list()
# other_cat_cols = [x for x in cat_cols if x not in object_cols]
other_cat_cols = list(filter(lambda x: x not in object_cols, cat_cols))

hotels[object_cols].head()

Unnamed: 0,tags,hotel_address,hotel_name,days_since_review,review_date,reviewer_nationality
0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",Stratton Street Mayfair Westminster Borough Lo...,The May Fair Hotel,531 day,2/19/2016,United Kingdom
1,"[' Business trip ', ' Couple ', ' Standard Dou...",130 134 Southampton Row Camden London WC1B 5AF...,Mercure London Bloomsbury Hotel,203 day,1/12/2017,United Kingdom
2,"[' Leisure trip ', ' Solo traveler ', ' Modern...",151 bis Rue de Rennes 6th arr 75006 Paris France,Legend Saint Germain by Elegancia,289 day,10/18/2016,China
3,"[' Leisure trip ', ' Solo traveler ', ' Standa...",216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,Mercure Paris 19 Philharmonie La Villette,681 day,9/22/2015,United Kingdom
4,"[' Business trip ', ' Couple ', ' Standard Dou...",Molenwerf 1 1014 AG Amsterdam Netherlands,Golden Tulip Amsterdam West,516 day,3/5/2016,Poland


In [10]:
# Преобразование столбца "tags"
hotels['tags'] = hotels['tags'].str.replace('\'','\"' ).apply(lambda x: json.loads(x))

# Преобразование признака "days_since_review"
hotels['days_since_review'] = hotels['days_since_review'].str.extract('(\d+) day', expand=False).astype('UInt16')

# Преобразование признака "review_date"
hotels['review_date'] = pd.to_datetime(hotels['review_date'])

# Преобразование признака  "reviewer_nationality"
hotels['hotel_name'] = hotels['hotel_name'].str.strip().replace('', np.nan)
category_order_list = sorted([x for x in hotels['hotel_name'].unique() if not pd.isnull(x)])
hotels['hotel_name'] = hotels['hotel_name'].astype('category').cat.set_categories(category_order_list, ordered=True)

# Преобразование признака  "reviewer_nationality"
hotels['reviewer_nationality'] = hotels['reviewer_nationality'].str.strip().replace('', np.nan)
category_order_list = sorted([x for x in hotels['reviewer_nationality'].unique() if not pd.isnull(x)])
hotels['reviewer_nationality'] = hotels['reviewer_nationality'].astype('category').cat.set_categories(category_order_list, ordered=True)

# Преобразование признака "hotel_address"
hotels['hotel_address'] = hotels['hotel_address'].astype('string')

### 1.2.2. Преобразование типов других типов

In [11]:
hotels[other_cat_cols].head()

Unnamed: 0,lat,lng,total_number_of_reviews,additional_number_of_scoring,review_total_negative_word_counts,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,average_score
0,51.507894,-0.143671,1994,581,3,4,7,10.0,8.4
1,51.521009,-0.123097,1361,299,3,2,14,6.3,8.3
2,48.845377,2.325643,406,32,6,0,14,7.5,8.9
3,48.888697,2.39454,607,34,0,11,8,10.0,7.5
4,52.385601,4.84706,7586,914,4,20,10,9.6,8.5


In [12]:
hotels['lat'] = hotels['lat'].astype('Float32')
hotels['lng'] = hotels['lng'].astype('Float32')
hotels['total_number_of_reviews'] = hotels['total_number_of_reviews'].astype('UInt16')
hotels['additional_number_of_scoring'] = hotels['additional_number_of_scoring'].astype('UInt16')
hotels['review_total_negative_word_counts'] = hotels['review_total_negative_word_counts'].astype('UInt16')
hotels['review_total_positive_word_counts'] = hotels['review_total_positive_word_counts'].astype('UInt16')
hotels['total_number_of_reviews_reviewer_has_given'] = hotels['total_number_of_reviews_reviewer_has_given'].astype('UInt16')
hotels['reviewer_score'] = hotels['reviewer_score'].astype('Float32')
hotels['average_score'] = hotels['average_score'].astype('Float32')

In [13]:
hotels[other_cat_cols].head()

Unnamed: 0,lat,lng,total_number_of_reviews,additional_number_of_scoring,review_total_negative_word_counts,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,average_score
0,51.507893,-0.143671,1994,581,3,4,7,10.0,8.4
1,51.521008,-0.123097,1361,299,3,2,14,6.3,8.3
2,48.845379,2.325643,406,32,6,0,14,7.5,8.9
3,48.888699,2.39454,607,34,0,11,8,10.0,7.5
4,52.385601,4.84706,7586,914,4,20,10,9.6,8.5


In [14]:
hotels['negative_review'] = hotels['negative_review'].astype('string')
hotels['positive_review'] = hotels['positive_review'].astype('string')

### 1.2.3. Работа с признаком "Tags"

In [None]:
# a = (
#         hotels['tags']
#         .explode()
#         .str.strip()
#         .value_counts()
#         .to_frame()
#         )
# tags_unique_list = dict(zip(b:=(
#                                 pd.DataFrame(
#                                                 zip(
#                                                         a.index,
#                                                         scaler.fit_transform(a.values).flatten()
#                                                         ),
#                                                 columns=('tags', 'proportion')
#                                             )
#                                 .query('proportion > 0.2')['tags']
#                                 .values
#                                 ),
#                                 [False]*len(b)
#                             ))

In [15]:
tags_unique_list = dict(zip(a:=(
                            hotels['tags']
                            .explode()
                            .str.strip()
                            # .unique()
                            .value_counts()
                            .to_frame()
                            .assign(count=lambda x: x['count']/hotels.shape[0])
                            .query('count > 0.01')
                            .index
                            ),
                            [False]*len(a)
                            ))


 
def get_dummis_table(row: pd.Series, tags_unique_list: dict):
    """
    
    """
    import pandas as pd
    tmp_dict =tags_unique_list.copy()
    for item in list(map(lambda x: x.strip(), row)):
        if item in tmp_dict.keys():
            tmp_dict[item] = True
    return pd.Series(tmp_dict)


hotels = pd.concat((hotels, hotels['tags'].parallel_apply(get_dummis_table, args=(tags_unique_list,))), axis=1)
clear_output()
hotels.shape


(386803, 51)

### 1.2.4. Работа с отзывами.

In [16]:
nltk.download('stopwords')
nltk.download('punkt')

# Удаление пунктуации
def remove_punctuation(text: str):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Убираем стом слова
def remove_stopwords(text: str):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_text)

# Нормализуем окончания
def normalize_endings(text: str):
    from nltk.tokenize import word_tokenize
    from nltk.stem import SnowballStemmer
    
    stemmer = SnowballStemmer('english')
    tokens = word_tokenize(text)
    normalized_text = [stemmer.stem(word) for word in tokens]
    return ' '.join(normalized_text)


hotels['negative_review'] = hotels['negative_review'].progress_apply(remove_punctuation)
clear_output()

hotels['negative_review'] = hotels['negative_review'].parallel_apply(remove_stopwords)
clear_output()

hotels['negative_review'] = hotels['negative_review'].parallel_apply(normalize_endings)
clear_output()

hotels['positive_review'] = hotels['positive_review'].progress_apply(remove_punctuation)
clear_output()

hotels['positive_review'] = hotels['positive_review'].parallel_apply(remove_stopwords)
clear_output()

hotels['positive_review'] = hotels['positive_review'].parallel_apply(normalize_endings)
clear_output()

hotels['positive_review'] = hotels['positive_review'].str.split()
hotels['negative_review'] = hotels['negative_review'].str.split()

In [17]:
review_unique_dict = dict(zip(a:=(
                                pd.concat((hotels['positive_review'], hotels['negative_review']), axis=0, ignore_index=True)
                                .explode(ignore_index=True)
                                # .unique()
                                .value_counts()
                                .to_frame()
                                .assign(count=lambda x: x['count']/hotels.shape[0])
                                .query('count > 0.01')
                                .index
                                ),
                              [False]*len(a)
                              ))



def get_dummis_review_table(row: pd.Series, review_unique_dict: dict):
    import pandas as pd
    tmp_dict = review_unique_dict.copy()
    for item in row['positive_review']:
        if item in tmp_dict.keys():
            tmp_dict[item] = True
    for item in row['negative_review']:
        if item in tmp_dict.keys():
            tmp_dict[item] = True
    return pd.Series(tmp_dict)

hotels = pd.concat((hotels, hotels.parallel_apply(get_dummis_review_table, args=(review_unique_dict,), axis=1)), axis=1)
clear_output()
hotels.shape

(386803, 404)

In [18]:
hotels = (
            hotels
            .drop(columns=(
                            (hotels.iloc[:,17:].sum(axis=0)/hotels.shape[0])
                            .to_frame(name='proportion')
                            .query('proportion < 0.01')
                            .index
                        ))
            # .drop(columns=['negative_review', 'positive_review', 'tags'])
        )
hotels.shape

(386803, 384)

In [19]:
cols_bool = (hotels.iloc[:,17:].sum(axis=0)/hotels.shape[0]).to_frame(name='proportion')
cols_bool.to_parquet('data/cols_bool_parquet.gzip', engine='pyarrow', compression='gzip')

### 1.2.5. Работа с адресами.

In [20]:
address_guide = pd.read_parquet('data/address_guide_parquet.gzip')

In [129]:
def _get_json_address(col_value: str, api_key: str) -> str:
    import requests
    import numpy as np
    import pandas as pd
    url = 'https://www.mapquestapi.com/geocoding/v1/address'
    params = {
        'key': api_key,
        'location': col_value,
    }
    
    cols = ['country', 'city', 'Neighborhood']
    
    
    try:
        res = requests.get(url=url, params=params)
    except TimeoutError:
        return pd.Series([np.nan]*len(cols), index=cols)
    except requests.ConnectTimeout:
        return pd.Series([np.nan]*len(cols), index=cols)
    
    tmp = res.json()
    
    country = tmp.get('results', [{}])[0].get('locations', [{}])[0].get('adminArea1')
    # state = tmp.get('results', [{}])[0].get('locations', [{}])[0].get('adminArea3')
    # county = tmp.get('results', [{}])[0].get('locations', [{}])[0].get('adminArea4')
    city = tmp.get('results', [{}])[0].get('locations', [{}])[0].get('adminArea5')
    Neighborhood = tmp.get('results', [{}])[0].get('locations', [{}])[0].get('adminArea6')
    # street = tmp.get('results', [{}])[0].get('locations', [{}])[0].get('street')
    
    return pd.Series([country, city, Neighborhood], index=cols)

In [130]:
tmp = pd.DataFrame(hotels['hotel_address'].unique(), columns=['address'])
address_guide = pd.concat((tmp, tmp['address'].parallel_apply(_get_json_address, args=(api_key_mapquests,))), axis=1)
clear_output()
# Преобразование признаков
for col in address_guide.columns[1:]:
    category_order_list = sorted([x for x in address_guide[col].unique() if not pd.isnull(x)])
    address_guide[col] = address_guide[col].astype('category').cat.set_categories(category_order_list, ordered=True)
address_guide.nunique()

address         1493
country            8
city              14
Neighborhood     219
dtype: int64

In [191]:
address_guide.to_parquet('data/address_guide_parquet.gzip', engine='pyarrow', compression='gzip')

In [21]:
# hotels = hotels.merge(right=address_guide, how='left', left_on='hotel_address', right_on='address').drop(columns=['hotel_address', 'address'])
hotels = hotels.merge(right=address_guide, how='left', left_on='hotel_address', right_on='address').drop(columns='address')

In [29]:
hotels.to_parquet('data/hotels_v3_parquet.gzip', engine='pyarrow', compression='gzip')

# 2. Подготовка данных для модели

In [51]:
hotels = pd.read_parquet('data/hotels_v2_parquet.gzip')

In [43]:
hotels['review_date_from_2015-01-01'] = (hotels['review_date'] - pd.to_datetime('2015-01-01')).dt.days.astype('UInt16')
drop_col_list = list(hotels.select_dtypes('object').columns)
drop_col_list.extend(list(hotels.select_dtypes('string').columns))
drop_col_list.extend(list(hotels.select_dtypes('datetime64[ns]').columns))

hotels = hotels.drop(columns=drop_col_list)
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 40 columns):
 #   Column                                      Non-Null Count   Dtype   
---  ------                                      --------------   -----   
 0   additional_number_of_scoring                386803 non-null  UInt16  
 1   average_score                               386803 non-null  Float32 
 2   hotel_name                                  386803 non-null  category
 3   reviewer_nationality                        386408 non-null  category
 4   review_total_negative_word_counts           386803 non-null  UInt16  
 5   total_number_of_reviews                     386803 non-null  UInt16  
 6   review_total_positive_word_counts           386803 non-null  UInt16  
 7   total_number_of_reviews_reviewer_has_given  386803 non-null  UInt16  
 8   reviewer_score                              386803 non-null  Float32 
 9   days_since_review                           386803 non-null

In [44]:
for col in hotels.select_dtypes('category').columns:
    hotels[col] = (hotels[col].cat.codes + 1).fillna(0)

hotels = hotels.fillna(0)
    

In [45]:
# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели  
# Х - данные с информацией об отелях, у - целевая переменная (рейтинги отелей)  
X = hotels.drop(['reviewer_score'], axis = 1)  
y = hotels['reviewer_score'] 

In [46]:
# Загружаем специальный инструмент для разбивки:  
from sklearn.model_selection import train_test_split  

In [47]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.  
# Для тестирования мы будем использовать 25% от исходного датасета.  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [48]:
# Импортируем необходимые библиотеки:  
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели  
from sklearn import metrics # инструменты для оценки точности модели  
  
# Создаём модель  
regr = RandomForestRegressor(n_estimators=100)  
      
# Обучаем модель на тестовом наборе данных  
regr.fit(X_train, y_train)  
      
# Используем обученную модель для предсказания рейтинга отелей в тестовой выборке.  
# Предсказанные значения записываем в переменную y_pred  
y_pred = regr.predict(X_test)  


In [49]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они отличаются  
# Метрика называется Mean Absolute Percentage Error (MAPE) и показывает среднюю абсолютную процентную ошибку предсказанных значений от фактических.  
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.13209802929632247


# 3. EDA

In [10]:
hotels = pd.read_parquet('data/hotels_v3_parquet.gzip')
hotels.shape

(386803, 407)

In [18]:
hotels = hotels.dropna(axis=0)
hotels.isnull().sum().sort_values(ascending=False)


hotel_address    0
manag            0
tram             0
wall             0
ok               0
                ..
modern           0
metro            0
back             0
window           0
Neighborhood     0
Length: 407, dtype: int64

In [51]:
# task 4.2
hotels['hotel_name'].nunique()

1492

In [52]:
# task 4.3
hotels['review_date'].describe()

count                           386803
mean     2016-08-13 14:41:18.712936704
min                2015-08-04 00:00:00
25%                2016-02-23 00:00:00
50%                2016-08-15 00:00:00
75%                2017-02-09 00:00:00
max                2017-08-03 00:00:00
Name: review_date, dtype: object

In [53]:
# task 4.4
hotels['tags'].explode().nunique()

2368

In [54]:
# task 4.5
hotels['tags'].explode().str.strip().value_counts()

tags
Leisure trip                         313593
Submitted from a mobile device       230778
Couple                               189212
Stayed 1 night                       145373
Stayed 2 nights                      100263
                                      ...  
Studio with Spa Access                    1
Comfort Family Room                       1
Junior Suite Free Wifi                    1
Design Suite                              1
Executive Double Room Non Smoking         1
Name: count, Length: 2368, dtype: int64

In [12]:
# task 4.6
tmp = (
            hotels['tags']
            .explode()
            .str.strip()
            .value_counts()
            .reset_index()
        )
            
tmp[tmp['tags'].str.contains('(Stayed)', regex=True)].sort_values('count', ascending=False).head(10)

  tmp[tmp['tags'].str.contains('(Stayed)', regex=True)].sort_values('count', ascending=False).head(10)


Unnamed: 0,tags,count
3,Stayed 1 night,145373
4,Stayed 2 nights,100263
6,Stayed 3 nights,72000
10,Stayed 4 nights,35748
17,Stayed 5 nights,15611
22,Stayed 6 nights,7399
26,Stayed 7 nights,5549
53,Stayed 8 nights,1910
76,Stayed 9 nights,966
98,Stayed 10 nights,663


In [31]:
test = hotels.copy()
test = test.drop(columns=['hotel_address', 'negative_review', 'tags', 'positive_review'])
for col in test.select_dtypes('category').columns:
    test[col] = test[col].cat.codes

test = test.fillna(0)

In [32]:
corr_mat = test.corr().stack().reset_index(name="correlation")
corr_mat['correlation_abs'] = corr_mat['correlation'].abs()
# sns.set_style("ticks",{'axes.grid' : True})
# g = sns.relplot(
#                 data=corr_mat,
#                 x="level_0",
#                 y="level_1",
#                 hue="correlation",
#                 size="correlation2",
#                 height=7,
#                 aspect=1.5,
#                 palette="vlag",
#                 edgecolor=".8",
#                 # sizes=(50, 250),
#                 # size_norm=(-.2, .8),
#                 # dashes=True
#                 )
# g.set(xlabel="", ylabel="")
# g.despine(left=True, bottom=True)
# g.ax.margins(.02)
# for label in g.ax.get_xticklabels():
#     label.set_rotation(90)

In [33]:
corr_mat.query('(correlation_abs > 0.7) & (level_0 != level_1)')

Unnamed: 0,level_0,level_1,correlation,correlation_abs
6,additional_number_of_scoring,total_number_of_reviews,0.824467,0.824467
393,review_date,days_since_review,-1.0,1.0
2298,total_number_of_reviews,additional_number_of_scoring,0.824467,0.824467
3831,days_since_review,review_date,-1.0,1.0
4999,Leisure trip,Business trip,-0.904145,0.904145
7673,Business trip,Leisure trip,-0.904145,0.904145


In [34]:
corr_mat.query('correlation_abs > 0.5 & correlation_abs <= 0.7')

Unnamed: 0,level_0,level_1,correlation,correlation_abs
4997,Leisure trip,Solo traveler,-0.509337,0.509337
5763,Couple,Solo traveler,-0.504555,0.504555
6907,Solo traveler,Leisure trip,-0.509337,0.509337
6909,Solo traveler,Couple,-0.504555,0.504555
6914,Solo traveler,Business trip,0.52859,0.52859
7678,Business trip,Solo traveler,0.52859,0.52859
41903,coffe,tea,0.508329,0.508329
53474,air,condit,0.671162,0.671162
56841,money,valu,0.616421,0.616421
59857,tea,coffe,0.508329,0.508329


In [35]:
drop_col = ['additional_number_of_scoring', 'review_date', 'Business trip', 'valu', 'Standard Double or Twin Room', 'Classic Double or Twin Room', 'close', 'back', 'qualiti', 'best', 'high', 'london', 'friend', 'Twin Room', 'stop', 'floor', 'use', 'made']
test = test.drop(columns=drop_col, errors='ignore')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Columns: 365 entries, average_score to Neighborhood
dtypes: Float32(4), UInt16(5), bool(351), int16(3), int8(2)
memory usage: 145.3 MB


# 4. Final

In [36]:
# Загружаем специальный инструмент для разбивки:  
from sklearn.model_selection import train_test_split
# Импортируем необходимые библиотеки:  
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели  
from sklearn import metrics # инструменты для оценки точности модели  

In [37]:
# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели  
# Х - данные с информацией об отелях, у - целевая переменная (рейтинги отелей)  
X = test.drop(['reviewer_score'], axis = 1)  
y = test['reviewer_score'] 

In [38]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.  
# Для тестирования мы будем использовать 25% от исходного датасета.  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [39]:
# Создаём модель  
regr = RandomForestRegressor(n_estimators=100)  
      
# Обучаем модель на тестовом наборе данных  
regr.fit(X_train, y_train)  
      
# Используем обученную модель для предсказания рейтинга отелей в тестовой выборке.  
# Предсказанные значения записываем в переменную y_pred  
y_pred = regr.predict(X_test)  

In [40]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они отличаются  
# Метрика называется Mean Absolute Percentage Error (MAPE) и показывает среднюю абсолютную процентную ошибку предсказанных значений от фактических.  
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.12431063138028595
