![](https://github.com/dkataiev/skillfactory_rds/raw/master/resources/img/tripadvisor-logo.png)
# Predict TripAdvisor Rating
## В этом соревновании нам предстоит предсказать рейтинг ресторана в TripAdvisor

# Загрузка Pandas и очистка данных

In [99]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

from sklearn.model_selection import train_test_split

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/.DS_Store
./data/ta_data.csv
./data/main_task.csv
./data/ta_data/.DS_Store
./data/ta_data/urls_30000-35000_data.json
./data/ta_data/urls_0-5000_data.json
./data/ta_data/urls_5000-10000_data.json
./data/ta_data/urls_15000-20000_data.json
./data/ta_data/urls_25000-30000_data.json
./data/ta_data/urls_35000-40000_data.json
./data/ta_data/urls_20000-25000_data.json
./data/ta_data/urls_10000-15000_data.json
./data/urls/urls_20000-25000.csv
./data/urls/urls_5000-10000.csv
./data/urls/urls_15000-20000.csv
./data/urls/urls_25000-30000.csv
./data/urls/.DS_Store
./data/urls/urls_35000-40000.csv
./data/urls/urls_10000-15000.csv
./data/urls/urls_0-5000.csv
./data/urls/urls_30000-35000.csv


In [100]:
DATA_DIR = './data'

data = pd.read_csv(DATA_DIR + '/main_task.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


#### Подробнее по признакам:
* `City`: Город 
* `Cuisine Style`: Кухня
* `Ranking`: Ранг ресторана относительно других ресторанов в этом городе
* `Price Range`: Цены в ресторане в 3 категориях
* `Number of Reviews`: Количество отзывов
* `Reviews`: 2 последних отзыва и даты этих отзывов
* `URL_TA`: страница ресторана на 'www.tripadvisor.com' 
* `ID_TA`: ID ресторана в TripAdvisor
* `Rating`: Рейтинг ресторана

In [101]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
27579,id_843,Prague,"['Lebanese', 'Mediterranean', 'Middle Eastern'...",844.0,4.0,$$ - $$$,118.0,"[['Good Lebanese food', 'Nice middle Eastern f...",/Restaurant_Review-g274707-d3544293-Reviews-Kl...,d3544293
11028,id_811,Oporto,,812.0,4.0,,38.0,"[['Best chocolate cake', ""They're not wrong - ...",/Restaurant_Review-g189180-d6163786-Reviews-O_...,d6163786
14067,id_2482,Barcelona,['Spanish'],2483.0,5.0,$,18.0,"[['Great service - great food', 'Great spot fo...",/Restaurant_Review-g187497-d8396349-Reviews-Ca...,d8396349
4943,id_2200,Lisbon,"['Asian', 'Thai', 'Vegetarian Friendly']",2203.0,4.0,$$ - $$$,58.0,"[['Delicious food, friendly staff', 'Ok'], ['0...",/Restaurant_Review-g189158-d4300612-Reviews-Th...,d4300612
17311,id_111,Brussels,"['French', 'Belgian', 'European', 'Vegetarian ...",112.0,4.5,$$$$,406.0,"[['Great experience', 'Amazing.'], ['02/23/201...",/Restaurant_Review-g188644-d786404-Reviews-Le_...,d786404


#### Number of Reviews

In [102]:
def preproc_number_of_reviews(df):
    # Вынесем информацию о наличии пропуска как отдельный признак
    df['Number_of_Reviews_isNAN'] = pd.isna(df['Number of Reviews']).astype('uint8')
    # Далее заполняем пропуски 0, вы можете попробовать заполнением средним или средним по городу и тд...
    df['Number of Reviews'].fillna(0, inplace=True)
    return df

data = preproc_number_of_reviews(data)

#### Price Range

In [103]:
data['Price Range'].value_counts()

$$ - $$$    18412
$            6279
$$$$         1423
Name: Price Range, dtype: int64

По описанию 'Price Range' это - Цены в ресторане.
Их можно поставить по возрастанию (значит это не категориальный признак). А это значит, что их можно заменить последовательными числами, например 1,2,3

In [104]:
def preproc_price_range(df):
    # Заменим категориальный признак числами
    df['Price Range'].replace(to_replace={ '$': 1, '$$ - $$$': 2, '$$$$': 3}, inplace=True)
    # Сохраним данные о записях, где не был указан уровень цен.
    df['Price_Range_isNAN'] = pd.isna(df['Price Range']).astype('uint8')
    # Заполним пропуски, пока что воспользуемся просто медианой.
    df['Price Range'].fillna(df['Price Range'].median(), inplace=True)
    return df

data = preproc_price_range(data)

In [105]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,Price_Range_isNAN
30134,id_435,Helsinki,"['Bar', 'Pub']",436.0,4.5,2.0,15.0,"[['Smallish privately owned pub', 'best select...",/Restaurant_Review-g189934-d8373416-Reviews-On...,d8373416,0,1
913,id_2773,Amsterdam,,2780.0,4.0,2.0,0.0,"[[], []]",/Restaurant_Review-g188590-d10467094-Reviews-A...,d10467094,1,1
30356,id_784,Dublin,"['Irish', 'Bar', 'Pub', 'European']",785.0,3.5,2.0,188.0,"[['Family Gathering', 'Good selection on the b...",/Restaurant_Review-g186605-d716030-Reviews-Tur...,d716030,0,0
2065,id_83,Paris,"['Lebanese', 'Mediterranean', 'Middle Eastern'...",84.0,4.5,2.0,952.0,"[['Awesome restaurant, vibrant atmosphere!', '...",/Restaurant_Review-g187147-d897021-Reviews-Lou...,d897021,0,0
29331,id_1074,Athens,,1076.0,5.0,2.0,0.0,"[['a friendly stop in this part of Kolonaki'],...",/Restaurant_Review-g189400-d11767612-Reviews-L...,d11767612,1,1


#### Cuisine Style

Для начала добавим к записям признак с колличеством представленых типов кухонь. Если в данных отсутствует информация о типах кухонь, то считаем, что в этом ресторане предлагается только один тип кухни. Так же пометим записи, где не укзан тип кухни.

In [106]:
def string_to_list(s):
    return s[1:-1].replace("'", "").split(', ')

def count_styles(s):
    return len(string_to_list(s)) if s is not np.nan else 1

def get_cuisines_list(df):
    cuisines = set()
    for cuisine in df['Cuisine Style'].dropna().values:
        cuisines.update(string_to_list(cuisine))
    return cuisines

def update_cuisines_rating(cuisine_styles, cuisines_ratings):
        for style in cuisine_styles:
            cuisines_ratings[style] = cuisines_ratings[style] + 1

def calc_cuisines_ratings(df, cuisines):
    cuisines_ratings = {}
    for cuisine in cuisines:
        cuisines_ratings[cuisine] = 0
    
    for cuisine in df['Cuisine Style'].dropna().values:
        update_cuisines_rating(string_to_list(cuisine), cuisines_ratings)
    return cuisines_ratings

def preproc_cuisine_style(df):
    # добавим к записям признак с колличеством представленых типов кухонь
    df['Cuisine Style Count'] = df['Cuisine Style'].apply(count_styles)
    # пометим записи, где не укзан тип кухни.
    df['Cuisine_Style_isNAN'] = pd.isna(df['Cuisine Style']).astype('uint8')

    # отдельными признаками добавим типы кухни, которые представлены хотя бы в 1% ресторанов.
    # соберем список всех кухонь представленых в датасете.
    cuisines_list = get_cuisines_list(df)

    # соберем данные о том, в скольки ресторанах представлена каждая кухня.
    cuisines_rating = calc_cuisines_ratings(df, cuisines_list)

    # выделим две группы кухонь те которые представлены хотя бы в 1% ресторанов и остальные
    top_cuisines = []
    low_cuisines = []
    for cuisine in cuisines_rating:
        if cuisines_rating[cuisine] > len(data) * .01:
            top_cuisines.append(cuisine)
        else:
            low_cuisines.append(cuisine)

    # создадим отдельные признаки для кухонь представленых хотя бы в 1%
    def top_cuisine_present(x):
        if pd.isna(x): 
            return 0
        return 1 if cuisine in x else 0

    for cuisine in top_cuisines:
        dummy = df['Cuisine Style'].apply(top_cuisine_present)
        df['Cuisine_' + cuisine] = dummy

    # для остальных добавим признак Cuisine_Other
    def is_low_cuisine_present(x):
        if pd.isna(x): 
            return 0
        for cuisine in low_cuisines:
            if cuisine in x:
                return 1
        return 0

    df['Cuisine_Other'] = df['Cuisine Style'].apply(is_low_cuisine_present)
    
    return df

data = preproc_cuisine_style(data)

In [107]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,...,Cuisine_French,Cuisine_Central European,Cuisine_Bar,Cuisine_Vegetarian Friendly,Cuisine_Fast Food,Cuisine_Gastropub,Cuisine_Cafe,Cuisine_International,Cuisine_Turkish,Cuisine_Other
26147,id_2373,Paris,"['Vegetarian Friendly', 'Gluten Free Options']",2374.0,4.5,2.0,40.0,"[['Authentic French, All-Vegan, DELICIOUS, Af....",/Restaurant_Review-g187147-d12337311-Reviews-V...,d12337311,...,0,0,0,1,0,0,0,0,0,0
6254,id_3381,Lisbon,"['European', 'Portuguese']",3385.0,4.0,2.0,16.0,"[['Go here if you want to be bullied', 'Nice e...",/Restaurant_Review-g189158-d3543805-Reviews-Ro...,d3543805,...,0,0,0,0,0,0,0,0,0,0
10965,id_1681,Madrid,"['Bar', 'Mediterranean', 'Spanish']",1683.0,4.0,1.0,268.0,"[['Good food and interesting ambiance', 'Very ...",/Restaurant_Review-g187514-d4758734-Reviews-Mu...,d4758734,...,0,0,1,0,0,0,0,0,0,0
38438,id_8515,Madrid,,8522.0,2.0,2.0,0.0,"[[], []]",/Restaurant_Review-g187514-d6715817-Reviews-St...,d6715817,...,0,0,0,0,0,0,0,0,0,0
24440,id_1221,Copenhagen,['Healthy'],1223.0,4.5,2.0,4.0,"[['Great sandwiches', 'Best sandwich in a long...",/Restaurant_Review-g189541-d1012320-Reviews-Ku...,d1012320,...,0,0,0,0,0,0,0,0,0,0


#### Reviews

Добавим признаки с датами последнего и предпоследнего отзывов, а также колличество дней между ними.

In [108]:
def get_dates_list(x):
    dates = x.split('], [')[1]
    dates = dates[:-2]
    dates = dates.replace("'", "")
    return dates.split(', ')

def get_penultimate_review(x):
    dates = get_dates_list(x)
    return pd.to_datetime(dates[-1]) if len(dates) >= 1 else np.NaN

def get_last_review(x):
    dates = get_dates_list(x)
    return pd.to_datetime(dates[0]) if len(dates) >= 1 else np.NaN

def preproc_reviews(df):
    # заполним пропуски пустыми значениями
    df['Reviews'].fillna('[[], []]', inplace=True)

    # отметим записи коментарии в которых отсутствуют
    df['Reviews_NA'] = (df['Reviews'] == '[[], []]').astype(int)

    # создадим признаки с датами двух последних коментариев
    df['Penultimate_Review'] = df['Reviews'].apply(lambda x: get_penultimate_review(x))
    df['Last_Review'] = df['Reviews'].apply(lambda x: get_last_review(x))

    # создадим признак количеством дней прошедшими между двумя последними коментариями
    df = df.assign(Review_Time_Delta = lambda x: (x['Last_Review'] - x['Penultimate_Review']).dt.days)

    # звполним пустые записи
    df['Review_Time_Delta'] = df['Review_Time_Delta'].fillna(0)
    return df

data = preproc_reviews(data)

In [109]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,...,Cuisine_Fast Food,Cuisine_Gastropub,Cuisine_Cafe,Cuisine_International,Cuisine_Turkish,Cuisine_Other,Reviews_NA,Penultimate_Review,Last_Review,Review_Time_Delta
22591,id_5126,Barcelona,['Asian'],5127.0,3.5,2.0,95.0,[['Nice atmosphere but definitely not a club.....,/Restaurant_Review-g187497-d1858535-Reviews-CL...,d1858535,...,0,0,0,0,0,0,0,2017-07-12,2017-08-06,25.0
5771,id_109,Prague,"['Italian', 'Pizza', 'Mediterranean', 'Europea...",110.0,4.5,2.0,789.0,"[['Great lunch, friendly people', 'Lunch and a...",/Restaurant_Review-g274707-d4130885-Reviews-Pi...,d4130885,...,0,0,0,0,0,0,0,2018-01-09,2018-01-09,0.0
35510,id_1580,Berlin,"['Lebanese', 'Moroccan', 'Mediterranean', 'Mid...",1582.0,4.5,2.0,16.0,[['Food with great flavor from a multi-talent....,/Restaurant_Review-g187323-d11925522-Reviews-R...,d11925522,...,0,0,0,0,0,1,0,2017-03-03,2017-05-20,78.0
33621,id_463,Prague,"['Bar', 'European', 'Czech', 'Eastern European...",464.0,4.0,2.0,220.0,"[['Good place in the heart of the city', 'OK p...",/Restaurant_Review-g274707-d7155220-Reviews-U_...,d7155220,...,0,0,0,1,0,0,0,2017-11-28,2017-12-30,32.0
38287,id_694,Prague,"['Italian', 'Pizza', 'Mediterranean', 'Europea...",695.0,4.0,2.0,77.0,"[['Great, as always', 'Still good, but depends...",/Restaurant_Review-g274707-d12794680-Reviews-V...,d12794680,...,0,0,0,0,0,0,0,2017-12-29,2018-01-07,9.0


Попробуем по тексту двух последних коментариев понять были ли они положительными либо отрицательными.

In [110]:
data['Reviews'].sample(5)

23871    [['Quality/price ratio very good, even too go....
13065    [['Authentic Pad Thai', 'Asiatique food'], ['0...
4376     [['Mellow, urbane elegance; good wine', 'A tur...
11841    [['Late lunch', 'Quick hassle free dinner on a...
19361    [['Great food, excellent service', 'Fantastic ...
Name: Reviews, dtype: object

In [None]:
chars_to_replace = "',.!?&-+/\""

def get_reviews_words(x):
    reviews = x.split('], [')[0]
    reviews = reviews[2:]
    reviews = reviews.split("', '")
    if len(reviews) == 1:
        return []

    words = set()
    for review in reviews:
        for ch in chars_to_replace:
            review = review.replace(ch, '')
        review = review.lower()
    return words

def count_review_words(x):
    words = get_reviews_words(x)

# для начала соберем список слов которые есть в коментариях.
# data['Reviews'].sample(10).apply(count_review_words)

# в процессе выяснилось, что подавляющее большинство коментариев положительные, пока отложим это

#### Cities

In [73]:
# для One-Hot Encoding в pandas есть готовая функция - get_dummies. Особенно радует параметр dummy_na

def preproc_cities(df):

    # Прежде, чем заменять признак 'City' dummy-переменными, сохраним на будущее некоторую информацию.
    # Соберем среднее значение критерия Ranking по городам
    stats_by_city = df.groupby(['City']).median()   
    
    # Для каждой строчки добавим признак с отклонением от среднего значения соответствующего города
    df['Ranking_Delta'] = df.apply(lambda x: x['Ranking'] - stats_by_city['Ranking'][x['City']], axis=1)
    df['Cuisine_Style_Count_Delta'] = df.apply(lambda x: x['Cuisine Style Count'] - stats_by_city['Cuisine Style Count'][x['City']], axis=1)
    df['Price_Range_Delta'] = df.apply(lambda x: x['Price Range'] - stats_by_city['Price Range'][x['City']], axis=1)
    df['Reviews_Number_City_Median_Delta'] = df.apply(lambda x: x['Number of Reviews'] - stats_by_city['Number of Reviews'][x['City']], axis=1)

    df = pd.get_dummies(df, columns=['City'], dummy_na=True)
    return df

data = preproc_cities(data)

In [74]:
data.sample(5)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,City_Oporto,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan
12289,id_15861,"['Fast Food', 'American']",15873.0,2.0,1.0,14.0,"[['Worst KFC iv ever seen', 'Pre-film lunch'],...",/Restaurant_Review-g186338-d5442988-Reviews-KF...,d5442988,0,...,0,0,0,0,0,0,0,0,0,0
18950,id_1221,"['Mediterranean', 'Turkish', 'Middle Eastern']",1233.0,4.0,2.0,69.0,"[['Above average', 'Best Sunday brunch ever'],...",/Restaurant_Review-g187309-d2020076-Reviews-Di...,d2020076,0,...,0,0,0,0,0,0,0,0,0,0
7090,id_405,"['Bar', 'European', 'Pub', 'Czech', 'Eastern E...",406.0,4.0,2.0,1832.0,"[['Great range of beers!', 'Beer Beer Beer'], ...",/Restaurant_Review-g274707-d694859-Reviews-Piv...,d694859,0,...,0,0,0,1,0,0,0,0,0,0
35394,id_1946,,1948.0,4.5,2.0,3.0,"[[], []]",/Restaurant_Review-g190454-d5986184-Reviews-Be...,d5986184,0,...,0,0,0,0,0,0,1,0,0,0
12236,id_10434,['Chinese'],10436.0,3.0,1.0,67.0,"[['Delicious', 'Pretty disappointing'], ['01/0...",/Restaurant_Review-g187147-d7794226-Reviews-Hu...,d7794226,0,...,0,0,1,0,0,0,0,0,0,0


### Это "жжж" неспроста

В датасете у нас есть ссылка страничку ресторана в системе TripAdrisor. Рассмотрим несколько страниц и попробуем выделить потенциально полезную нам информацию.

Опишем необходимые для парсинга функции

In [75]:
import requests

from lxml import html
from bs4 import BeautifulSoup
from multiprocessing import Pool
from datetime import datetime

TRIP_ADVISOR_URL_TEMPLATE = 'https://www.tripadvisor.com{}'

def parse_ratings_and_reviews(node, result):
    rating_block = node.find('div').findAll('div', recursive=False)[2]
    rating_block = rating_block.findAll('div', recursive=False)
    if len(rating_block) < 2:
        return

    ratings = rating_block[1].findAll('div')
    for rating in ratings:
        spans = rating.findAll('span', recursive=False)
        title = spans[1].text.lower()
        value = spans[2].find('span').attrs['class'][1].split('_')[1]
        result[title] = int(value)

def parse_location_and_contact(node):
    location_block = node.find('div').find('div')
    location_block = location_block.findAll('div', recursive=False)[1]
    distance_el = location_block.find('b')
    if distance_el is None:
        return np.NaN
    return float(distance_el.text.split()[0])

def parse_details_block(node, result):
    if node is None:
        return

    result['is_verified'] = 1 if node.find('span', {'class': 'ui_icon verified-checkmark'}) is not None else 0
    result['has_phone_number'] = 1 if node.find('a', string='+ Add phone number') is None else 0
    result['has_hours'] = 1 if node.find('a', string='+ Add hours') is None else 0
    result['has_website'] = 1 if node.find('a', string='+ Add website') is None else 0
    result['has_menu'] = 1 if node.find('a', string='Menu') is not None else 0
        

def collect_page_data(html, result):
    soup = BeautifulSoup(html)
    overview_tabs = soup.find('div', {'data-tab': 'TABS_OVERVIEW'})
    if overview_tabs is None:
        return

    overview_columns = overview_tabs.findAll('div', {'class':'ui_column'})
    parse_ratings_and_reviews(overview_columns[0], result)
    parse_details_block(overview_columns[1], result)

    result['distance'] = parse_location_and_contact(overview_columns[2])
    result['has_tcAward'] = 1 if soup.find('img', {'class': 'tcAward'}) is not None else 0

def current_time():
    return datetime.now().strftime("%H:%M:%S")

def get_id_from_url(url):
    return url.split('-')[2]

def parse_site_page(url):
    result = {}
    result['id_ta'] = get_id_from_url(url)
    ta_url = TRIP_ADVISOR_URL_TEMPLATE.format(url)
    print(url)
    r = requests.get(ta_url, stream=True)
    print('Done')
    collect_page_data(r.text, result)
    return result

def process_ta_urls(x):
    return x.apply(parse_site_page)

In [76]:
def parallelize_processing(df, func, n_cores=8):
    pool = Pool(n_cores)
    df_split = np.array_split(df, n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# print("Started at:", current_time())

# start = 0
# step = 5000
# stop = len(data)
# n_cores = 16

# for i in range(start, stop, step):
#     print('[{}] Getting records {}:{}'.format(current_time(), i, i + step))
#     pages_data = parallelize_processing(data['URL_TA'][i:i+step], process_ta_urls, n_cores=n_cores)
#     site_df = pd.DataFrame(pages_data.values.tolist())
#     site_df.to_csv('ta_data/data_{}-{}.csv'.format(i, i+step))    

# print("Finished at:", current_time())

Все было прекрасно, но jupyther notebook вешается при попытке обработать все ссылки.

Вынесем код в отдельный скрипт и запустим.

![](https://github.com/dkataiev/skillfactory_rds/raw/master/resources/img/several-song-filled-hours-later.jpg)

Выгрузим ссылки на страницы в отдельные файлы для работы граббера.

In [77]:
# start = 0
# step = 5000
# stop = len(df)

# for i in range(start, stop, step):
#     df[['ID_TA','URL_TA']][i:i+step].to_csv('./data/urls/urls_{}-{}.csv'.format(i, i+step), header=False)

Превратим набор JSON-файлов в CSV-формат для дальнейшего удобства рабты с ними.

In [78]:
# import json

# chunks = []

# for dirname, _, filenames in os.walk(DATA_DIR + '/ta_data'):
#     for filename in filenames:
#         if not filename.endswith('.json'):
#             continue

#         with open(os.path.join(dirname, filename)) as f:
#             data_chunk = json.load(f)
#             chunks.append(pd.DataFrame(data_chunk))

# ta_df = pd.concat(chunks, ignore_index=True)
# ta_df.info()
# ta_df.sample(10)

### Почистим полученные данные и подготовим их к дальнейшей работе.

#### Заполним пропуски

Признаки 
* food
* service
* value
* atmosphere 

содержат оценку в конкретной категории. На их основе мы вычислим суммарный рейтинг, поэтому забьем пропуски нулями.

Слудующие признаки содержат номинативные показатели, нет значения - нет показателя, забиваем нолями.

* is_verified
* has_phone_number
* has_hours
* has_website
* has_menu
* has_tcAward

In [79]:
# ta_df.fillna({
#     'food':0,
#     'service':0,
#     'value':0,
#     'atmosphere':0,
#     'is_verified':0,
#     'has_phone_number':0,
#     'has_hours':0,
#     'has_website':0,
#     'has_menu':0,
#     'has_tcAward':0},
#     inplace=True)

# ta_df.info()

С признаком 'distance' все немного сложнее, это раастояние от некоего культурного центра.

Пометим записи, где он отсутствует новым признаком а потом заменим пропуски на -1 что бы отличать от реального расстояния

In [80]:
# ta_df['distance_isNAN'] = pd.isna(ta_df['distance']).astype('uint8')
# ta_df.fillna({'distance':-1},inplace=True)
# ta_df.info()

In [81]:
ta_df.sample(10)

Unnamed: 0,ta_id,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
16595,d2043542,35.0,30.0,30.0,0.0,1.0,1.0,1.0,0.0,0.3,0.0,35.0,0,130.0
20621,d10397411,35.0,35.0,35.0,0.0,1.0,1.0,1.0,0.0,0.2,0.0,0.0,0,105.0
27854,d5104042,40.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.5,0.0,35.0,0,165.0
9814,d3336691,40.0,40.0,45.0,0.0,1.0,1.0,1.0,0.0,4.5,0.0,0.0,0,125.0
35103,d9599183,35.0,40.0,35.0,0.0,1.0,1.0,1.0,0.0,0.4,0.0,0.0,0,110.0
34553,d8737537,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.3,0.0,0.0,0,0.0
21070,d12392164,45.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.4,1.0,0.0,0,135.0
7864,d2177461,35.0,35.0,35.0,0.0,1.0,1.0,1.0,0.0,1.8,0.0,30.0,0,135.0
18278,d5075295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1,0.0
19035,d10071406,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.6,0.0,0.0,0,0.0


Добавим суммарный рейтинг на основе следующих признаков:

* food
* service
* value
* atmosphere

In [82]:
# ta_df['ratings_summary'] = ta_df.apply(lambda x: x['food'] + x['service'] + x['value'] + x['atmosphere'], axis=1)
# ta_df.sample(5)

Удалим ненужные нам колонки

In [83]:
# ta_df.drop(columns=['id', 'ta_url'], inplace=True)
# ta_df.info()

Сохраним подготовленные данные в формате csv.

In [84]:
# ta_df.to_csv('./data/ta_data.csv', index=False)

Далее будем работать с данными сайта используя выгруженный csv файл.

In [85]:
ta_df = pd.read_csv(DATA_DIR + '/ta_data.csv')
ta_df.info()
ta_df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ta_id             40000 non-null  object 
 1   food              40000 non-null  float64
 2   service           40000 non-null  float64
 3   value             40000 non-null  float64
 4   is_verified       40000 non-null  float64
 5   has_phone_number  40000 non-null  float64
 6   has_hours         40000 non-null  float64
 7   has_website       40000 non-null  float64
 8   has_menu          40000 non-null  float64
 9   distance          40000 non-null  float64
 10  has_tcAward       40000 non-null  float64
 11  atmosphere        40000 non-null  float64
 12  distance_isNAN    40000 non-null  int64  
 13  ratings_summary   40000 non-null  float64
dtypes: float64(12), int64(1), object(1)
memory usage: 4.3+ MB


Unnamed: 0,ta_id,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
10881,d6690404,45.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.2,0.0,45.0,0,180.0
22051,d7291391,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,3.7,0.0,0.0,0,0.0
33388,d8819170,50.0,50.0,50.0,0.0,1.0,1.0,1.0,0.0,4.4,0.0,0.0,0,150.0
35655,d5524876,45.0,40.0,50.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0,135.0
22393,d7896519,45.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.5,0.0,0.0,0,135.0


Добавим данные полученные с сайта к основному датасету.

In [86]:
def add_site_data(df):
    return pd.merge(df, ta_df, left_on='ID_TA', right_on='ta_id')

data = add_site_data(data)

In [87]:
data.info()
data.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Columns: 111 entries, Restaurant_id to ratings_summary
dtypes: datetime64[ns](2), float64(20), int64(48), object(6), uint8(35)
memory usage: 24.9+ MB


Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
29663,id_20,"['European', 'Scandinavian', 'Vegetarian Frien...",21.0,4.5,2.0,203.0,"[['Excellent food - and super service!', 'Love...",/Restaurant_Review-g189934-d4881224-Reviews-Ga...,d4881224,0,...,0.0,1.0,1.0,1.0,0.0,0.1,1.0,40.0,0,170.0
354,id_7657,['Spanish'],7659.0,2.0,1.0,32.0,"[['Bad tapas, bad service.', 'Apero'], ['06/18...",/Restaurant_Review-g187497-d3179035-Reviews-Ce...,d3179035,0,...,0.0,1.0,1.0,1.0,0.0,0.4,0.0,0.0,0,60.0
325,id_5916,,5925.0,4.0,2.0,56.0,"[['Fab', 'Nice concept - good food'], ['11/22/...",/Restaurant_Review-g186338-d6672773-Reviews-Hi...,d6672773,0,...,0.0,1.0,1.0,1.0,0.0,1.2,0.0,0.0,0,125.0
18318,id_1649,,1653.0,3.5,2.0,4.0,"[['Nice food and athmosphere', 'Mr'], ['05/01/...",/Restaurant_Review-g186605-d743281-Reviews-Bla...,d743281,0,...,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1,0.0
18358,id_13654,['Cafe'],13664.0,3.0,2.0,24.0,"[['Afternoon coffev', '""Squashed like sardines...",/Restaurant_Review-g186338-d9729508-Reviews-Be...,d9729508,0,...,0.0,1.0,1.0,1.0,0.0,0.1,0.0,0.0,0,90.0


Удалим все признаки, не являющиесь числовыми.

In [88]:
object_cols = list(data.select_dtypes(include=['object', 'datetime64[ns]', 'timedelta64[ns]']).columns)
data = data.drop(labels=object_cols, axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Columns: 103 entries, Ranking to ratings_summary
dtypes: float64(20), int64(48), uint8(35)
memory usage: 22.4 MB


### Нормализация данных

In [89]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

columns_for_scaling = [
    'Number of Reviews', 
    'Ranking', 
    'Review_Time_Delta', 
    'Ranking_Delta', 
    'Reviews_Number_City_Median_Delta', 
    'Cuisine_Style_Count_Delta', 
    'Price_Range_Delta', 
    'Reviews_Number_City_Median_Delta',
    'food',
    'service',
    'value',
    'distance',
    'atmosphere',
    'ratings_summary'
    ]

scaled_df = data[columns_for_scaling]
scaled_df.info()

scaler = StandardScaler()
scaled = scaler.fit_transform(scaled_df)
scaled_df = pd.DataFrame(scaled)
scaled_df.columns = columns_for_scaling

for col in columns_for_scaling:
    data[col] = scaled_df[col]

data.sample(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Number of Reviews                 40040 non-null  float64
 1   Ranking                           40040 non-null  float64
 2   Review_Time_Delta                 40040 non-null  float64
 3   Ranking_Delta                     40040 non-null  float64
 4   Reviews_Number_City_Median_Delta  40040 non-null  float64
 5   Cuisine_Style_Count_Delta         40040 non-null  int64  
 6   Price_Range_Delta                 40040 non-null  float64
 7   Reviews_Number_City_Median_Delta  40040 non-null  float64
 8   food                              40040 non-null  float64
 9   service                           40040 non-null  float64
 10  value                             40040 non-null  float64
 11  distance                          40040 non-null  float64
 12  atmo

Unnamed: 0,Ranking,Rating,Price Range,Number of Reviews,Number_of_Reviews_isNAN,Price_Range_isNAN,Cuisine Style Count,Cuisine_Style_isNAN,Cuisine_American,Cuisine_Chinese,...,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
10124,-0.80386,4.0,2.0,-0.364686,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,-0.249606,0.0,-0.728694,0,-1.402547
26642,-0.292963,4.0,2.0,-0.395978,0,0,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.722274,0.0,-0.728694,0,-1.402547
9318,-0.346373,4.5,2.0,-0.395978,0,0,4,0,1,0,...,0.0,1.0,1.0,1.0,0.0,-0.05523,1.0,-0.728694,0,0.740797
16169,-0.043449,3.0,2.0,-0.375116,0,0,2,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-1.221487,0.0,-0.728694,1,-1.402547
3952,-0.591031,1.5,2.0,-0.163027,0,0,2,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.819463,0.0,0.874726,0,-0.293921
11277,2.423908,3.0,2.0,-0.389024,0,1,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.041958,0.0,-0.728694,0,-1.402547
5073,0.756884,3.5,2.0,0.139462,0,0,3,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.236334,0.0,1.141963,0,0.814706
21255,-0.455889,4.5,2.0,-0.267333,0,0,2,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.139146,0.0,-0.728694,0,0.59298
11624,2.779432,3.5,2.0,-0.023952,0,0,6,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.43071,0.0,1.4092,0,0.666889
21426,-0.401401,4.5,2.0,-0.395978,0,1,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,3.054788,0.0,-0.728694,0,-1.402547


# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [90]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = data.drop(['Rating'], axis = 1)
y = data['Rating']

In [91]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [92]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Создаём, обучаем и тестируем модель

In [93]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [94]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

In [96]:
def round_of_rating(number):
    return np.round(number * 2) / 2

y_pred = round_of_rating(y_pred)

In [97]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.15514485514485515
