![](https://github.com/dkataiev/skillfactory_rds/raw/master/resources/img/tripadvisor-logo.png)
# Predict TripAdvisor Rating
## В этом соревновании нам предстоит предсказать рейтинг ресторана в TripAdvisor

# Загрузка Pandas и очистка данных

In [34]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

from sklearn.model_selection import train_test_split

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/.DS_Store
./data/main_task.csv
./data/ta_data/.DS_Store
./data/ta_data/urls_30000-35000_data.json
./data/ta_data/urls_0-5000_data.json
./data/ta_data/urls_5000-10000_data.json
./data/ta_data/urls_15000-20000_data.json
./data/ta_data/urls_25000-30000_data.json
./data/ta_data/urls_35000-40000_data.json
./data/ta_data/urls_20000-25000_data.json
./data/ta_data/urls_10000-15000_data.json
./data/urls/urls_20000-25000.csv
./data/urls/urls_5000-10000.csv
./data/urls/urls_15000-20000.csv
./data/urls/urls_25000-30000.csv
./data/urls/.DS_Store
./data/urls/urls_35000-40000.csv
./data/urls/urls_10000-15000.csv
./data/urls/urls_0-5000.csv
./data/urls/urls_30000-35000.csv


In [35]:
DATA_DIR = './data'

data = pd.read_csv(DATA_DIR + '/main_task.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


#### Подробнее по признакам:
* `City`: Город 
* `Cuisine Style`: Кухня
* `Ranking`: Ранг ресторана относительно других ресторанов в этом городе
* `Price Range`: Цены в ресторане в 3 категориях
* `Number of Reviews`: Количество отзывов
* `Reviews`: 2 последних отзыва и даты этих отзывов
* `URL_TA`: страница ресторана на 'www.tripadvisor.com' 
* `ID_TA`: ID ресторана в TripAdvisor
* `Rating`: Рейтинг ресторана

In [36]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
1410,id_548,Berlin,"['Italian', 'Pizza', 'Mediterranean', 'Europea...",549.0,4.0,$$ - $$$,149.0,"[['Great service and food', 'Great pasta'], ['...",/Restaurant_Review-g187323-d3488960-Reviews-Re...,d3488960
5233,id_207,Budapest,"['European', 'Eastern European', 'Central Euro...",208.0,4.0,$$$$,1819.0,"[['Tourists restaurant great local food', 'Lov...",/Restaurant_Review-g274887-d712548-Reviews-Spo...,d712548
8413,id_1751,Madrid,['Mediterranean'],1753.0,4.0,$$ - $$$,94.0,"[['Good tapas and wine, lunch average, servic....",/Restaurant_Review-g187514-d2194044-Reviews-Ja...,d2194044
4404,id_678,Warsaw,['Pizza'],679.0,4.5,$,21.0,"[['Family visit.', 'Football'], ['11/08/2017',...",/Restaurant_Review-g274856-d8272690-Reviews-Pi...,d8272690
32830,id_5375,Barcelona,"['Italian', 'Mediterranean', 'Pizza']",5376.0,4.0,$$ - $$$,51.0,"[['Superb Cannelloni', 'Still excellent'], ['0...",/Restaurant_Review-g187497-d1224143-Reviews-I_...,d1224143


#### Number of Reviews

In [37]:
def preproc_number_of_reviews(df):
    # Вынесем информацию о наличии пропуска как отдельный признак
    df['Number_of_Reviews_isNAN'] = pd.isna(df['Number of Reviews']).astype('uint8')
    # Далее заполняем пропуски 0, вы можете попробовать заполнением средним или средним по городу и тд...
    df['Number of Reviews'].fillna(0, inplace=True)
    return df

data = preproc_number_of_reviews(data)

#### Price Range

In [38]:
data['Price Range'].value_counts()

$$ - $$$    18412
$            6279
$$$$         1423
Name: Price Range, dtype: int64

По описанию 'Price Range' это - Цены в ресторане.
Их можно поставить по возрастанию (значит это не категориальный признак). А это значит, что их можно заменить последовательными числами, например 1,2,3

In [39]:
def preproc_price_range(df):
    # Заменим категориальный признак числами
    df['Price Range'].replace(to_replace={ '$': 1, '$$ - $$$': 2, '$$$$': 3}, inplace=True)
    # Сохраним данные о записях, где не был указан уровень цен.
    df['Price_Range_isNAN'] = pd.isna(df['Price Range']).astype('uint8')
    # Заполним пропуски, пока что воспользуемся просто медианой.
    df['Price Range'].fillna(df['Price Range'].median(), inplace=True)
    return df

data = preproc_price_range(data)

In [40]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,Price_Range_isNAN
37529,id_1835,Brussels,,1836.0,4.5,2.0,4.0,"[['Good food', 'Quality without compromise'], ...",/Restaurant_Review-g188644-d4149712-Reviews-Le...,d4149712,0,1
8073,id_1115,Warsaw,,1117.0,5.0,2.0,0.0,"[['For early travellers'], ['04/10/2017']]",/Restaurant_Review-g274856-d12327525-Reviews-G...,d12327525,1,1
36541,id_7984,Madrid,['Japanese'],7991.0,3.0,2.0,21.0,"[[], []]",/Restaurant_Review-g187514-d10155034-Reviews-J...,d10155034,0,1
14943,id_1683,Madrid,"['Mediterranean', 'European', 'Spanish']",1685.0,4.0,2.0,118.0,"[['Good', 'Nice hotel bar - featuring Vieve Cl...",/Restaurant_Review-g187514-d5980819-Reviews-Ga...,d5980819,0,0
15113,id_2299,Milan,['Italian'],2301.0,5.0,2.0,12.0,"[[], []]",/Restaurant_Review-g187849-d7856108-Reviews-Ma...,d7856108,0,1


#### Cuisine Style

Для начала добавим к записям признак с колличеством представленых типов кухонь. Если в данных отсутствует информация о типах кухонь, то считаем, что в этом ресторане предлагается только один тип кухни. Так же пометим записи, где не укзан тип кухни.

In [41]:
def string_to_list(s):
    return s[1:-1].replace("'", "").split(', ')

def count_styles(s):
    return len(string_to_list(s)) if s is not np.nan else 1

def get_cuisines_list(df):
    cuisines = set()
    for cuisine in df['Cuisine Style'].dropna().values:
        cuisines.update(string_to_list(cuisine))
    return cuisines

def update_cuisines_rating(cuisine_styles, cuisines_ratings):
        for style in cuisine_styles:
            cuisines_ratings[style] = cuisines_ratings[style] + 1

def calc_cuisines_ratings(df, cuisines):
    cuisines_ratings = {}
    for cuisine in cuisines:
        cuisines_ratings[cuisine] = 0
    
    for cuisine in df['Cuisine Style'].dropna().values:
        update_cuisines_rating(string_to_list(cuisine), cuisines_ratings)
    return cuisines_ratings

def preproc_cuisine_style(df):
    # добавим к записям признак с колличеством представленых типов кухонь
    df['Cuisine Style Count'] = df['Cuisine Style'].apply(count_styles)
    # пометим записи, где не укзан тип кухни.
    df['Cuisine_Style_isNAN'] = pd.isna(df['Cuisine Style']).astype('uint8')

    # отдельными признаками добавим типы кухни, которые представлены хотя бы в 1% ресторанов.
    # соберем список всех кухонь представленых в датасете.
    cuisines_list = get_cuisines_list(df)

    # соберем данные о том, в скольки ресторанах представлена каждая кухня.
    cuisines_rating = calc_cuisines_ratings(df, cuisines_list)

    # выделим две группы кухонь те которые представлены хотя бы в 1% ресторанов и остальные
    top_cuisines = []
    low_cuisines = []
    for cuisine in cuisines_rating:
        if cuisines_rating[cuisine] > len(data) * .01:
            top_cuisines.append(cuisine)
        else:
            low_cuisines.append(cuisine)

    # создадим отдельные признаки для кухонь представленых хотя бы в 1%
    def top_cuisine_present(x):
        if pd.isna(x): 
            return 0
        return 1 if cuisine in x else 0

    for cuisine in top_cuisines:
        dummy = df['Cuisine Style'].apply(top_cuisine_present)
        df['Cuisine_' + cuisine] = dummy

    # для остальных добавим признак Cuisine_Other
    def is_low_cuisine_present(x):
        if pd.isna(x): 
            return 0
        for cuisine in low_cuisines:
            if cuisine in x:
                return 1
        return 0

    df['Cuisine_Other'] = df['Cuisine Style'].apply(is_low_cuisine_present)
    
    return df

data = preproc_cuisine_style(data)

In [42]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,...,Cuisine_Italian,Cuisine_Sushi,Cuisine_Japanese,Cuisine_Turkish,Cuisine_Contemporary,Cuisine_German,Cuisine_Spanish,Cuisine_French,Cuisine_European,Cuisine_Other
26773,id_7661,Madrid,,7668.0,2.5,2.0,8.0,"[[], []]",/Restaurant_Review-g187514-d5860148-Reviews-La...,d5860148,...,0,0,0,0,0,0,0,0,0,0
15801,id_15932,London,['Cafe'],15944.0,3.0,1.0,21.0,"[['Cheap breakfast', 'Extremely rude staff'], ...",/Restaurant_Review-g186338-d4795072-Reviews-Te...,d4795072,...,0,0,0,0,0,0,0,0,0,0
6934,id_1150,Dublin,['Cafe'],1153.0,5.0,2.0,7.0,[['Best breakfast and lunch option in Beacon.....,/Restaurant_Review-g186605-d4082110-Reviews-Th...,d4082110,...,0,0,0,0,0,0,0,0,0,0
33880,id_184,Edinburgh,"['European', 'British', 'Vegetarian Friendly',...",185.0,4.5,2.0,126.0,"[['A nice surprise....', 'Best restaurant in S...",/Restaurant_Review-g186525-d12116199-Reviews-T...,d12116199,...,0,0,0,0,0,0,0,0,1,0
38817,id_5362,Milan,['International'],5365.0,3.5,2.0,23.0,"[['Good price value', 'Calm Hostel'], ['12/29/...",/Restaurant_Review-g187849-d4074869-Reviews-Go...,d4074869,...,0,0,0,0,0,0,0,0,0,0


#### Reviews

Добавим признаки с датами последнего и предпоследнего отзывов, а также колличество дней между ними.

In [43]:
def get_dates_list(x):
    dates = x.split('], [')[1]
    dates = dates[:-2]
    dates = dates.replace("'", "")
    return dates.split(', ')

def get_penultimate_review(x):
    dates = get_dates_list(x)
    return pd.to_datetime(dates[-1]) if len(dates) >= 1 else np.NaN

def get_last_review(x):
    dates = get_dates_list(x)
    return pd.to_datetime(dates[0]) if len(dates) >= 1 else np.NaN

def preproc_reviews(df):
    # заполним пропуски пустыми значениями
    df['Reviews'].fillna('[[], []]', inplace=True)

    # отметим записи коментарии в которых отсутствуют
    df['Reviews_NA'] = (df['Reviews'] == '[[], []]').astype(int)

    # создадим признаки с датами двух последних коментариев
    df['Penultimate_Review'] = df['Reviews'].apply(lambda x: get_penultimate_review(x))
    df['Last_Review'] = df['Reviews'].apply(lambda x: get_last_review(x))

    # создадим признак количеством дней прошедшими между двумя последними коментариями
    df = df.assign(Review_Time_Delta = lambda x: (x['Last_Review'] - x['Penultimate_Review']).dt.days)

    # звполним пустые записи
    df['Review_Time_Delta'] = df['Review_Time_Delta'].fillna(0)
    return df

data = preproc_reviews(data)

In [44]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,...,Cuisine_Contemporary,Cuisine_German,Cuisine_Spanish,Cuisine_French,Cuisine_European,Cuisine_Other,Reviews_NA,Penultimate_Review,Last_Review,Review_Time_Delta
29515,id_1829,Stockholm,"['Bar', 'Cafe', 'Pub']",1832.0,4.0,2.0,4.0,"[['Well done as a ""café""'], ['09/23/2017']]",/Restaurant_Review-g189852-d12723669-Reviews-E...,d12723669,...,0,0,0,0,0,0,0,2017-09-23,2017-09-23,0.0
17141,id_1137,Vienna,"['Bar', 'International', 'Asian', 'Austrian', ...",1138.0,4.5,2.0,47.0,[['Good support restaurant for the Zeitgeist.....,/Restaurant_Review-g190454-d4496061-Reviews-Lo...,d4496061,...,0,0,0,0,0,1,0,2016-05-26,2017-09-24,486.0
1651,id_1300,Barcelona,"['Healthy', 'Vegetarian Friendly']",1301.0,4.0,2.0,346.0,"[['Great tapas', 'Great Tapas'], ['10/26/2017'...",/Restaurant_Review-g187497-d2703054-Reviews-La...,d2703054,...,0,0,0,0,0,0,0,2017-07-09,2017-10-26,109.0
34648,id_7393,Barcelona,"['Mediterranean', 'Spanish']",7394.0,3.0,2.0,79.0,"[['Average', 'Great Lunch'], ['04/19/2017', '0...",/Restaurant_Review-g187497-d4149781-Reviews-La...,d4149781,...,0,0,1,0,0,0,0,2017-03-23,2017-04-19,27.0
14119,id_1347,Oporto,['Italian'],1348.0,3.0,2.0,41.0,"[['Fresh cooked Pizza', 'very Good for a shoop...",/Restaurant_Review-g189180-d10766764-Reviews-I...,d10766764,...,0,0,0,0,0,0,0,2017-01-29,2017-05-23,114.0


#### Cities

In [45]:
# для One-Hot Encoding в pandas есть готовая функция - get_dummies. Особенно радует параметр dummy_na

def preproc_cities(df):

    # Прежде, чем заменять признак 'City' dummy-переменными, сохраним на будущее некоторую информацию.
    # Соберем среднее значение критерия Ranking по городам
    stats_by_city = df.groupby(['City']).median()   
    
    # Для каждой строчки добавим признак с отклонением от среднего значения соответствующего города
    df['Ranking_Delta'] = df.apply(lambda x: x['Ranking'] - stats_by_city['Ranking'][x['City']], axis=1)
    df['Cuisine_Style_Count_Delta'] = df.apply(lambda x: x['Cuisine Style Count'] - stats_by_city['Cuisine Style Count'][x['City']], axis=1)
    df['Price_Range_Delta'] = df.apply(lambda x: x['Price Range'] - stats_by_city['Price Range'][x['City']], axis=1)
    df['Reviews_Number_City_Median_Delta'] = df.apply(lambda x: x['Number of Reviews'] - stats_by_city['Number of Reviews'][x['City']], axis=1)

    df = pd.get_dummies(df, columns=['City'], dummy_na=True)
    return df

data = preproc_cities(data)

In [46]:
data.sample(5)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,City_Oporto,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan
28812,id_6391,"['Bar', 'Pub', 'Gastropub']",6400.0,4.0,2.0,45.0,"[['Busy', 'Pre theatre'], ['12/22/2017', '10/1...",/Restaurant_Review-g186338-d4155516-Reviews-Th...,d4155516,0,...,0,0,0,0,0,0,0,0,0,0
38048,id_6965,['Spanish'],6966.0,3.0,2.0,118.0,"[['Drinks with a view', 'Nice beach view'], ['...",/Restaurant_Review-g187497-d4266811-Reviews-Vi...,d4266811,0,...,0,0,0,0,0,0,0,0,0,0
31330,id_2623,"['Dutch', 'Bar', 'Cafe', 'European', 'Pub']",2630.0,3.5,2.0,55.0,"[['Great meeting place.', 'Dutch hospitality']...",/Restaurant_Review-g188590-d2700026-Reviews-Sm...,d2700026,0,...,0,0,0,0,0,0,0,0,0,0
6304,id_5592,,5593.0,4.0,2.0,22.0,"[['Yummy cakes!'], ['10/08/2017']]",/Restaurant_Review-g187147-d11716611-Reviews-S...,d11716611,0,...,0,0,1,0,0,0,0,0,0,0
2079,id_1116,"['Polish', 'European']",1118.0,5.0,2.0,0.0,"[['Cafe like Alice in Wonderland'], ['09/16/20...",/Restaurant_Review-g274856-d12792802-Reviews-K...,d12792802,1,...,0,0,0,0,0,0,0,1,0,0


### Это "жжж" неспроста

В датасете у нас есть ссылка страничку ресторана в системе TripAdrisor. Рассмотрим несколько страниц и попробуем выделить потенциально полезную нам информацию.

Опишем необходимые для парсинга функции

In [47]:
import requests

from lxml import html
from bs4 import BeautifulSoup
from multiprocessing import Pool
from datetime import datetime

TRIP_ADVISOR_URL_TEMPLATE = 'https://www.tripadvisor.com{}'

def parse_ratings_and_reviews(node, result):
    rating_block = node.find('div').findAll('div', recursive=False)[2]
    rating_block = rating_block.findAll('div', recursive=False)
    if len(rating_block) < 2:
        return

    ratings = rating_block[1].findAll('div')
    for rating in ratings:
        spans = rating.findAll('span', recursive=False)
        title = spans[1].text.lower()
        value = spans[2].find('span').attrs['class'][1].split('_')[1]
        result[title] = int(value)

def parse_location_and_contact(node):
    location_block = node.find('div').find('div')
    location_block = location_block.findAll('div', recursive=False)[1]
    distance_el = location_block.find('b')
    if distance_el is None:
        return np.NaN
    return float(distance_el.text.split()[0])

def parse_details_block(node, result):
    if node is None:
        return

    result['is_verified'] = 1 if node.find('span', {'class': 'ui_icon verified-checkmark'}) is not None else 0
    result['has_phone_number'] = 1 if node.find('a', string='+ Add phone number') is None else 0
    result['has_hours'] = 1 if node.find('a', string='+ Add hours') is None else 0
    result['has_website'] = 1 if node.find('a', string='+ Add website') is None else 0
    result['has_menu'] = 1 if node.find('a', string='Menu') is not None else 0
        

def collect_page_data(html, result):
    soup = BeautifulSoup(html)
    overview_tabs = soup.find('div', {'data-tab': 'TABS_OVERVIEW'})
    if overview_tabs is None:
        return

    overview_columns = overview_tabs.findAll('div', {'class':'ui_column'})
    parse_ratings_and_reviews(overview_columns[0], result)
    parse_details_block(overview_columns[1], result)

    result['distance'] = parse_location_and_contact(overview_columns[2])
    result['has_tcAward'] = 1 if soup.find('img', {'class': 'tcAward'}) is not None else 0

def current_time():
    return datetime.now().strftime("%H:%M:%S")

def get_id_from_url(url):
    return url.split('-')[2]

def parse_site_page(url):
    result = {}
    result['id_ta'] = get_id_from_url(url)
    ta_url = TRIP_ADVISOR_URL_TEMPLATE.format(url)
    print(url)
    r = requests.get(ta_url, stream=True)
    print('Done')
    collect_page_data(r.text, result)
    return result

def process_ta_urls(x):
    return x.apply(parse_site_page)

In [48]:
def parallelize_processing(df, func, n_cores=8):
    pool = Pool(n_cores)
    df_split = np.array_split(df, n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# print("Started at:", current_time())

# start = 0
# step = 5000
# stop = len(data)
# n_cores = 16

# for i in range(start, stop, step):
#     print('[{}] Getting records {}:{}'.format(current_time(), i, i + step))
#     pages_data = parallelize_processing(data['URL_TA'][i:i+step], process_ta_urls, n_cores=n_cores)
#     site_df = pd.DataFrame(pages_data.values.tolist())
#     site_df.to_csv('ta_data/data_{}-{}.csv'.format(i, i+step))    

# print("Finished at:", current_time())

Все было прекрасно, но jupyther notebook вешается при попытке обработать все ссылки.

Вынесем код в отдельный скрипт и запустим.

![](https://github.com/dkataiev/skillfactory_rds/raw/master/resources/img/several-song-filled-hours-later.jpg)

Выгрузим ссылки на страницы в отдельные файлы для работы граббера.

In [50]:
# start = 0
# step = 5000
# stop = len(df)

# for i in range(start, stop, step):
#     df[['ID_TA','URL_TA']][i:i+step].to_csv('./data/urls/urls_{}-{}.csv'.format(i, i+step), header=False)

Превратим набор JSON-файлов в CSV-формат для дальнейшего удобства рабты с ними.

In [59]:
# import json

# chunks = []

# for dirname, _, filenames in os.walk(DATA_DIR + '/ta_data'):
#     for filename in filenames:
#         if not filename.endswith('.json'):
#             continue

#         with open(os.path.join(dirname, filename)) as f:
#             data_chunk = json.load(f)
#             chunks.append(pd.DataFrame(data_chunk))

# ta_df = pd.concat(chunks, ignore_index=True)
# ta_df.info()
# ta_df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                40000 non-null  object 
 1   ta_id             40000 non-null  object 
 2   ta_url            40000 non-null  object 
 3   food              27153 non-null  float64
 4   service           27400 non-null  float64
 5   value             27355 non-null  float64
 6   is_verified       36526 non-null  float64
 7   has_phone_number  36526 non-null  float64
 8   has_hours         36526 non-null  float64
 9   has_website       36526 non-null  float64
 10  has_menu          36526 non-null  float64
 11  distance          28899 non-null  float64
 12  has_tcAward       36526 non-null  float64
 13  atmosphere        14147 non-null  float64
dtypes: float64(11), object(3)
memory usage: 4.3+ MB


Unnamed: 0,id,ta_id,ta_url,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere
11344,6344,d3774113,/Restaurant_Review-g187323-d3774113-Reviews-Sy...,40.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,3.8,0.0,45.0
19557,19557,d3316388,/Restaurant_Review-g274707-d3316388-Reviews-Bi...,45.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,0.7,1.0,45.0
14508,9508,d716275,/Restaurant_Review-g187849-d716275-Reviews-La_...,35.0,35.0,30.0,0.0,1.0,1.0,1.0,0.0,0.4,0.0,40.0
5739,739,d8602510,/Restaurant_Review-g274873-d8602510-Reviews-To...,,,,0.0,1.0,1.0,1.0,0.0,,0.0,
28674,38674,d696919,/Restaurant_Review-g188590-d696919-Reviews-De_...,40.0,40.0,35.0,0.0,1.0,1.0,1.0,0.0,0.3,0.0,40.0
31107,21107,d3603158,/Restaurant_Review-g187497-d3603158-Reviews-Pe...,30.0,30.0,30.0,0.0,1.0,1.0,1.0,0.0,0.3,0.0,35.0
510,30510,d10547208,/Restaurant_Review-g187147-d10547208-Reviews-G...,,,,,,,,,,,
39638,14638,d4498891,/Restaurant_Review-g187147-d4498891-Reviews-Le...,,,,,,,,,,,
38758,13758,d3331957,/Restaurant_Review-g187147-d3331957-Reviews-Da...,,,,,,,,,,,
29502,39502,d2429357,/Restaurant_Review-g186338-d2429357-Reviews-Gr...,45.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.1,0.0,30.0


### Почистим полученные данные и подготовим их к дальнейшей работе.

#### Заполним пропуски

Признаки 
* food
* service
* value
* atmosphere 

содержат оценку в конкретной категории. На их основе мы вычислим суммарный рейтинг, поэтому забьем пропуски нулями.

Слудующие признаки содержат номинативные показатели, нет значения - нет показателя, забиваем нолями.

* is_verified
* has_phone_number
* has_hours
* has_website
* has_menu
* has_tcAward

In [60]:
# ta_df.fillna({
#     'food':0,
#     'service':0,
#     'value':0,
#     'atmosphere':0,
#     'is_verified':0,
#     'has_phone_number':0,
#     'has_hours':0,
#     'has_website':0,
#     'has_menu':0,
#     'has_tcAward':0},
#     inplace=True)

# ta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                40000 non-null  object 
 1   ta_id             40000 non-null  object 
 2   ta_url            40000 non-null  object 
 3   food              40000 non-null  float64
 4   service           40000 non-null  float64
 5   value             40000 non-null  float64
 6   is_verified       40000 non-null  float64
 7   has_phone_number  40000 non-null  float64
 8   has_hours         40000 non-null  float64
 9   has_website       40000 non-null  float64
 10  has_menu          40000 non-null  float64
 11  distance          28899 non-null  float64
 12  has_tcAward       40000 non-null  float64
 13  atmosphere        40000 non-null  float64
dtypes: float64(11), object(3)
memory usage: 4.3+ MB


С признаком 'distance' все немного сложнее, это раастояние от некоего культурного центра.

Пометим записи, где он отсутствует новым признаком а потом заменим пропуски на -1 что бы отличать от реального расстояния

In [61]:
# ta_df['distance_isNAN'] = pd.isna(ta_df['distance']).astype('uint8')
# ta_df.fillna({'distance':-1},inplace=True)
# ta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                40000 non-null  object 
 1   ta_id             40000 non-null  object 
 2   ta_url            40000 non-null  object 
 3   food              40000 non-null  float64
 4   service           40000 non-null  float64
 5   value             40000 non-null  float64
 6   is_verified       40000 non-null  float64
 7   has_phone_number  40000 non-null  float64
 8   has_hours         40000 non-null  float64
 9   has_website       40000 non-null  float64
 10  has_menu          40000 non-null  float64
 11  distance          40000 non-null  float64
 12  has_tcAward       40000 non-null  float64
 13  atmosphere        40000 non-null  float64
 14  distance_isNAN    40000 non-null  uint8  
dtypes: float64(11), object(3), uint8(1)
memory usage: 4.3+ MB


In [62]:
ta_df.sample(10)

Unnamed: 0,id,ta_id,ta_url,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN
8114,3114,d9596649,/Restaurant_Review-g274887-d9596649-Reviews-Ri...,40.0,40.0,45.0,0.0,1.0,1.0,1.0,0.0,4.5,0.0,0.0,0
3135,33135,d10700630,/Restaurant_Review-g189180-d10700630-Reviews-C...,45.0,45.0,40.0,0.0,1.0,1.0,1.0,0.0,0.2,0.0,0.0,0
27969,37969,d717324,/Restaurant_Review-g187514-d717324-Reviews-Lha...,40.0,40.0,30.0,0.0,1.0,1.0,1.0,0.0,0.1,0.0,40.0,0
18887,18887,d10671022,/Restaurant_Review-g189541-d10671022-Reviews-M...,0.0,40.0,45.0,0.0,1.0,1.0,1.0,0.0,0.3,0.0,0.0,0
32060,22060,d8006236,/Restaurant_Review-g274772-d8006236-Reviews-Pi...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,4.1,0.0,0.0,0
24836,29836,d7246438,/Restaurant_Review-g274707-d7246438-Reviews-Kf...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1
17122,17122,d4733344,/Restaurant_Review-g187849-d4733344-Reviews-Bl...,40.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,45.0,1
3778,33778,d5774989,/Restaurant_Review-g187147-d5774989-Reviews-Bl...,30.0,35.0,25.0,0.0,1.0,1.0,1.0,0.0,0.5,0.0,35.0,0
16204,16204,d5122694,/Restaurant_Review-g186338-d5122694-Reviews-Fo...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.3,0.0,0.0,0
29884,39884,d8177978,/Restaurant_Review-g187497-d8177978-Reviews-Di...,40.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,0.2,1.0,0.0,0


Добавим суммарный рейтинг на основе следующих признаков:

* food
* service
* value
* atmosphere

In [63]:
# ta_df['ratings_summary'] = ta_df.apply(lambda x: x['food'] + x['service'] + x['value'] + x['atmosphere'], axis=1)
# ta_df.sample(5)

Unnamed: 0,id,ta_id,ta_url,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
31175,21175,d5959197,/Restaurant_Review-g189180-d5959197-Reviews-Ce...,40.0,35.0,35.0,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1,110.0
8175,3175,d3324228,/Restaurant_Review-g190454-d3324228-Reviews-Ap...,45.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.6,1.0,40.0,0,175.0
20870,25870,d7139984,/Restaurant_Review-g186338-d7139984-Reviews-Ro...,40.0,40.0,35.0,0.0,1.0,1.0,1.0,0.0,0.8,1.0,0.0,0,115.0
18803,18803,d2234091,/Restaurant_Review-g187323-d2234091-Reviews-Me...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.4,0.0,0.0,0,0.0
30565,20565,d1010644,/Restaurant_Review-g187791-d1010644-Reviews-Cl...,40.0,40.0,35.0,0.0,1.0,1.0,1.0,0.0,0.1,0.0,35.0,0,150.0


Удалим ненужные нам колонки

In [64]:
# ta_df.drop(columns=['id', 'ta_url'], inplace=True)
# ta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ta_id             40000 non-null  object 
 1   food              40000 non-null  float64
 2   service           40000 non-null  float64
 3   value             40000 non-null  float64
 4   is_verified       40000 non-null  float64
 5   has_phone_number  40000 non-null  float64
 6   has_hours         40000 non-null  float64
 7   has_website       40000 non-null  float64
 8   has_menu          40000 non-null  float64
 9   distance          40000 non-null  float64
 10  has_tcAward       40000 non-null  float64
 11  atmosphere        40000 non-null  float64
 12  distance_isNAN    40000 non-null  uint8  
 13  ratings_summary   40000 non-null  float64
dtypes: float64(12), object(1), uint8(1)
memory usage: 4.0+ MB


Сохраним подготовленные данные в формате csv.

In [65]:
# ta_df.to_csv('./data/ta_data.csv', index=False)

Далее будем работать с данными сайта используя выгруженный csv файл.

In [66]:
ta_df = pd.read_csv(DATA_DIR + '/ta_data.csv')
ta_df.info()
ta_df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ta_id             40000 non-null  object 
 1   food              40000 non-null  float64
 2   service           40000 non-null  float64
 3   value             40000 non-null  float64
 4   is_verified       40000 non-null  float64
 5   has_phone_number  40000 non-null  float64
 6   has_hours         40000 non-null  float64
 7   has_website       40000 non-null  float64
 8   has_menu          40000 non-null  float64
 9   distance          40000 non-null  float64
 10  has_tcAward       40000 non-null  float64
 11  atmosphere        40000 non-null  float64
 12  distance_isNAN    40000 non-null  int64  
 13  ratings_summary   40000 non-null  float64
dtypes: float64(12), int64(1), object(1)
memory usage: 4.3+ MB


Unnamed: 0,ta_id,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
11910,d4427973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1,0.0
9139,d7622906,45.0,45.0,45.0,0.0,1.0,1.0,1.0,0.0,0.4,0.0,0.0,0,135.0
15405,d10758185,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.1,0.0,0.0,0,0.0
37805,d12883647,40.0,40.0,35.0,0.0,1.0,1.0,1.0,0.0,0.4,1.0,0.0,0,115.0
27096,d2100039,40.0,35.0,40.0,0.0,1.0,1.0,1.0,0.0,0.1,1.0,35.0,0,150.0


Добавим данные полученные с сайта к основному датасету.

In [67]:
def add_site_data(df):
    return pd.merge(df, ta_df, left_on='ID_TA', right_on='ta_id')

data = add_site_data(data)

In [68]:
data.info()
data.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Columns: 111 entries, Restaurant_id to ratings_summary
dtypes: datetime64[ns](2), float64(20), int64(48), object(6), uint8(35)
memory usage: 24.9+ MB


Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
33448,id_8786,,8788.0,4.5,2.0,2.0,"[['Delicious Take Out'], ['10/25/2014']]",/Restaurant_Review-g187147-d7177191-Reviews-Se...,d7177191,0,...,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1,0.0
31613,id_11277,"['French', 'International', 'Delicatessen', 'S...",11279.0,3.5,1.0,18.0,"[['Sometimes good, sometimes terrible'], ['07/...",/Restaurant_Review-g187147-d8463035-Reviews-Sh...,d8463035,0,...,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1,105.0
15533,id_2758,,2760.0,5.0,2.0,9.0,"[['Great service', 'Good coffee, great cake'],...",/Restaurant_Review-g187323-d8621310-Reviews-Ka...,d8621310,0,...,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1,130.0
18020,id_617,,618.0,4.0,2.0,19.0,"[['Great buzz, al fresco dining and good food'...",/Restaurant_Review-g189852-d10844929-Reviews-M...,d10844929,0,...,0.0,1.0,1.0,1.0,0.0,0.7,0.0,0.0,0,125.0
18979,id_5403,"['Bar', 'British', 'Pub']",5412.0,3.5,2.0,189.0,"[['Nice pub fine food and good atmosphere', 'E...",/Restaurant_Review-g186338-d2704239-Reviews-Th...,d2704239,0,...,0.0,1.0,1.0,1.0,0.0,0.6,0.0,35.0,0,140.0


Удалим все признаки, не являющиесь числовыми.

In [69]:
object_cols = list(data.select_dtypes(include=['object', 'datetime64[ns]', 'timedelta64[ns]']).columns)
data = data.drop(labels=object_cols, axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Columns: 103 entries, Ranking to ratings_summary
dtypes: float64(20), int64(48), uint8(35)
memory usage: 22.4 MB


### Нормализация данных

In [70]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

columns_for_scaling = [
    'Number of Reviews', 
    'Ranking', 
    'Review_Time_Delta', 
    'Ranking_Delta', 
    'Reviews_Number_City_Median_Delta', 
    'Cuisine_Style_Count_Delta', 
    'Price_Range_Delta', 
    'Reviews_Number_City_Median_Delta',
    'food',
    'service',
    'value',
    'distance',
    'atmosphere',
    'ratings_summary'
    ]

scaled_df = data[columns_for_scaling]
scaled_df.info()

scaler = StandardScaler()
scaled = scaler.fit_transform(scaled_df)
scaled_df = pd.DataFrame(scaled)
scaled_df.columns = columns_for_scaling

for col in columns_for_scaling:
    data[col] = scaled_df[col]

data.sample(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Number of Reviews                 40040 non-null  float64
 1   Ranking                           40040 non-null  float64
 2   Review_Time_Delta                 40040 non-null  float64
 3   Ranking_Delta                     40040 non-null  float64
 4   Reviews_Number_City_Median_Delta  40040 non-null  float64
 5   Cuisine_Style_Count_Delta         40040 non-null  int64  
 6   Price_Range_Delta                 40040 non-null  float64
 7   Reviews_Number_City_Median_Delta  40040 non-null  float64
 8   food                              40040 non-null  float64
 9   service                           40040 non-null  float64
 10  value                             40040 non-null  float64
 11  distance                          40040 non-null  float64
 12  atmo

Unnamed: 0,Ranking,Rating,Price Range,Number of Reviews,Number_of_Reviews_isNAN,Price_Range_isNAN,Cuisine Style Count,Cuisine_Style_isNAN,Cuisine_Mediterranean,Cuisine_Seafood,...,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
3798,3.08775,3.0,2.0,0.831361,0,0,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,-0.152418,0.0,0.874726,0,0.445163
20213,-0.462093,4.0,2.0,0.021248,0,0,5,0,1,0,...,0.0,1.0,1.0,1.0,0.0,0.527898,0.0,1.4092,0,1.036431
11199,0.24275,4.0,2.0,-0.395978,0,1,1,0,1,0,...,0.0,1.0,1.0,1.0,0.0,0.722274,0.0,-0.728694,0,-1.402547
30556,0.183137,4.5,2.0,-0.312533,0,0,2,0,0,1,...,0.0,0.0,0.0,0.0,0.0,-1.221487,0.0,-0.728694,1,-1.402547
16494,-0.149459,4.5,2.0,-0.079582,0,0,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,-1.221487,0.0,1.4092,1,1.258156
24840,2.44306,3.0,2.0,-0.392501,0,1,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,0.333522,0.0,-0.728694,0,-1.402547
16884,-0.55003,4.0,2.0,-0.357732,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,-1.221487,0.0,-0.728694,1,0.371255
30297,-0.069075,5.0,1.0,-0.284717,0,0,6,0,1,0,...,0.0,1.0,1.0,1.0,0.0,0.236334,0.0,-0.728694,0,0.666889
27028,0.172886,4.5,2.0,-0.399454,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,2.568848,0.0,-0.728694,0,-1.402547
3062,-0.894495,4.0,2.0,-0.395978,0,0,4,0,1,1,...,0.0,1.0,1.0,1.0,0.0,-0.152418,0.0,-0.728694,0,0.666889


# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [71]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = data.drop(['Rating'], axis = 1)
y = data['Rating']

KeyError: "['Restaurant_id'] not found in axis"

In [None]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [None]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Создаём, обучаем и тестируем модель

In [None]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [None]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

In [None]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))