![](https://github.com/dkataiev/skillfactory_rds/raw/master/resources/img/tripadvisor-logo.png)
# Predict TripAdvisor Rating
## В этом соревновании нам предстоит предсказать рейтинг ресторана в TripAdvisor

# Загрузка Pandas и очистка данных

In [111]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

from sklearn.model_selection import train_test_split

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/.DS_Store
./data/ta_data.csv
./data/main_task.csv
./data/ta_data/.DS_Store
./data/ta_data/urls_30000-35000_data.json
./data/ta_data/urls_0-5000_data.json
./data/ta_data/urls_5000-10000_data.json
./data/ta_data/urls_15000-20000_data.json
./data/ta_data/urls_25000-30000_data.json
./data/ta_data/urls_35000-40000_data.json
./data/ta_data/urls_20000-25000_data.json
./data/ta_data/urls_10000-15000_data.json
./data/urls/urls_20000-25000.csv
./data/urls/urls_5000-10000.csv
./data/urls/urls_15000-20000.csv
./data/urls/urls_25000-30000.csv
./data/urls/.DS_Store
./data/urls/urls_35000-40000.csv
./data/urls/urls_10000-15000.csv
./data/urls/urls_0-5000.csv
./data/urls/urls_30000-35000.csv


In [112]:
DATA_DIR = './data'

data = pd.read_csv(DATA_DIR + '/main_task.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


#### Подробнее по признакам:
* `City`: Город 
* `Cuisine Style`: Кухня
* `Ranking`: Ранг ресторана относительно других ресторанов в этом городе
* `Price Range`: Цены в ресторане в 3 категориях
* `Number of Reviews`: Количество отзывов
* `Reviews`: 2 последних отзыва и даты этих отзывов
* `URL_TA`: страница ресторана на 'www.tripadvisor.com' 
* `ID_TA`: ID ресторана в TripAdvisor
* `Rating`: Рейтинг ресторана

In [113]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
8185,id_3,Madrid,"['International', 'Mediterranean', 'European',...",4.0,5.0,$$ - $$$,302.0,"[['Small place with big flavours', 'The best d...",/Restaurant_Review-g187514-d11896546-Reviews-A...,d11896546
7169,id_400,Rome,"['Italian', 'Pizza', 'Mediterranean', 'Vegetar...",401.0,4.0,$$ - $$$,953.0,"[['Good Veal Chops and Tiramisu', ""Two More Ve...",/Restaurant_Review-g187791-d790414-Reviews-La_...,d790414
4519,id_750,Oslo,"['Japanese', 'Sushi', 'Asian', 'Thai', 'Vegeta...",751.0,3.5,$$ - $$$,33.0,"[['Good Calamares', 'Nice place, but avoid the...",/Restaurant_Review-g190479-d8460043-Reviews-Mi...,d8460043
25171,id_3346,Madrid,,3348.0,3.5,,82.0,"[['Expensive', 'Excellent Andalusian food ... ...",/Restaurant_Review-g187514-d8874048-Reviews-Bi...,d8874048
31982,id_2144,Paris,"['French', 'Cafe', 'European', 'Vegetarian Fri...",2145.0,4.5,$,79.0,"[['Breakfast of note', 'One of the best breakf...",/Restaurant_Review-g187147-d9762741-Reviews-Ba...,d9762741


#### Number of Reviews

In [114]:
def preproc_number_of_reviews(df):
    # Вынесем информацию о наличии пропуска как отдельный признак
    df['Number_of_Reviews_isNAN'] = pd.isna(df['Number of Reviews']).astype('uint8')
    # Далее заполняем пропуски 0, вы можете попробовать заполнением средним или средним по городу и тд...
    df['Number of Reviews'].fillna(0, inplace=True)
    return df

data = preproc_number_of_reviews(data)

#### Price Range

In [115]:
data['Price Range'].value_counts()

$$ - $$$    18412
$            6279
$$$$         1423
Name: Price Range, dtype: int64

По описанию 'Price Range' это - Цены в ресторане.
Их можно поставить по возрастанию (значит это не категориальный признак). А это значит, что их можно заменить последовательными числами, например 1,2,3

In [116]:
def preproc_price_range(df):
    # Заменим категориальный признак числами
    df['Price Range'].replace(to_replace={ '$': 1, '$$ - $$$': 2, '$$$$': 3}, inplace=True)
    # Сохраним данные о записях, где не был указан уровень цен.
    df['Price_Range_isNAN'] = pd.isna(df['Price Range']).astype('uint8')
    # Заполним пропуски, пока что воспользуемся просто медианой.
    df['Price Range'].fillna(df['Price Range'].median(), inplace=True)
    return df

data = preproc_price_range(data)

In [117]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,Price_Range_isNAN
20550,id_10326,London,"['Indian', 'Asian', 'Thai', 'Delicatessen', 'H...",10335.0,4.5,1.0,3.0,"[['UNEXPECTED SURPRISE', 'Delicious, healthy f...",/Restaurant_Review-g186338-d6936712-Reviews-Th...,d6936712,0,0
11340,id_104,Rome,"['Italian', 'International', 'Mediterranean', ...",105.0,4.5,2.0,452.0,"[['Unique tastes', 'Dinner'], ['12/24/2017', '...",/Restaurant_Review-g187791-d7163010-Reviews-Ob...,d7163010,0,0
1448,id_825,Krakow,,826.0,4.5,2.0,10.0,"[['Incredible Value', 'Great choice not only f...",/Restaurant_Review-g274772-d10872699-Reviews-C...,d10872699,0,1
21634,id_1967,Hamburg,,1971.0,4.0,2.0,3.0,"[[], []]",/Restaurant_Review-g187331-d5528095-Reviews-Cr...,d5528095,0,1
33734,id_5481,Berlin,['Asian'],5483.0,3.5,2.0,4.0,"[['Deliciously Japanese restaurant'], ['07/28/...",/Restaurant_Review-g187323-d6973151-Reviews-Fo...,d6973151,0,1


#### Cuisine Style

Для начала добавим к записям признак с колличеством представленых типов кухонь. Если в данных отсутствует информация о типах кухонь, то считаем, что в этом ресторане предлагается только один тип кухни. Так же пометим записи, где не укзан тип кухни.

In [118]:
def string_to_list(s):
    return s[1:-1].replace("'", "").split(', ')

def count_styles(s):
    return len(string_to_list(s)) if s is not np.nan else 1

def get_cuisines_list(df):
    cuisines = set()
    for cuisine in df['Cuisine Style'].dropna().values:
        cuisines.update(string_to_list(cuisine))
    return cuisines

def update_cuisines_rating(cuisine_styles, cuisines_ratings):
        for style in cuisine_styles:
            cuisines_ratings[style] = cuisines_ratings[style] + 1

def calc_cuisines_ratings(df, cuisines):
    cuisines_ratings = {}
    for cuisine in cuisines:
        cuisines_ratings[cuisine] = 0
    
    for cuisine in df['Cuisine Style'].dropna().values:
        update_cuisines_rating(string_to_list(cuisine), cuisines_ratings)
    return cuisines_ratings

def preproc_cuisine_style(df):
    # добавим к записям признак с колличеством представленых типов кухонь
    df['Cuisine Style Count'] = df['Cuisine Style'].apply(count_styles)
    # пометим записи, где не укзан тип кухни.
    df['Cuisine_Style_isNAN'] = pd.isna(df['Cuisine Style']).astype('uint8')

    # отдельными признаками добавим типы кухни, которые представлены хотя бы в 1% ресторанов.
    # соберем список всех кухонь представленых в датасете.
    cuisines_list = get_cuisines_list(df)

    # соберем данные о том, в скольки ресторанах представлена каждая кухня.
    cuisines_rating = calc_cuisines_ratings(df, cuisines_list)

    # выделим две группы кухонь те которые представлены хотя бы в 1% ресторанов и остальные
    top_cuisines = []
    low_cuisines = []
    for cuisine in cuisines_rating:
        if cuisines_rating[cuisine] > len(data) * .01:
            top_cuisines.append(cuisine)
        else:
            low_cuisines.append(cuisine)

    # создадим отдельные признаки для кухонь представленых хотя бы в 1%
    def top_cuisine_present(x):
        if pd.isna(x): 
            return 0
        return 1 if cuisine in x else 0

    for cuisine in top_cuisines:
        dummy = df['Cuisine Style'].apply(top_cuisine_present)
        df['Cuisine_' + cuisine] = dummy

    # для остальных добавим признак Cuisine_Other
    def is_low_cuisine_present(x):
        if pd.isna(x): 
            return 0
        for cuisine in low_cuisines:
            if cuisine in x:
                return 1
        return 0

    df['Cuisine_Other'] = df['Cuisine Style'].apply(is_low_cuisine_present)
    
    return df

data = preproc_cuisine_style(data)

In [119]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,...,Cuisine_French,Cuisine_Central European,Cuisine_Bar,Cuisine_Vegetarian Friendly,Cuisine_Fast Food,Cuisine_Gastropub,Cuisine_Cafe,Cuisine_International,Cuisine_Turkish,Cuisine_Other
7376,id_1359,Prague,"['European', 'Czech']",1360.0,4.5,2.0,11.0,"[['Excellent food', 'delicious food, good serv...",/Restaurant_Review-g274707-d7123243-Reviews-As...,d7123243,...,0,0,0,0,0,0,0,0,0,0
20184,id_2532,Madrid,,2534.0,4.5,2.0,16.0,"[[], []]",/Restaurant_Review-g187514-d11914658-Reviews-L...,d11914658,...,0,0,0,0,0,0,0,0,0,0
24857,id_2776,Brussels,"['French', 'Dutch', 'European']",2777.0,2.5,3.0,2.0,[['Great for Group events and a wondeful ceil....,/Restaurant_Review-g188644-d1468200-Reviews-Sa...,d1468200,...,1,0,0,0,0,0,0,0,0,1
37046,id_136,Stockholm,"['European', 'Swedish', 'Scandinavian', 'Veget...",137.0,4.0,2.0,458.0,"[['Good food, friendly service', 'Reliable res...",/Restaurant_Review-g189852-d1626458-Reviews-Dr...,d1626458,...,0,0,0,1,0,0,0,0,0,1
29873,id_11312,Paris,"['French', 'Lebanese', 'Mediterranean']",11314.0,3.5,2.0,27.0,"[[], []]",/Restaurant_Review-g187147-d1008641-Reviews-Pa...,d1008641,...,1,0,0,0,0,0,0,0,0,1


#### Reviews

Добавим признаки с датами последнего и предпоследнего отзывов, а также колличество дней между ними.

In [120]:
def get_dates_list(x):
    dates = x.split('], [')[1]
    dates = dates[:-2]
    dates = dates.replace("'", "")
    return dates.split(', ')

def get_penultimate_review(x):
    dates = get_dates_list(x)
    return pd.to_datetime(dates[-1]) if len(dates) >= 1 else np.NaN

def get_last_review(x):
    dates = get_dates_list(x)
    return pd.to_datetime(dates[0]) if len(dates) >= 1 else np.NaN

def preproc_reviews(df):
    # заполним пропуски пустыми значениями
    df['Reviews'].fillna('[[], []]', inplace=True)

    # отметим записи коментарии в которых отсутствуют
    df['Reviews_NA'] = (df['Reviews'] == '[[], []]').astype(int)

    # создадим признаки с датами двух последних коментариев
    df['Penultimate_Review'] = df['Reviews'].apply(lambda x: get_penultimate_review(x))
    df['Last_Review'] = df['Reviews'].apply(lambda x: get_last_review(x))

    # создадим признак количеством дней прошедшими между двумя последними коментариями
    df = df.assign(Review_Time_Delta = lambda x: (x['Last_Review'] - x['Penultimate_Review']).dt.days)

    # звполним пустые записи
    df['Review_Time_Delta'] = df['Review_Time_Delta'].fillna(0)
    return df

data = preproc_reviews(data)

In [121]:
data.sample(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,...,Cuisine_Fast Food,Cuisine_Gastropub,Cuisine_Cafe,Cuisine_International,Cuisine_Turkish,Cuisine_Other,Reviews_NA,Penultimate_Review,Last_Review,Review_Time_Delta
16873,id_1656,Madrid,"['Irish', 'Bar', 'European', 'Pub', 'Internati...",1658.0,4.0,2.0,223.0,"[['Great atmosphere', 'Great atmosphere and th...",/Restaurant_Review-g187514-d2490533-Reviews-Th...,d2490533,...,0,0,0,1,0,1,0,2017-11-19,2017-12-18,29.0
15327,id_1163,London,"['Caribbean', 'Barbecue', 'Jamaican', 'Vegetar...",1165.0,4.0,2.0,921.0,"[['Great food, atmosphere and service - will.....",/Restaurant_Review-g186338-d9558826-Reviews-Le...,d9558826,...,0,0,0,0,0,1,0,2017-12-28,2018-01-03,6.0
39667,id_638,Prague,"['Bar', 'European', 'Central European']",639.0,4.0,2.0,207.0,"[['Nice place for the sunset', 'Snacks, drinks...",/Restaurant_Review-g274707-d4209851-Reviews-Du...,d4209851,...,0,0,0,0,0,0,0,2017-07-16,2017-10-06,82.0
10673,id_1350,Barcelona,"['Mediterranean', 'European', 'Spanish', 'Vege...",1351.0,4.0,2.0,194.0,"[['Good weekdays menu, improvable variety', 'p...",/Restaurant_Review-g187497-d8764109-Reviews-Sa...,d8764109,...,0,0,0,0,0,0,0,2017-06-19,2017-11-08,142.0
1184,id_15134,London,,15145.0,3.5,2.0,4.0,"[['Just because is Subway...', 'Best Customer ...",/Restaurant_Review-g186338-d4919421-Reviews-Su...,d4919421,...,0,0,0,0,0,0,0,2016-10-18,2017-03-11,144.0


Попробуем по тексту двух последних коментариев понять были ли они положительными либо отрицательными.

In [122]:
data['Reviews'].sample(5)

395      [['Guinness!', 'Great Irish Pub in the heart o...
7849     [['A hidden gem with a history'], ['01/14/2015']]
5574     [['Great place to come with your under 5yo', '...
24776    [['Tasty japanese noodles', 'Mixed experience'...
6604                         [['RawLove'], ['12/18/2017']]
Name: Reviews, dtype: object

In [123]:
chars_to_replace = "',.!?&-+/\""

def get_reviews_words(x):
    reviews = x.split('], [')[0]
    reviews = reviews[2:]
    reviews = reviews.split("', '")
    if len(reviews) == 1:
        return []

    words = set()
    for review in reviews:
        for ch in chars_to_replace:
            review = review.replace(ch, '')
        review = review.lower()
    return words

def count_review_words(x):
    words = get_reviews_words(x)

# для начала соберем список слов которые есть в коментариях.
# data['Reviews'].sample(10).apply(count_review_words)

# в процессе выяснилось, что подавляющее большинство коментариев положительные, пока отложим это

#### Cities

In [124]:
# для One-Hot Encoding в pandas есть готовая функция - get_dummies. Особенно радует параметр dummy_na

def preproc_cities(df):

    # Прежде, чем заменять признак 'City' dummy-переменными, сохраним на будущее некоторую информацию.
    # Соберем среднее значение критерия Ranking по городам
    stats_by_city = df.groupby(['City']).median()   
    
    # Для каждой строчки добавим признак с отклонением от среднего значения соответствующего города
    df['Ranking_Delta'] = df.apply(lambda x: x['Ranking'] - stats_by_city['Ranking'][x['City']], axis=1)
    df['Cuisine_Style_Count_Delta'] = df.apply(lambda x: x['Cuisine Style Count'] - stats_by_city['Cuisine Style Count'][x['City']], axis=1)
    df['Price_Range_Delta'] = df.apply(lambda x: x['Price Range'] - stats_by_city['Price Range'][x['City']], axis=1)
    df['Reviews_Number_City_Median_Delta'] = df.apply(lambda x: x['Number of Reviews'] - stats_by_city['Number of Reviews'][x['City']], axis=1)

    df = pd.get_dummies(df, columns=['City'], dummy_na=True)
    return df

data = preproc_cities(data)

In [125]:
data.sample(5)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,City_Oporto,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan
35802,id_88,"['Italian', 'Pizza', 'Mediterranean', 'Europea...",89.0,4.5,2.0,1165.0,"[['Excelent service and food!', 'Amazing Itali...",/Restaurant_Review-g187497-d8443930-Reviews-Re...,d8443930,0,...,0,0,0,0,0,0,0,0,0,0
33121,id_936,"['Soups', 'Vegan Options', 'Vegetarian Friendly']",937.0,4.5,1.0,26.0,"[['Perfect healthy breakfast', 'Tasty!'], ['05...",/Restaurant_Review-g190454-d7722822-Reviews-Li...,d7722822,0,...,0,0,0,0,0,0,1,0,0,0
17223,id_459,['Mexican'],460.0,3.5,2.0,6.0,"[['Very mediocre'], ['11/01/2017']]",/Restaurant_Review-g190356-d12997695-Reviews-G...,d12997695,0,...,0,0,0,0,0,0,0,0,0,0
4482,id_1102,"['European', 'Portuguese']",1107.0,4.0,1.0,9.0,"[['A very good ""Francesinha"", indeed'], ['03/3...",/Restaurant_Review-g188057-d8811565-Reviews-Kt...,d8811565,0,...,0,0,0,0,0,0,0,0,0,0
20056,id_1049,"['French', 'Belgian', 'European', 'Vegetarian ...",1050.0,3.5,2.0,499.0,"[['Nice, cosy, great loction, nice books, goo....",/Restaurant_Review-g188644-d967804-Reviews-Coo...,d967804,0,...,0,0,0,0,0,0,0,0,0,0


### Это "жжж" неспроста

В датасете у нас есть ссылка страничку ресторана в системе TripAdrisor. Рассмотрим несколько страниц и попробуем выделить потенциально полезную нам информацию.

Опишем необходимые для парсинга функции

In [126]:
import requests

from lxml import html
from bs4 import BeautifulSoup
from multiprocessing import Pool
from datetime import datetime

TRIP_ADVISOR_URL_TEMPLATE = 'https://www.tripadvisor.com{}'

def parse_ratings_and_reviews(node, result):
    rating_block = node.find('div').findAll('div', recursive=False)[2]
    rating_block = rating_block.findAll('div', recursive=False)
    if len(rating_block) < 2:
        return

    ratings = rating_block[1].findAll('div')
    for rating in ratings:
        spans = rating.findAll('span', recursive=False)
        title = spans[1].text.lower()
        value = spans[2].find('span').attrs['class'][1].split('_')[1]
        result[title] = int(value)

def parse_location_and_contact(node):
    location_block = node.find('div').find('div')
    location_block = location_block.findAll('div', recursive=False)[1]
    distance_el = location_block.find('b')
    if distance_el is None:
        return np.NaN
    return float(distance_el.text.split()[0])

def parse_details_block(node, result):
    if node is None:
        return

    result['is_verified'] = 1 if node.find('span', {'class': 'ui_icon verified-checkmark'}) is not None else 0
    result['has_phone_number'] = 1 if node.find('a', string='+ Add phone number') is None else 0
    result['has_hours'] = 1 if node.find('a', string='+ Add hours') is None else 0
    result['has_website'] = 1 if node.find('a', string='+ Add website') is None else 0
    result['has_menu'] = 1 if node.find('a', string='Menu') is not None else 0
        

def collect_page_data(html, result):
    soup = BeautifulSoup(html)
    overview_tabs = soup.find('div', {'data-tab': 'TABS_OVERVIEW'})
    if overview_tabs is None:
        return

    overview_columns = overview_tabs.findAll('div', {'class':'ui_column'})
    parse_ratings_and_reviews(overview_columns[0], result)
    parse_details_block(overview_columns[1], result)

    result['distance'] = parse_location_and_contact(overview_columns[2])
    result['has_tcAward'] = 1 if soup.find('img', {'class': 'tcAward'}) is not None else 0

def current_time():
    return datetime.now().strftime("%H:%M:%S")

def get_id_from_url(url):
    return url.split('-')[2]

def parse_site_page(url):
    result = {}
    result['id_ta'] = get_id_from_url(url)
    ta_url = TRIP_ADVISOR_URL_TEMPLATE.format(url)
    print(url)
    r = requests.get(ta_url, stream=True)
    print('Done')
    collect_page_data(r.text, result)
    return result

def process_ta_urls(x):
    return x.apply(parse_site_page)

In [127]:
def parallelize_processing(df, func, n_cores=8):
    pool = Pool(n_cores)
    df_split = np.array_split(df, n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# print("Started at:", current_time())

# start = 0
# step = 5000
# stop = len(data)
# n_cores = 16

# for i in range(start, stop, step):
#     print('[{}] Getting records {}:{}'.format(current_time(), i, i + step))
#     pages_data = parallelize_processing(data['URL_TA'][i:i+step], process_ta_urls, n_cores=n_cores)
#     site_df = pd.DataFrame(pages_data.values.tolist())
#     site_df.to_csv('ta_data/data_{}-{}.csv'.format(i, i+step))    

# print("Finished at:", current_time())

Все было прекрасно, но jupyther notebook вешается при попытке обработать все ссылки.

Вынесем код в отдельный скрипт и запустим.

![](https://github.com/dkataiev/skillfactory_rds/raw/master/resources/img/several-song-filled-hours-later.jpg)

Выгрузим ссылки на страницы в отдельные файлы для работы граббера.

In [128]:
# start = 0
# step = 5000
# stop = len(df)

# for i in range(start, stop, step):
#     df[['ID_TA','URL_TA']][i:i+step].to_csv('./data/urls/urls_{}-{}.csv'.format(i, i+step), header=False)

Превратим набор JSON-файлов в CSV-формат для дальнейшего удобства рабты с ними.

In [129]:
# import json

# chunks = []

# for dirname, _, filenames in os.walk(DATA_DIR + '/ta_data'):
#     for filename in filenames:
#         if not filename.endswith('.json'):
#             continue

#         with open(os.path.join(dirname, filename)) as f:
#             data_chunk = json.load(f)
#             chunks.append(pd.DataFrame(data_chunk))

# ta_df = pd.concat(chunks, ignore_index=True)
# ta_df.info()
# ta_df.sample(10)

### Почистим полученные данные и подготовим их к дальнейшей работе.

#### Заполним пропуски

Признаки 
* food
* service
* value
* atmosphere 

содержат оценку в конкретной категории. На их основе мы вычислим суммарный рейтинг, поэтому забьем пропуски нулями.

Слудующие признаки содержат номинативные показатели, нет значения - нет показателя, забиваем нолями.

* is_verified
* has_phone_number
* has_hours
* has_website
* has_menu
* has_tcAward

In [130]:
# ta_df.fillna({
#     'food':0,
#     'service':0,
#     'value':0,
#     'atmosphere':0,
#     'is_verified':0,
#     'has_phone_number':0,
#     'has_hours':0,
#     'has_website':0,
#     'has_menu':0,
#     'has_tcAward':0},
#     inplace=True)

# ta_df.info()

С признаком 'distance' все немного сложнее, это раастояние от некоего культурного центра.

Пометим записи, где он отсутствует новым признаком а потом заменим пропуски на -1 что бы отличать от реального расстояния

In [131]:
# ta_df['distance_isNAN'] = pd.isna(ta_df['distance']).astype('uint8')
# ta_df.fillna({'distance':-1},inplace=True)
# ta_df.info()

In [132]:
ta_df.sample(10)

Unnamed: 0,ta_id,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
1732,d12864284,30.0,35.0,30.0,0.0,1.0,1.0,1.0,0.0,1.9,0.0,0.0,0,95.0
37654,d10497791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1,0.0
39483,d6896759,50.0,50.0,50.0,0.0,1.0,1.0,1.0,0.0,1.2,1.0,0.0,0,150.0
9538,d7254835,40.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,0.2,0.0,40.0,0,160.0
33380,d12048697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1,0.0
13309,d10632835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1,0.0
16594,d8560191,45.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,1.9,0.0,0.0,0,125.0
35952,d695177,40.0,35.0,35.0,0.0,1.0,1.0,1.0,0.0,0.4,0.0,40.0,0,150.0
23192,d3961527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,1,0.0
11370,d4307654,40.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,0.8,0.0,30.0,0,150.0


Добавим суммарный рейтинг на основе следующих признаков:

* food
* service
* value
* atmosphere

In [133]:
# ta_df['ratings_summary'] = ta_df.apply(lambda x: x['food'] + x['service'] + x['value'] + x['atmosphere'], axis=1)
# ta_df.sample(5)

Удалим ненужные нам колонки

In [134]:
# ta_df.drop(columns=['id', 'ta_url'], inplace=True)
# ta_df.info()

Сохраним подготовленные данные в формате csv.

In [135]:
# ta_df.to_csv('./data/ta_data.csv', index=False)

Далее будем работать с данными сайта используя выгруженный csv файл.

In [136]:
ta_df = pd.read_csv(DATA_DIR + '/ta_data.csv')
ta_df.info()
ta_df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ta_id             40000 non-null  object 
 1   food              40000 non-null  float64
 2   service           40000 non-null  float64
 3   value             40000 non-null  float64
 4   is_verified       40000 non-null  float64
 5   has_phone_number  40000 non-null  float64
 6   has_hours         40000 non-null  float64
 7   has_website       40000 non-null  float64
 8   has_menu          40000 non-null  float64
 9   distance          40000 non-null  float64
 10  has_tcAward       40000 non-null  float64
 11  atmosphere        40000 non-null  float64
 12  distance_isNAN    40000 non-null  int64  
 13  ratings_summary   40000 non-null  float64
dtypes: float64(12), int64(1), object(1)
memory usage: 4.3+ MB


Unnamed: 0,ta_id,food,service,value,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
35318,d2708382,40.0,40.0,45.0,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,40.0,1,165.0
34252,d12828008,40.0,35.0,35.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0,110.0
29217,d11914592,40.0,40.0,40.0,0.0,1.0,1.0,1.0,0.0,0.2,0.0,0.0,0,120.0
27716,d1012142,35.0,35.0,30.0,0.0,1.0,1.0,1.0,0.0,-1.0,0.0,30.0,1,130.0
23447,d6276449,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.9,0.0,0.0,0,0.0


Добавим данные полученные с сайта к основному датасету.

In [137]:
def add_site_data(df):
    return pd.merge(df, ta_df, left_on='ID_TA', right_on='ta_id')

data = add_site_data(data)

In [138]:
data.info()
data.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Columns: 111 entries, Restaurant_id to ratings_summary
dtypes: datetime64[ns](2), float64(20), int64(48), object(6), uint8(35)
memory usage: 24.9+ MB


Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
14260,id_9887,,9889.0,4.0,2.0,3.0,"[[], []]",/Restaurant_Review-g187147-d11756483-Reviews-L...,d11756483,0,...,0.0,1.0,1.0,1.0,0.0,0.2,0.0,0.0,0,0.0
31628,id_1610,['Contemporary'],1612.0,3.5,1.0,8.0,"[['Expensive'], ['07/09/2015']]",/Restaurant_Review-g189400-d6467818-Reviews-Ch...,d6467818,0,...,0.0,1.0,1.0,1.0,0.0,0.3,0.0,0.0,0,0.0
27729,id_196,"['French', 'Bar', 'Cafe', 'European', 'Pub', '...",197.0,5.0,2.0,28.0,"[[""Nice local bar with 'living room feeling......",/Restaurant_Review-g189934-d7119620-Reviews-Ba...,d7119620,0,...,0.0,1.0,1.0,1.0,0.0,0.6,0.0,0.0,0,140.0
6476,id_1583,,1586.0,4.0,2.0,5.0,"[['Great', 'Very nice view'], ['09/10/2016', '...",/Restaurant_Review-g189852-d5976886-Reviews-19...,d5976886,0,...,0.0,1.0,1.0,1.0,0.0,0.5,0.0,0.0,0,0.0
27597,id_5693,"['French', 'European']",5694.0,3.5,2.0,182.0,"[['Good traditional food', 'Good location, goo...",/Restaurant_Review-g187147-d2049339-Reviews-La...,d2049339,0,...,0.0,1.0,1.0,1.0,0.0,0.3,0.0,0.0,0,105.0


Удалим все признаки, не являющиесь числовыми.

In [139]:
object_cols = list(data.select_dtypes(include=['object', 'datetime64[ns]', 'timedelta64[ns]']).columns)
data = data.drop(labels=object_cols, axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Columns: 103 entries, Ranking to ratings_summary
dtypes: float64(20), int64(48), uint8(35)
memory usage: 22.4 MB


### Нормализация данных

In [140]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

columns_for_scaling = [
    'Number of Reviews', 
    'Ranking', 
    'Review_Time_Delta', 
    'Ranking_Delta', 
    'Reviews_Number_City_Median_Delta', 
    'Cuisine_Style_Count_Delta', 
    'Price_Range_Delta', 
    'Reviews_Number_City_Median_Delta',
    'food',
    'service',
    'value',
    'distance',
    'atmosphere',
    'ratings_summary'
    ]

scaled_df = data[columns_for_scaling]
scaled_df.info()

scaler = StandardScaler()
scaled = scaler.fit_transform(scaled_df)
scaled_df = pd.DataFrame(scaled)
scaled_df.columns = columns_for_scaling

for col in columns_for_scaling:
    data[col] = scaled_df[col]

data.sample(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40040 entries, 0 to 40039
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Number of Reviews                 40040 non-null  float64
 1   Ranking                           40040 non-null  float64
 2   Review_Time_Delta                 40040 non-null  float64
 3   Ranking_Delta                     40040 non-null  float64
 4   Reviews_Number_City_Median_Delta  40040 non-null  float64
 5   Cuisine_Style_Count_Delta         40040 non-null  int64  
 6   Price_Range_Delta                 40040 non-null  float64
 7   Reviews_Number_City_Median_Delta  40040 non-null  float64
 8   food                              40040 non-null  float64
 9   service                           40040 non-null  float64
 10  value                             40040 non-null  float64
 11  distance                          40040 non-null  float64
 12  atmo

Unnamed: 0,Ranking,Rating,Price Range,Number of Reviews,Number_of_Reviews_isNAN,Price_Range_isNAN,Cuisine Style Count,Cuisine_Style_isNAN,Cuisine_American,Cuisine_Chinese,...,is_verified,has_phone_number,has_hours,has_website,has_menu,distance,has_tcAward,atmosphere,distance_isNAN,ratings_summary
4386,1.116454,3.0,2.0,-0.295148,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,0.722274,0.0,-0.728694,0,-0.220012
5538,-0.567024,3.5,1.0,-0.336871,0,0,1,0,0,0,...,0.0,1.0,1.0,1.0,0.0,1.694155,0.0,-0.728694,0,0.223438
12874,-0.476929,3.0,2.0,-0.333394,0,0,2,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-1.221487,0.0,-0.728694,1,-1.402547
9918,-0.429454,4.5,2.0,-0.295148,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,0.333522,0.0,-0.728694,0,0.740797
17814,-0.48637,4.5,2.0,-0.357732,0,0,3,0,0,0,...,0.0,1.0,1.0,1.0,0.0,-1.221487,0.0,-0.728694,1,0.445163
15021,-0.598315,3.5,2.0,-0.399454,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,-1.221487,0.0,-0.728694,1,-1.402547
23808,-0.259785,4.0,2.0,-0.406408,1,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,-1.221487,0.0,-0.728694,1,-1.402547
35728,-0.544905,4.5,2.0,-0.371639,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,0.041958,0.0,-0.728694,0,0.519072
15936,-0.942239,4.0,2.0,3.021794,0,0,6,0,0,0,...,0.0,1.0,1.0,1.0,0.0,-1.221487,1.0,1.4092,1,1.036431
30472,0.483902,4.5,2.0,-0.378593,0,1,1,1,0,0,...,0.0,1.0,1.0,1.0,0.0,3.443541,0.0,-0.728694,0,-1.402547


# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [141]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = data.drop(['Rating'], axis = 1)
y = data['Rating']

In [142]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [143]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Создаём, обучаем и тестируем модель

In [144]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [145]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

Округлим рейтинг до .5

In [146]:
def round_of_rating(number):
    return np.round(number * 2) / 2

y_pred = round_of_rating(y_pred)

In [147]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.1557942057942058
