Первоначальная версия датасета состоит из десяти столбцов, содержащих следующую информацию:

1. **Restaurant_id** — идентификационный номер ресторана / сети ресторанов;
2. **City** — город, в котором находится ресторан;
3. **Cuisine Style** — кухня или кухни, к которым можно отнести блюда, предлагаемые в ресторане;
4. **Ranking** — место, которое занимает данный ресторан среди всех ресторанов своего города;
5. **Rating** — рейтинг ресторана по данным TripAdvisor (именно это значение должна будет предсказывать модель);
6. **Price Range** — диапазон цен в ресторане;
7. **Number of Reviews** — количество отзывов о ресторане;
8. **Reviews** — данные о двух отзывах, которые отображаются на сайте ресторана;
9. **URL_TA** — URL страницы ресторана на TripAdvisor;
10. **ID_TA** — идентификатор ресторана в базе данных TripAdvisor.

In [161]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
import re

In [330]:
# Load the restaurant dataset from a CSV file in the working directory
# and preview its first five rows.
data = pd.read_csv('main_task_new.csv')
data.head(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


In [30]:
# Count the missing values in a dataset column

def find_null(col, df=None):
    """Print the number of missing values in a column and flag the column.

    Args:
        col: Label of the column to inspect.
        df: DataFrame to inspect. When omitted, the notebook-level ``data``
            frame is used, which keeps existing ``find_null(col)`` calls
            working unchanged.

    Returns:
        ``col`` if the column contains at least one missing value,
        otherwise ``None``.
    """
    if df is None:
        # Fall back to the global frame, as the original one-argument form did.
        df = data
    empty_values = df[col].isna().sum()
    print(f"В столбце '{col}' количество пропусков = {empty_values}.")
    return col if empty_values > 0 else None

In [15]:
# Show the Python type of the first value stored in each dataset column
for col in data.columns:
    message = f'Столбец {col} имеет тип данных - {type(data[col][0])}'
    print(message)

Столбец Restaurant_id имеет тип данных - <class 'str'>
Столбец City имеет тип данных - <class 'str'>
Столбец Cuisine Style имеет тип данных - <class 'str'>
Столбец Ranking имеет тип данных - <class 'numpy.float64'>
Столбец Rating имеет тип данных - <class 'numpy.float64'>
Столбец Price Range имеет тип данных - <class 'str'>
Столбец Number of Reviews имеет тип данных - <class 'numpy.float64'>
Столбец Reviews имеет тип данных - <class 'str'>
Столбец URL_TA имеет тип данных - <class 'str'>
Столбец ID_TA имеет тип данных - <class 'str'>


In [16]:
# Feature frame: every column except the restaurant id and the 'Rating'
# column (the value the model is meant to predict).
X = data.drop(columns=['Restaurant_id', 'Rating'])
X

Unnamed: 0,City,Cuisine Style,Ranking,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,Paris,"['European', 'French', 'International']",5570.0,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,Stockholm,,1537.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,Berlin,,3458.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963
...,...,...,...,...,...,...,...,...
39995,Milan,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,$$ - $$$,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414
39996,Paris,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,$$ - $$$,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036
39997,Stockholm,"['Japanese', 'Sushi']",1652.0,,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615
39998,Warsaw,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,$$ - $$$,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838


In [24]:
# Keep only the non-object columns: collect the label of every column whose
# dtype is object, then remove them all with a single drop call.
object_columns = [
    name for name in data.columns
    if data.loc[:, name].dtype == np.dtype('O')
]
data = data.drop(object_columns, axis=1)

In [31]:
# Collect the labels of all columns that contain missing values.
# The list is created once, before the loop, so results from earlier columns
# are not thrown away on each iteration.
empty_list = []
for col in data.columns:
    column_with_gaps = find_null(col)
    # find_null returns None for a column without gaps; skip those so the
    # list holds only real column labels.
    if column_with_gaps is not None:
        empty_list.append(column_with_gaps)

В столбце 'Ranking' количество пропусков = 0.
В столбце 'Rating' количество пропусков = 0.
В столбце 'Number of Reviews' количество пропусков = 2543.


In [33]:
# Replace the missing values in every flagged column with that column's median.
for i in empty_list:
    if i is None:
        # Defensive: ignore placeholder entries that are not column labels.
        continue
    # fillna returns a new Series, so the result has to be assigned back;
    # calling it without assignment leaves the frame unchanged.
    data[i] = data[i].fillna(data[i].median())

In [34]:
data

Unnamed: 0,Ranking,Rating,Number of Reviews
0,5570.0,3.5,194.0
1,1537.0,4.0,10.0
2,353.0,4.5,688.0
3,3458.0,5.0,3.0
4,621.0,4.0,84.0
...,...,...,...
39995,500.0,4.5,79.0
39996,6341.0,3.5,542.0
39997,1652.0,4.5,4.0
39998,641.0,4.0,70.0


In [41]:
# Frequency of each price-range label.
# NOTE(review): an earlier cell in this file drops every object-dtype column
# from `data`, which would include 'Price Range'; this cell presumably ran
# against a freshly reloaded frame (the execution counters are out of order)
# — confirm before running the notebook top to bottom.
data['Price Range'].value_counts()

$$ - $$$    18412
$            6279
$$$$         1423
Name: Price Range, dtype: int64

In [62]:
# Number of distinct cities present in the dataset
data['City'].nunique()

31

In [65]:
# Length of the first 'Cuisine Style' value. The column holds plain strings at
# this point, so this counts the characters of the raw text (brackets, quotes
# and commas included), not the number of cuisines.
len(data['Cuisine Style'][0])

39

# 4.3 Вопросы о кухнях

In [279]:
# Work on a copy so the transformations below modify `test`, not `data`.
test = data.copy()

In [331]:
#data['Cuisine Style'] = data['Cuisine Style'].apply(lambda x : re.findall(r'\w+',x) if not pd.isna(x) else ['Unknown'])

In [280]:
# Turn the list-like text into a plain comma-separated string by removing the
# square brackets and single quotes; missing values become 'Unknown'.
test['Cuisine Style'] = test['Cuisine Style'].apply(
    lambda x: 'Unknown' if pd.isna(x) else re.sub(r"[]['']", '', x)
)

In [281]:
test.head(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"European, French, International",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,Unknown,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"Japanese, Sushi, Asian, Grill, Vegetarian Frie...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,Unknown,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"German, Central European, Vegetarian Friendly",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


## Сколько типов кухонь представлено в наборе данных?

In [285]:
# Count how many restaurants list each cuisine.
# The names are fed to Counter in one pass over the rows; summing the per-row
# lists would build a new, ever-growing list for each of the rows.
t = Counter(
    cuisine
    for cuisines in test['Cuisine Style'].str.split(', ')
    for cuisine in cuisines
)

In [393]:
# Number of distinct cuisine names. One is subtracted, presumably to exclude
# the 'Unknown' placeholder that was substituted for missing values.
len(t) - 1

125

## Какая кухня представлена в наибольшем количестве ресторанов?

In [394]:
t.most_common(1)

[('Vegetarian Friendly', 11189)]

In [395]:
# data.head(8)

## Какое среднее количество кухонь предлагается в одном ресторане?

In [385]:
test['Cuisine Style'] = test['Cuisine Style'].str.split(', ')

In [398]:
test.head(2)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"[European, French, International]",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,[Unknown],1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032


In [396]:
# Total number of cuisine entries across all restaurants
summ = sum(len(cuisines) for cuisines in test['Cuisine Style'])

In [397]:
# Average number of cuisines per restaurant: total entries divided by the
# number of rows. Rows without cuisine data hold the single-item list
# ['Unknown'], so each of them contributes one to the total.
summ/test.shape[0]

2.6224

# 4.4 Вопросы об отзывах

In [399]:
test['Reviews'][0]

"[['Good food at your doorstep', 'A good hotel restaurant'], ['12/31/2017', '11/20/2017']]"

In [400]:
#test['Reviews'] = test['Reviews'].apply(lambda x : re.sub(r"[]['']",'',x) if not pd.isna(x) else 'Unknown')

In [419]:
# Remove every pair of adjacent single quotes from each review string.
# The result is only displayed; it is not stored back into `test`.
test['Reviews'].apply(lambda x : re.sub(r"['][']",'',x))

0        [['Good food at your doorstep', 'A good hotel ...
1        [['Unique cuisine', 'Delicious Nepalese food']...
2        [['Catch up with friends', 'Not exceptional'],...
3                                                 [[], []]
4        [['Best place to try a Bavarian food', 'Nice b...
                               ...                        
39995    [['The real Italian experience!', 'Wonderful f...
39996    [['Parisian atmosphere', 'Bit pricey but inter...
39997    [['Good by swedish standards', 'A hidden jewel...
39998    [['Underground restaurant', 'Oldest Restaurant...
39999    [['Average', 'Nice an informal'], ['01/31/2016...
Name: Reviews, Length: 40000, dtype: object

In [443]:
# Extract every date-like token (digits/digits/digits) from the raw review text
# into a new 'Reviews_date' column holding a list of strings per row.
# The pattern is a raw string: in a normal literal '\d' and '\/' are invalid
# escape sequences that newer Python versions warn about. '/' needs no escape.
pattern = re.compile(r'\d+/\d+/\d+')
test['Reviews_date'] = test['Reviews'].apply(pattern.findall)

In [445]:
test['Reviews_date'][0][0]

'12/31/2017'

In [446]:
# Delete every non-digit character from each review string, leaving all the
# remaining digits run together. The result is only displayed, not stored.
# NOTE(review): digits inside the review text itself would be kept as well.
test['Reviews'].apply(lambda x : re.sub(r"[\D]",'',x))

0        1231201711202017
1        0706201706192016
2        0108201801062018
3                        
4        1118201702192017
               ...       
39995    1216201711122017
39996    1221201712122017
39997    1103201604122008
39998    0711201706182017
39999    0131201607042012
Name: Reviews, Length: 40000, dtype: object