In [2]:
import statistics

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
train = pd.read_csv('data/train.csv')

rubric_mapping = dict(
    zip(
        pd.unique(train.rubrics),
        np.arange(0, len(pd.unique(train.rubrics)))
    )
)
city_mapping = {'msk': 0, 'spb': 1}

train.rubrics = train.rubrics.apply(lambda it: rubric_mapping[it])
train.user_city = train.user_city.apply(lambda it: city_mapping[it])
train.org_city = train.org_city.apply(lambda it: city_mapping[it])

mean = train.average_bill.mean()
train.average_bill.fillna(mean, inplace=True)
train.fillna(method='ffill', inplace=True)

# All columns except ration_org, user_id, org_id
# Eq to train.drop(['user_id', 'org_id', 'user_city'], axis=1, inplace=True)
columns = ['rating', 'cafe', 'toilet_for_disabled', 'parking_disabled', 'payment_by_credit_card', 'automatic_door', 'wheelchair_access', 'car_park', 'craft_beer', 'gift_certificate', 'food_delivery', 'sports_broadcasts', 'privilege_tickets', 'projector', 'wi_fi', 'food_product_delivery', 'karaoke', 'free_delivery', 'farm_products', 'self_service_kiosks', 'organic_food', 'minimum_order', 'table_games', 'promotions', 'takeaway', 'elevator_wheelchair_accessible', 'average_bill', 'music', 'vip_zone', 'delivery', 'face_control', 'dancefloor', 'show_program', 'rubrics', 'food_court1', 'dress_code', 'coffee_to_go', 'chillout', 'air_conditioning', 'elevator', 'internet access', 'strip', 'wheelchair_accessible', 'handmade_goods', 'pickup', 'has_bar', 'hall_capacity', 'around_the_clock_work1', 'call_button', 'summer_terrace', 'ramp', 'teahouse', 'pets', 'coffee_shop', 'kalyan', 'billiards', 'org_city', 'has_restaurant', 'cakes_for_different_occasions', 'accepted_credit_cards', 'online_takeaway', 'bread_from_tandoor', 'business_lunch', 'user_city', 'vinotheque', 'special_menu', 'closed_for_quarantine', 'breakfast']

x = train[columns]

x = StandardScaler().fit_transform(x)

y = np.array(train.rating_org)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
model = KNeighborsRegressor(n_neighbors=15, weights='distance', p=1)
# model.fit(x, y) # For forecasting

model.fit(x_train, y_train)
print(model.score(x_test, y_test))

0.4937016357183619


In [None]:
m = dict(zip(train.org_id, train.rating_org))

# Forecast
test = pd.read_csv('data/test_x.csv')

mean = test.average_bill.mean()
test.average_bill.fillna(mean, inplace=True)
train.fillna(method='ffill', inplace=True)

test.rubrics = test.rubrics.apply(lambda it: rubric_mapping[it])
test.user_city = test.user_city.apply(lambda it: city_mapping[it])
test.org_city = test.org_city.apply(lambda it: city_mapping[it])
x = test[columns]
x = StandardScaler().fit_transform(x)

predicted = model.predict(x)
for i in range(len(predicted)):
    if test.loc(i)['org_id'] in m:
        predicted[i] = m[test.loc(i).org_id]
res = pd.DataFrame(data=predicted, columns=['rating_org'])
res.index.name = 'id'

res.to_csv('res.csv')

In [69]:
for i in range(len(predicted)):
    org_id = test.loc[[i]]['org_id'].values[0]
    if org_id in m:
        predicted[i] = m[org_id]
res = pd.DataFrame(data=predicted, columns=['rating_org'])
res.index.name = 'id'

res.to_csv('res1.csv')

In [53]:
for i in range(5, 151, 10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    model = KNeighborsRegressor(n_neighbors=i, weights='distance', p=1)
    model.fit(x_train, y_train)
    print(i, model.score(x_test, y_test))

5 0.43311401676652095
15 0.46408270446943334
25 0.4506061212105025
35 0.41840321766230515
45 0.42401401520148285
55 0.4064378280519142
65 0.40769890213555604
75 0.4162430132728524


KeyboardInterrupt: 

In [31]:
import statistics

cor = []
for (columnName, columnData) in train.items():
    coef = statistics.correlation(columnData, train.rating_org)
    cor.append((abs(coef), columnName))
cor.sort(reverse=True)
print(*cor, sep='\n')
sortedColumns = list(map(lambda it: it[1], cor))[1:]
sortedColumns.remove('org_id')
sortedColumns.remove('user_id')
print(sortedColumns)

(1.0, 'rating_org')
(0.26938566457672053, 'rating')
(0.18106242489469343, 'cafe')
(0.17395829887074368, 'toilet_for_disabled')
(0.1671703649173784, 'parking_disabled')
(0.15441921200341907, 'payment_by_credit_card')
(0.1509087031367268, 'automatic_door')
(0.12164157745155518, 'wheelchair_access')
(0.11900108868728286, 'car_park')
(0.11885634572520054, 'craft_beer')
(0.11683452627838627, 'gift_certificate')
(0.10504731442044699, 'food_delivery')
(0.09534192220789597, 'sports_broadcasts')
(0.08717623826293669, 'privilege_tickets')
(0.08305640925384583, 'projector')
(0.0774971120122276, 'wi_fi')
(0.07450391589699248, 'food_product_delivery')
(0.07241811167265204, 'karaoke')
(0.07233240804877296, 'free_delivery')
(0.0691389788971624, 'farm_products')
(0.06855454481107333, 'self_service_kiosks')
(0.06783362050587999, 'organic_food')
(0.06589007226339627, 'minimum_order')
(0.0621485716191968, 'table_games')
(0.06062042112063336, 'promotions')
(0.05883952998608158, 'takeaway')
(0.057154638325

In [30]:
train = pd.read_csv('data/train.csv')

rubric_mapping = dict(
    zip(
        pd.unique(train.rubrics),
        np.arange(0, len(pd.unique(train.rubrics)))
    )
)
train.rubrics = train.rubrics.apply(lambda it: rubric_mapping[it])

city_mapping = {'msk': 0, 'spb': 1}
train.user_city = train.user_city.apply(lambda it: city_mapping[it])
train.org_city = train.org_city.apply(lambda it: city_mapping[it])

mean = train.average_bill.mean()
train.average_bill.fillna(mean, inplace=True)



y = np.array(train.rating_org)

model = KNeighborsRegressor(n_neighbors=20, weights='uniform', p=2)

train[train.columns] = StandardScaler().fit_transform(train[train.columns])

for i in range(25, len(sortedColumns)):
    columns = sortedColumns[:i]
    x = train[columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    model.fit(x_train, y_train)
    print(i, model.score(x_test, y_test), columns[-1])

25 0.26990479412479895 takeaway
26 0.24943593784338836 elevator_wheelchair_accessible
27 0.2751223668953866 average_bill
28 0.293816202948445 music
29 0.2934558577552073 vip_zone
30 0.2895926007839339 delivery
31 0.30801263606268336 face_control
32 0.2866205833778832 dancefloor
33 0.2870881741957487 show_program
34 0.29277082059913895 ts
35 0.29803516486632575 rubrics
36 0.2997646450946342 food_court1
37 0.30547016397408544 dress_code
38 0.32821640315082534 coffee_to_go
39 0.31990584915973763 chillout
40 0.3193461891461876 air_conditioning
41 0.31177831734288475 elevator
42 0.3295858611285667 internet access
43 0.3109171586592244 strip
44 0.3249057688336814 wheelchair_accessible
45 0.3200984719921184 handmade_goods
46 0.33107285740998926 pickup
47 0.3278542153522318 has_bar
48 0.3254866349759815 hall_capacity
49 0.33102826993091927 around_the_clock_work1
50 0.3299762063195736 call_button
51 0.3268173271680861 summer_terrace
52 0.3302351668271608 ramp
53 0.31560160047309505 teahouse
54 

## Как это работает?

Выбираем точку u и подбираем для неё категорию тета.

![title](img.png)

In [32]:
train = pd.read_csv('data/train.csv')
train.drop(['user_id', 'org_id'], axis=1, inplace=True)
train.fillna(method='ffill', inplace=True)

ohe = OneHotEncoder()
transformedRubrics = ohe.fit_transform(train[['rubrics']])
rubricsCategories = ohe.categories_[0]
train[rubricsCategories] = transformedRubrics.toarray()

ohe = OneHotEncoder()
transformedOrgCity = ohe.fit_transform(train[['org_city']])
orgCityCategories = ohe.categories_[0]
train[orgCityCategories] = transformedOrgCity.toarray()

columns = ['rating', 'average_bill', 'payment_by_credit_card', 'wi_fi', 'cafe', 'has_bar', 'internet access', 'music', 'delivery', 'kalyan']
columns += rubricsCategories.tolist()
columns += orgCityCategories.tolist()
x = train[columns]

y = np.array(train.rating_org)

model = KNeighborsRegressor(n_neighbors=100, weights='uniform', p=2)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
model.fit(x_train, y_train)

print(model.score(x_test, y_test))


test = pd.read_csv('data/test_x.csv')
test.drop(['user_id', 'org_id'], axis=1, inplace=True)
test.fillna(method='ffill', inplace=True)

ohe = OneHotEncoder()
transformedRubrics = ohe.fit_transform(test[['rubrics']])
rubricsCategories = ohe.categories_[0]
test[rubricsCategories] = transformedRubrics.toarray()

ohe = OneHotEncoder()
transformedOrgCity = ohe.fit_transform(test[['org_city']])
orgCityCategories = ohe.categories_[0]
test[orgCityCategories] = transformedOrgCity.toarray()

test = test[columns]

predicted = model.predict(test)
res = pd.DataFrame(data=predicted, columns=['rating_org'])

res.index.name = 'id'
res.to_csv('res.csv')

0.18302552217094348
