# Выделение ключевых фраз

!!! Осторожно, BigData

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

RANDOM_STATE = 2

## Исходные данные
Подборку ключевых фраз строим на основе коментариев, отобранных после предварительной обработки

In [2]:
comments = pd.read_csv('comments.csv')
comments.sort_values(by='comments')

Unnamed: 0,id,listing_id,comments
298422,1054528,23727374,#super cool location #clean #cool #design flat...
202328,290537,4408926,* best airbnb apartment * best location * best...
263666,940730,20348935,* great location * responsive host accepting l...
251775,988523,21508510,* ideal location for public transport as its l...
251064,410338,7171920,* please be aware house cleaning fee is exclud...
...,...,...,...
275969,478707,8922424,zvonko was a great host he helped me with all ...
335258,478706,8922424,zvonko was very nice and friendly i arrived in...
240146,961957,20912346,zwar einfache aber absolut ausreichende und or...
101450,575844,11968359,zweckm ssig eingerichtetes appartment mit blic...


## Выделение ключевых фраз из коментариев

В качестве ключевых фраз возьмем наиболее частые словосочетания из трех слов и объединим с наиболее частыми словосочетаниями из 4-6 слов

In [3]:
phrases3x_vectorizer = CountVectorizer(lowercase=False,
                                       stop_words='english',
                                       ngram_range=(3,3),
                                       min_df=100)

phrases3x_freqs = phrases3x_vectorizer.fit_transform(comments.comments)
phrases3x_vocab = pd.Series(phrases3x_vectorizer.vocabulary_)
phrases3x_freqs.shape, phrases3x_vocab.shape

((387026, 2945), (2945,))

In [4]:
phrases4x_vectorizer = CountVectorizer(lowercase=False,
                                       stop_words='english',
                                       ngram_range=(4,6),
                                       min_df=20)

phrases4x_freqs = phrases4x_vectorizer.fit_transform(comments.comments)
phrases4x_vocab = pd.Series(phrases4x_vectorizer.vocabulary_)
phrases4x_freqs.shape, phrases4x_vocab.shape

((387026, 6896), (6896,))

In [5]:
phrases_freqs = sp.sparse.hstack((phrases3x_freqs, phrases4x_freqs)).tocsc()
phrases_vocab = pd.concat([phrases3x_vocab, phrases4x_vocab + phrases3x_vocab.shape[0]], axis=0)
phrases_freqs.shape, phrases_vocab.shape

((387026, 9841), (9841,))

In [6]:
phrases_vocab.index.to_list()[::500]

['apartment really clean',
 'great communication easy',
 'host feel welcome',
 'flat just described',
 'long term stay',
 'bring ear plugs',
 'aparece en las fotos',
 'place perfect short stay',
 'appartement est id alement',
 'das zimmer war sehr',
 'highly recommended place stay',
 'est tr actif et',
 'london definitely stay thanks',
 'esta muy bien situado',
 'helpful host definitely stay',
 'te tr accueillante et',
 'check check really easy',
 'location couple minutes walk',
 'location great tube station',
 'clean modern great location']

Полученный словарь ключевых фраз выглядит довольно пригодным.

In [7]:
phrases = pd.DataFrame(data=phrases_freqs.sum(axis=0),
                       columns=phrases_vocab.index.to_list()).T \
    .reset_index() \
    .rename(columns={'index':'phrases', 0: 'freq'})

phrases.sort_values(by='freq')

Unnamed: 0,phrases,freq
6461,bus stops just corner,20
7043,best host ve met,20
7042,station just short walk away,20
7030,website hidden airbnb good,20
7025,id alement situ dans le,20
...,...,...
2679,clean exactly described,2549
1264,place stay nice,2591
2118,perfect location explore,2686
2250,absolutely loved staying,4442


In [8]:
phrases.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
phrases,9841,9841.0,close notting hill gate tube,1.0,,,,,,,
freq,9841,,,,98.96,174.516,20.0,25.0,41.0,120.0,5449.0


## Группировка ключевых фраз

В словаре много однотипных словосочетаний, сгруппируем похожие словосочетания по частоте встречаемости слов и выделим наиболее характерные кластеры.

In [9]:
words_vectorizer = TfidfVectorizer(lowercase=False)

words_freqs = words_vectorizer.fit_transform(phrases.phrases)
words_freqs.shape

(9841, 1467)

In [10]:
pd.DataFrame(data=words_freqs.sum(axis=0),
                       columns=words_vectorizer.get_feature_names()).T \
    .reset_index() \
    .rename(columns={'index':'words', 0: 'tfidf'})

Unnamed: 0,words,tfidf
0,10,120.802234
1,12,2.330202
2,15,51.869642
3,20,23.617903
4,30,8.873941
...,...,...
1462,zu,10.618537
1463,zufrieden,0.632432
1464,zuhause,0.629965
1465,zum,0.582420


Далее номер кластера используется как битовая маска, поэтому количество кластеров не должно превышать 64.

Выберем количество кластеров равным 32, что возможно обеспечит нам баланс 'качество' - 'сложность' для модели.

In [11]:
kmeans = KMeans(n_clusters=32, init='k-means++', n_init=20,
                random_state=RANDOM_STATE, n_jobs=-1, verbose=0)

words_classes = kmeans.fit_predict(words_freqs)
pd.Series(words_classes).value_counts()

0     1791
16     563
5      526
28     444
7      388
10     338
21     335
27     328
3      305
6      295
14     294
4      260
24     252
1      252
30     249
23     248
29     239
8      233
25     227
20     221
15     221
17     220
12     220
13     209
18     202
19     201
31     186
22     137
11     135
9      112
2      112
26      98
dtype: int64

В 0-вом кластере много выбросов, но вероятно есть и хорошие объекты.

In [12]:
phrases['topic'] = pd.Series(words_classes)
phrases.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
phrases,9841,9841.0,close notting hill gate tube,1.0,,,,,,,
freq,9841,,,,98.96,174.516,20.0,25.0,41.0,120.0,5449.0
topic,9841,,,,12.8213,10.1053,0.0,4.0,12.0,21.0,31.0


In [13]:
t = phrases['topic'].value_counts().idxmax()

print(f'Key phrases for largest topic "{t}"')
phrases[phrases.topic == t]['phrases']

Key phrases for largest topic "0"


4         natural history museum
9                 best airbnb ve
10              airbnb ve stayed
12             walk notting hill
13             notting hill gate
                  ...           
9800       ai mezzi di trasporto
9806             tout au long du
9824     shower plenty hot water
9837    stops right outside door
9839            tait pas pr sent
Name: phrases, Length: 1791, dtype: object

In [14]:
t = phrases['topic'].value_counts().idxmin()

print(f'Key phrases for smallest topic "{t}"')
phrases[phrases.topic == t]['phrases'][::5]

Key phrases for smallest topic "26"


22                 good restaurants bars
262           restaurants grocery stores
553               great restaurants bars
1177            plenty shops restaurants
1666             close great restaurants
1786               lots good restaurants
1983               lots restaurants bars
2345             bars restaurants nearby
2663            commerces et restaurants
2770               nice restaurants pubs
4497    shops restaurants grocery stores
5221    great location close restaurants
5566      station lots shops restaurants
6076      close shops restaurants public
6740        des commerces et restaurants
7161        close lots shops restaurants
7696       lots restaurants shops nearby
8658        short walk shops restaurants
9234     restaurants pubs grocery stores
9565        close shops restaurants tube
Name: phrases, dtype: object

## Формирование параметров релевантности
Вычисление релевантности для ключевых фраз будет непосредственно производиться на этапе обучения модели. \
Здесь произведем отбор параметров для вычисления релевантности.

Отметим, что фраза должна учитывать свойства объекта недвижимости (количественные, категориальные, географические).\
Для этого, произведем классификацию объектов недвижимости по тематикам ключевых фраз. 

Назначим каждой тематике битовую маску и будем использовать ее как индикатор после группироки коментариев по идентификатору аренды

In [15]:
topic_bitmask = phrases['topic'].map(lambda x: 1 << x)
np.sort(topic_bitmask.value_counts().index)

array([         1,          2,          4,          8,         16,
               32,         64,        128,        256,        512,
             1024,       2048,       4096,       8192,      16384,
            32768,      65536,     131072,     262144,     524288,
          1048576,    2097152,    4194304,    8388608,   16777216,
         33554432,   67108864,  134217728,  268435456,  536870912,
       1073741824, 2147483648], dtype=int64)

In [16]:
phrases_freqs[phrases_freqs > 0] = 1

phrase_bitmask = phrases_freqs * sp.sparse.diags(topic_bitmask)
phrase_bitmask

<387026x9841 sparse matrix of type '<class 'numpy.float64'>'
	with 970774 stored elements in Compressed Sparse Column format>

In [17]:
comment_bitmask = pd.DataFrame(phrase_bitmask.getcol(0).toarray(),dtype=np.int64).T.values

for i in range(1, phrase_bitmask.shape[1]):
    column_bitmask = pd.DataFrame(phrase_bitmask.getcol(i).toarray(),dtype=np.int64).T.values
    comment_bitmask = np.bitwise_or(comment_bitmask, column_bitmask)
    
comment_bitmask.shape

(1, 387026)

In [18]:
listing_bitmask = pd.DataFrame(comments.listing_id)
listing_bitmask['topic_mask'] = pd.Series(comment_bitmask[0])
listing_bitmask

Unnamed: 0,listing_id,topic_mask
0,24671810,32
1,15726550,75497577
2,26933941,0
3,10294813,1147929
4,23874390,2179080
...,...,...
387021,9236221,8486977
387022,15018694,1024
387023,2962765,0
387024,891874,1


Кроме количественных и качественных характеристик объектов недвижиости, \
хотелось бы чтоб учитывалась зависимость между фразами и востребованностью объекта на рынке.

Данные о востребованности возьмем из календаря и сгруппируем по объектам.

In [19]:
calendar = pd.read_csv('calendar.csv')
calendar['rented'] = calendar['available'].map({'t':0,'f':1})
calendar.head(2)

Unnamed: 0,listing_id,date,available,rented
0,9554,2019-08-18,t,0
1,97446,2019-11-04,f,1


In [20]:
listing_rented = calendar \
    .groupby('listing_id')[['rented']] \
    .sum()

listing_rented

Unnamed: 0_level_0,rented
listing_id,Unnamed: 1_level_1
9554,36
11076,365
13913,0
17402,4
24328,2
...,...
29797787,165
29797809,190
29797854,331
29797899,185


Объединим результаты группировки по тематикам и по востребованности

In [21]:
listing_topics = listing_bitmask \
    .groupby(by='listing_id') \
    .agg(topic_mask=('topic_mask', np.bitwise_or.reduce)) \
    .loc[:,'topic_mask'] \
    .apply(lambda x: pd.Series([((x & (1 << i)) >> i) for i in range(32)])) \
    .rename(columns=dict([(i,f'topic{i:02}') for i in range(32)])) \
    .merge(listing_rented, left_index=True, right_index= True) 

listing_topics

Unnamed: 0_level_0,topic00,topic01,topic02,topic03,topic04,topic05,topic06,topic07,topic08,topic09,...,topic23,topic24,topic25,topic26,topic27,topic28,topic29,topic30,topic31,rented
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9554,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,36
13913,1,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
17402,1,1,1,1,0,1,1,1,1,0,...,1,1,1,0,1,1,1,1,1,4
24328,1,1,0,1,1,1,1,0,0,1,...,1,1,0,1,1,1,1,0,1,2
25023,1,1,1,1,0,1,0,0,1,0,...,1,0,0,0,0,0,1,1,1,325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29625356,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,269
29669399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,318
29675505,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,317
29681740,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


Сохраним результаты, они нам понадобятся для обучения модели

In [22]:
listing_topics.drop(columns=['rented'], inplace=False).to_csv('topics.csv')

Посчитаем несколько параметров для дальнейшего вычисления релевантности фраз

In [23]:
phrases['listing_count'] = 0
phrases['rented_mean'] = 0
phrases['rented_median'] = 0
phrases['rented_sum'] = 0

for i in range(0,32):
    mask = listing_topics[f'topic{i:02}'] == 1
    
    listing_count = listing_topics[f'topic{i:02}'][mask].count()
    phrases.loc[phrases['topic'] == i, 'listing_count'] = listing_count

    available_agg = listing_topics['rented'][mask].agg(['mean','median','sum'])
    phrases.loc[phrases['topic'] == i, 'rented_mean'] = available_agg['mean']
    phrases.loc[phrases['topic'] == i, 'rented_median'] = available_agg['median']
    phrases.loc[phrases['topic'] == i, 'rented_sum'] = available_agg['sum']
    

phrases

Unnamed: 0,phrases,freq,topic,listing_count,rented_mean,rented_median,rented_sum
0,apartment really clean,123,6,17618,220.141049,257.0,3878445.0
1,flat exactly described,358,31,13831,218.245752,250.0,3018557.0
2,flat perfect location,183,31,13831,218.245752,250.0,3018557.0
3,perfect location london,362,10,15435,219.725688,254.0,3391466.0
4,natural history museum,479,0,31340,228.165124,276.0,7150695.0
...,...,...,...,...,...,...,...
9836,convenient location minutes walk,55,14,12383,218.171849,248.0,2701622.0
9837,stops right outside door,37,0,31340,228.165124,276.0,7150695.0
9838,room big bed comfortable,45,25,14708,217.252448,250.0,3195349.0
9839,tait pas pr sent,50,0,31340,228.165124,276.0,7150695.0


In [24]:
phrases.describe()

Unnamed: 0,freq,topic,listing_count,rented_mean,rented_median,rented_sum
count,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0
mean,98.959963,12.821258,16829.846967,219.333758,253.851539,3729489.0
std,174.516272,10.105272,7752.068428,5.012244,12.775374,1814675.0
min,20.0,0.0,5524.0,207.141021,228.0,1144247.0
25%,25.0,4.0,10532.0,215.306474,243.0,2276170.0
50%,41.0,12.0,14708.0,218.245752,250.0,3195349.0
75%,120.0,21.0,18468.0,220.723785,259.0,4069524.0
max,5449.0,31.0,31340.0,228.165124,276.0,7150695.0


Сохраним параметры ключевых фраз

In [25]:
phrases.index.rename('id', inplace=True)
phrases.to_csv('phrases.csv')

In [26]:
'Done'

'Done'