# Модель предсказания ключевых фраз

!!! Осторожно, BigData

In [None]:
import numpy as np
import pandas as pd
# Show up to 100 rows / 100 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

import joblib

import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices emitted by pandas and scikit-learn.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

# Seed passed to train_test_split so the hold-out split is reproducible.
RANDOM_STATE = 2

## Исходные данные

In [None]:
# Load the three input tables in one pass: the listings, the per-listing
# topic table and the comments table.
listing, topics, comments = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'topics.csv', 'comments.csv')
)

In [None]:
# Keep only the listings that also have a row in `topics`. The key column
# pulled in by the merge is only needed for matching, so it is dropped again.
train = (
    listing
    .merge(topics[['listing_id']], left_on='id', right_on='listing_id', how='inner')
    .drop(columns=['listing_id'])
)

In [None]:
# Build the label table for exactly the listings kept in `train`.
#
# The merge is driven from `train` (left side) because an inner merge keeps
# the order of the left keys: row k of `target` then describes row k of
# `train`.  The evaluation cell further down pairs features and labels
# purely by position, so merging from `topics` would silently misalign
# them whenever the two CSV files are ordered differently.
target = (
    train[['id']]
    .merge(topics, left_on='id', right_on='listing_id', how='inner')
    .drop(columns=['id'])
    .astype(int)
)
# Restore the original column order of `topics`.
target = target[list(topics.columns)]

### Вспомогательные функции

In [None]:
def get_from_train_by_index(i):
    """Return the `train` row whose index label equals `i` as a dict.

    Raises IndexError when no row carries that index label.
    """
    matching_rows = train.loc[train.index == i]
    row_dicts = matching_rows.to_dict(orient='records')
    return row_dicts[0]

def get_from_train_by_id(idx):
    """Return the first `train` row whose `id` column equals `idx` as a dict.

    Raises IndexError when no row has that id.
    """
    matching_rows = train.loc[train['id'] == idx]
    row_dicts = matching_rows.to_dict(orient='records')
    return row_dicts[0]

def get_reviews_by_index(i):
    """Return the `comments` rows for the listing at index label `i` of `train`.

    The index label is first translated into the listing id; IndexError is
    raised when `train` has no row with that index label.
    """
    listing_id = train.loc[train.index == i, 'id'].values[0]
    belongs_to_listing = comments['listing_id'] == listing_id
    return comments.loc[belongs_to_listing, :]

def get_reviews_by_id(idx):
    """Return every `comments` row whose `listing_id` equals `idx`."""
    belongs_to_listing = comments['listing_id'] == idx
    return comments.loc[belongs_to_listing, :]


## Классификатор объектов недвижимости по отзывам

In [None]:
class ThePropertyPhrasesClassifier(object):
    """Predict which topics apply to a listing from its listing features.

    The fitted transformers and the model are loaded from pickle dumps in the
    working directory.  `preprocess` turns raw listing rows into the model's
    feature matrix and `predict` returns a 0/1 indicator per topic.

    NOTE(review): `preprocess` reads the module-level `train` DataFrame to fit
    the categorical encoders, so the class only works where `train` is defined.
    """

    def __init__(self):
        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # dumps that come from a trusted source.
        self.mlb = joblib.load("mlb_dump.pkl")    # amenities -> indicator columns
        self.pca = joblib.load("pca_dump.pkl")    # applied to the amenity indicators
        self.ovrc = joblib.load("ovrc_dump.pkl")  # model exposing predict_proba
        self.phrases = pd.read_csv('phrases.csv', index_col='id')

        # Columns converted to numbers and used as numeric features.
        self.num_cols = ['latitude',
                         'longitude',
                         'accommodates',
                         'bathrooms',
                         'bedrooms',
                         'beds',
                         'square_feet',
                         'security_deposit',
                         'cleaning_fee',
                         'guests_included',
                         'extra_people',
                         'minimum_nights',
                         'price']

        # Columns that are label-encoded against the values seen in `train`.
        self.cat_cols = ['experiences_offered',
                         'host_response_time',
                         'host_is_superhost',
                         'host_has_profile_pic',
                         'host_identity_verified',
                         'neighbourhood_cleansed',
                         'is_location_exact',
                         'property_type',
                         'room_type',
                         'bed_type',
                         'cancellation_policy',
                         'require_guest_phone_verification']

        # Per-topic probability cut-offs used by `predict`; position n applies
        # to output column n.  The attribute name (sic) is kept for
        # compatibility with existing callers.
        self.tresholds = [0.3, 0.2, 0.1, 0.3, 0.2, 0.3, 0.2, 0.2, 0.2, 0.1, 0.2, 0.1, 0.2,
                          0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2,
                          0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

    def get_feature_names(self):
        """Return the raw listing columns that `preprocess` expects, in order."""
        return ['experiences_offered',
                'host_response_time',
                'host_is_superhost',
                'host_has_profile_pic',
                'host_identity_verified',
                'neighbourhood_cleansed',
                'latitude',
                'longitude',
                'is_location_exact',
                'property_type',
                'room_type',
                'accommodates',
                'bathrooms',
                'bedrooms',
                'beds',
                'bed_type',
                'amenities',
                'square_feet',
                'security_deposit',
                'cleaning_fee',
                'guests_included',
                'extra_people',
                'minimum_nights',
                'cancellation_policy',
                'require_guest_phone_verification',
                'price']

    @staticmethod
    def _split_amenities(raw):
        """Split one raw amenities cell into a list of amenity tokens."""
        text = str(raw).strip('{}')
        # This amenity contains a comma of its own; rewrite it so that the
        # split below does not cut it in two.
        text = text.replace('to shower, toilet', 'to shower and toilet')
        return text.split(',')

    def preprocess(self, df):
        """Convert raw listing rows into the model's feature matrix.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain 'amenities' plus every column named in
            `self.num_cols` and `self.cat_cols`.

        Returns
        -------
        pandas.DataFrame
            Numeric columns, label-encoded categorical columns and one
            'pcaNN' column per PCA component; missing values become 0 and
            'price' is replaced by log1p(price).

        Raises
        ------
        ValueError
            If a numeric column holds a value that cannot be parsed as a
            number (raised by `pd.to_numeric`).
        """
        amenities = df['amenities'].apply(self._split_amenities)
        indicators = pd.DataFrame(self.mlb.transform(amenities),
                                  columns=self.mlb.classes_)
        components = self.pca.transform(indicators)

        features = df[self.num_cols + self.cat_cols].copy()

        # A single-row frame built from a dict arrives with object dtype.
        # Convert the numeric columns explicitly instead of relying on
        # `fillna` to downcast object columns (deprecated pandas behaviour);
        # otherwise np.log1p below fails on an object column.
        for col in self.num_cols:
            features[col] = pd.to_numeric(features[col])

        for i in range(components.shape[1]):
            features[f'pca{i:02}'] = components[:, i]

        for col in self.cat_cols:
            # Encoders are fitted on the module-level `train` frame so the
            # integer codes match the vocabulary seen there.
            le = LabelEncoder()
            le.fit(train[col].fillna(''))
            features[col] = le.transform(features[col].fillna(''))

        features.fillna(0, inplace=True)
        features['price'] = np.log1p(features['price'])

        return features

    def predict(self, df):
        """Return a float 0/1 matrix marking the topics predicted for each row.

        Column n is 1.0 where the model's probability for output n is
        strictly greater than `self.tresholds[n]`, otherwise 0.0.
        """
        proba = np.asarray(self.ovrc.predict_proba(df))
        cutoffs = np.asarray(self.tresholds[:proba.shape[1]], dtype=float)
        return (proba > cutoffs).astype(proba.dtype)


# Shared classifier instance used by the evaluation cells below; constructing
# it loads the pickled transformers/model and phrases.csv from disk.
clf = ThePropertyPhrasesClassifier()

### Тест классификатора

In [None]:
# Build the feature matrix for every training listing and the matching label
# matrix (all `target` columns except the listing key).
X = clf.preprocess(train)
y = target.drop(columns=['listing_id'])
# Only the 25% hold-out part of the split is kept; the training parts are
# discarded.  Features and labels are paired by row position.
# NOTE(review): whether this split matches the one used to fit the pickled
# model cannot be seen from this notebook - confirm.
_, X_test, _, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)
X_test.shape, y_test.shape

In [None]:
# Thresholded 0/1 topic predictions for the hold-out rows, scored with the
# sample-averaged F1 (average='samples').
pred = clf.predict(X_test)
f1_score(y_test, pred, average='samples')

## Модель предсказания ключевых фраз по отзывам

In [None]:
class ThePropertyPhrasesGenerator(object):
    """Suggest key phrases for a listing based on its predicted topics.

    Wraps a `ThePropertyPhrasesClassifier` and the phrase table read from
    'phrases.csv'.
    """

    def __init__(self):
        # Number of topic columns inspected in the classifier output.
        self.topics_count = 32
        self.clf = ThePropertyPhrasesClassifier()
        self.phrases = pd.read_csv('phrases.csv', index_col='id')

    def get_feature_names(self):
        """Return the listing columns the underlying classifier consumes."""
        return self.clf.get_feature_names()

    def generate_key_phrases(self, data, n_phrases=20):
        """Return the top phrases for a single listing.

        Parameters
        ----------
        data : mapping
            Listing attributes keyed by column name.  Keys that are not
            classifier features are ignored; missing features are treated
            as NaN.
        n_phrases : int, default 20
            Maximum number of phrases to return.

        Returns
        -------
        pandas.DataFrame
            Columns 'topic', 'phrases', 'freq', 'listing_count' and
            'rented_mean', sorted by freq (descending), rented_mean
            (descending) and listing_count (ascending).  When no topic is
            predicted, the ranking is taken over the whole phrase table.
        """
        cols = self.get_feature_names()
        # Build the row in one step so pandas infers a suitable dtype.
        # Assigning strings into a pre-allocated float Series is an
        # incompatible-dtype setitem, which pandas has deprecated.
        row = pd.Series([data[col] if col in data else np.nan for col in cols],
                        index=cols)

        features = self.clf.preprocess(row.to_frame().T)
        pred = self.clf.predict(features)

        active = [i for i in range(self.topics_count) if pred[0, i] == 1]
        topics = pd.DataFrame({'topic': active}, dtype=int)

        phrases = topics.merge(self.phrases, how='inner', on='topic')
        if phrases.shape[0] == 0:
            # Nothing predicted (or no phrases for the predicted topics):
            # fall back to the complete phrase table.
            phrases = self.phrases.copy()
        else:
            # Phrases of topic 0 are excluded whenever topics were matched.
            phrases = phrases[phrases['topic'] != 0]

        ranked = phrases.sort_values(by=['freq', 'rented_mean', 'listing_count'],
                                     ascending=[False, False, True])
        return ranked.head(n_phrases)[['topic', 'phrases', 'freq',
                                       'listing_count', 'rented_mean']]

    
# Shared generator instance used by the demo cells below; constructing it
# builds its own classifier and reads phrases.csv.
phrases_generator = ThePropertyPhrasesGenerator()


### Тест модели

In [None]:
# Index label (in `train`) of the listing inspected in the next cell.
rec_index = 6000

In [None]:
# Show the generated phrases next to the real reviews of the same listing.
d = get_from_train_by_index(rec_index)
phrases = phrases_generator.generate_key_phrases(d)
# reset_index(drop=True) discards the old index whatever its name is.  The
# generator's fallback result is indexed by 'id', so resetting and then
# dropping a column called 'index' would raise KeyError in that case.
phrases = phrases.reset_index(drop=True)

reviews = get_reviews_by_index(rec_index)
reviews = reviews.reset_index(drop=True)

columns = list(phrases.columns) + list(reviews.columns)

# Concatenate column-wise by row position, restore the original column names
# and blank out the cells of the shorter table.
pd.concat([phrases, reviews], axis=1, ignore_index=True) \
    .rename(columns=dict(zip(range(len(columns)), columns))) \
    .fillna('')