_______________________________________________________
# Предсказание зарплаты по данным резюме
____________________________________________________

## Часть 4

**Загрузка и настройка работы сохраненных моделей**


In [1]:
#part 1 imports
import pandas as pd
import torch
import transformers as ppb
import spacy
import numpy as np
import pickle
from torch import nn

#create custom pipe
from sklearn.base import BaseEstimator, TransformerMixin

from catboost import CatBoostRegressor

In [2]:
#part 2 imports
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import Pool

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

______________

In [3]:
def metrics(real, pred):
    print(f'MSE = {mean_squared_error(real, pred)}')
    print(f'R2 = {r2_score(real, pred)}')
    print(f'MAE = {mean_absolute_error(real, pred)}')

In [7]:
def bert_transform(data: pd.Series, 
                   device: torch.device,
                   tokenizer: ppb.models,
                   model_bert: ppb.models,
                   batch_size: int = 0) -> pd.DataFrame:
    
    if not batch_size:
        batch_size = data.shape[0]
    
    b_token = data.apply((lambda x: tokenizer.encode(x, 
                                                     add_special_tokens=True,
                                                     truncation=True)))
    max_len = max(b_token.map(len))
    padded = np.array([i + [0]*(max_len-len(i)) for i in b_token.values])
    attention_mask = np.where(padded != 0, 1, 0)
    embeddings = []
    start_ = 0
    stop_ = batch_size
    for j in range(padded.shape[0] // batch_size):
        batch = torch.LongTensor(padded[start_:stop_]) 
        attention_mask_batch = torch.LongTensor(attention_mask[start_:stop_]) 
        with torch.no_grad():
            batch_embeddings = model_bert(batch, attention_mask=attention_mask_batch)
        embeddings.append(batch_embeddings[0][:,0,:])
        start_ += batch_size
        stop_ += batch_size
    bert_features = np.concatenate(embeddings)
    return pd.DataFrame(bert_features)

In [8]:
def text_transform(data: pd.DataFrame, nlp: spacy.lang, col: str = 'full_description') -> pd.DataFrame:
    #fist clear text part
    #http https www
    data[col] = data[col].str.replace(r'((https?:\/\/)|w{3}).*?( |$)',' ', regex=True)
    data[col] = data[col].str.replace(r'[^A-Za-z\']',' ', regex=True).str.lower().str.strip()
    data[col] = data[col].str.replace(r'\W{2,}',' ', regex=True)

    #lemma part
    data['clear_text'] = data[col].apply(lambda row: ' '.join([w.lemma_ for w in nlp(row) if not w.is_stop]))
    
    return data

In [5]:
device = torch.device("cpu")
nlp = spacy.load('en_core_web_sm', disable=['parser'])

model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, 
                                                    ppb.DistilBertTokenizer, 
                                                    'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model_bert = model_class.from_pretrained(pretrained_weights)

with open('data/location_dict.pkl', 'rb') as f:
    location_dict = pickle.load(f)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Подгрузка заранее оставленных тестовых)

In [9]:
df = pd.read_csv('data/Train_rev1.csv', index_col=[0])
df = df[-2000:].reset_index(drop='True')

In [10]:
X_test = df[['FullDescription','LocationNormalized','Category']]
y_test = df['SalaryNormalized']

In [11]:
# if another name of columns on input :)
X_test.columns = ['full_description', 'location_normalized', 'category']


In [12]:
temps = text_transform(X_test, nlp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].str.replace(r'((https?:\/\/)|w{3}).*?( |$)',' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].str.replace(r'[^A-Za-z\']',' ', regex=True).str.lower().str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].str.replac

In [13]:
#X_test = text_transform(X_test, nlp)
cat_test = temps

In [14]:
cat_features = ['full_description', 'location_normalized', 'category']

In [30]:
cat_model = CatBoostRegressor()
cat_model.load_model('models/full_catboost_model.bin')

<catboost.core.CatBoostRegressor at 0x7fd0ed11a200>

In [31]:
cat = X_test[cat_features].reset_index(drop=True)

In [36]:
cat_model.score(Pool(cat_test[cat_features], 
                     y_test, 
                     cat_features=cat_features, 
                     text_features=['full_description']))

0.8304699179320459

In [37]:
pred = cat_model.predict(cat)

In [38]:
metrics(pred, y_test)

MSE = 37347248.10111603
R2 = 0.7781866173245335
MAE = 3986.1446843133504


__________________________

In [497]:
# Что-то мне в процессе не понравилось делать отдельный шаг для трансформации категорий, без пайплайна
# ААААА ЗАЧЕМ Я СЮДА ПОЛЕЗЛА, А?!?!?! Ну теперь принцип знаю, хоть и основы.
# Кривой новый класс для пайплайна. Работает даже. 
# Результаты старой версии и новой сравнила.

class location_category(BaseEstimator, TransformerMixin):
    
    def __init__(self, location_dict):
        #load dict
        self.location_dict = location_dict
        

    def fit(self, X, y=None):
        #no need to fit, only transform
        return self

    def transform(self, X):
        return pd.DataFrame(X.apply(lambda x: location_dict.get(x,'few')))

In [498]:
#load dict
with open('data/location_dict.pkl', 'rb') as f:
    location_dict = pickle.load(f)

In [499]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

location_transform = ColumnTransformer([('loc', location_category(location_dict), 'location_normalized')], 
                                       remainder='passthrough')

final_pipe = Pipeline([
    ('loc', location_transform),
    ('prep', cat_pipe)
   
])

In [501]:
learned_pipe = final_pipe.fit(df_pipe)
with open("models/learned_pipe_test.pkl", "wb") as f:
        pickle.dump(learned_pipe, f)

In [502]:
pd.DataFrame(learned_pipe.transform(X_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Нейро не протестить на локальном нормально. Но там в колабе сойдет.