In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:

from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor

In [3]:
from google.colab import drive
drive.mount('/content/drive/')


RAND = sum(ord(x) for x in 'NEVER SURRENDER')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Смена BERT на TF-IDF

В связи с загруженностью удаленного компьютера было принято решение убрать BERT и использовать TF-IDF. 



In [4]:
df_train = pd.read_csv('/content/drive/MyDrive/train.csv')

In [5]:
df_valid =  pd.read_csv('/content/drive/MyDrive/valid.csv')

In [6]:
df_train.columns = df_train.columns.str.replace('(.)([A-Z])',r'\1_\2', regex=True).str.lower()

In [7]:
df_valid.columns = df_valid.columns.str.replace('(.)([A-Z])',r'\1_\2', regex=True).str.lower()

In [8]:
df_train.shape

(218491, 11)

**Классы для подготовки данных**

In [9]:
class LocationCategory(BaseEstimator, TransformerMixin):

    def __init__(self, name_of_column, number_of_category):
        self.location_dict = None
        self.name_of_column = name_of_column
        self.number_of_category = number_of_category

    def fit(self, features, y=None):
        self.location_dict = self.create_dict(features[self.name_of_column])
        return self

    def transform(self, features):
        feature = features.copy()
        feature[self.name_of_column] = feature[self.name_of_column].apply(lambda x: self.location_dict.get(x, -1))
        return feature

    def create_dict(self, feature):

        # create groups by value counts
        groups = feature.value_counts()
        # split by ~equals sum per group
        group_sum = round(groups.sum() / self.number_of_category)

        current_sum = 0
        group_num = 0
        group_dict = {}

        for index, itm in zip(groups.index, groups.to_numpy()):
            if (current_sum + itm) > group_sum:
                if abs(current_sum + itm - group_sum) < abs(current_sum - group_sum):
                    # include current itm in group
                    current_sum = 0
                    rem_sum = groups.loc[index:].sum() - itm
                    group_dict[index] = group_num
                    group_num += 1
                    if group_num == (self.number_of_category - 1):
                        break
                else:
                    # exclude current itm
                    group_num += 1
                    rem_sum = groups.loc[index:].sum()
                    if group_num == (self.number_of_category - 1):
                        break
                    current_sum = itm
                    group_dict[index] = group_num

                # refresh group sum
                group_sum = round(rem_sum / (self.number_of_category - group_num))

            else:
                current_sum += itm
                group_dict[index] = group_num
        return group_dict


# pd.Series
class TextClear(BaseEstimator, TransformerMixin):

    def fit(self, feature, y=None):
        return self

    def transform(self, feature, y=None):
        feature = feature.str.replace(r'((https?:\/\/)|w{3}).*?( |$)', ' ', regex=True)
        feature = feature.str.replace(r'[^A-Za-z\']', ' ', regex=True).str.lower().str.strip()
        feature = feature.str.replace(r'\W{2,}', ' ', regex=True)
        return feature


# pd.Series
class TextLemma(BaseEstimator, TransformerMixin):
    # spacy
    def __init__(self, nlp):
        self.nlp = nlp

    def fit(self, feature, y=None):
        return self

    def transform(self, feature, y=None):
        return feature.apply(lambda row: ' '.join([w.lemma_ for w in self.nlp(row) if not w.is_stop]))


class WordsNumber(BaseEstimator, TransformerMixin):

    def __init__(self, name_of_column):
        self.name_of_column = name_of_column
        
    def fit(self, features, y=None):
        return self

    def transform(self, features, y=None):
        data = features.copy()
        data['word_num'] = data[self.name_of_column].str.count(' ')
        return data


### Часть 1, OneHotEncoder для линейных моделей

In [10]:
categorical_features = ['location_normalized', 'category']
categorical_pipeline = Pipeline([
    ('loc_category', LocationCategory('location_normalized', 10)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [11]:
text_features = 'full_description'
text_pipeline = Pipeline([('clear', TextClear()),
                         # ('lemma', TextLemma(var.nlp)),
                         ('tfidf', TfidfVectorizer(stop_words='english', min_df=5))
                         ])

In [12]:
col_transform = make_column_transformer((text_pipeline, text_features),
                                       (categorical_pipeline, categorical_features),
                                       (StandardScaler(), ['word_num'])
                                      )


In [13]:
preprocessor = Pipeline([('add_num', WordsNumber('full_description')),
              ('columns', col_transform)])

In [14]:
preprocessor

Pipeline(steps=[('add_num', WordsNumber(name_of_column='full_description')),
                ('columns',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('clear',
                                                                   TextClear()),
                                                                  ('tfidf',
                                                                   TfidfVectorizer(min_df=5,
                                                                                   stop_words='english'))]),
                                                  'full_description'),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('loc_category',
                                                                   LocationCategory(name_of_column='location_normalized',
                                                        

In [15]:
features_ohe = preprocessor.fit_transform(df_train)
target_ohe = df_train['salary_normalized']
features_ohe

<218491x40294 sparse matrix of type '<class 'numpy.float64'>'
	with 23374282 stored elements in Compressed Sparse Row format>

In [16]:
valid_f_ohe = preprocessor.transform(df_valid)
valid_t =  df_valid['salary_normalized']
valid_f_ohe

<24277x40294 sparse matrix of type '<class 'numpy.float64'>'
	with 2592836 stored elements in Compressed Sparse Row format>

Ранее было выявлено, что распределение целевого не нормальное. Буду проверять модели на "оригинальном" целевом и с логарифмом.


In [17]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def metrics(real, pred):
  print(f'RMSE = {mean_squared_error(real, pred) **.5}')
  print(f'R2 = {r2_score(real, pred)}')
  print(f'MAE = {mean_absolute_error(real, pred)}')

In [18]:
def cross_linear(model_, type_):
  if type_ == 'log':
    targe = np.log(target_ohe)
  else:
    targe = target_ohe
  m = model_.fit(features_ohe, targe)
  pred = m.predict(valid_f_ohe)
  if type_ == 'log':
    pred = np.exp(pred)
  metrics(valid_t, pred)
  return pred

In [32]:
ridge_pred = cross_linear(Ridge(alpha=0.5, random_state=RAND), '')

RMSE = 10231.003286782425
R2 = 0.6565287473036264
MAE = 7181.309422791774


In [28]:
log_ridge_pred = cross_linear(Ridge(random_state=RAND), 'log')

RMSE = 10383.609985944573
R2 = 0.6462058231515058
MAE = 6776.318275768737


In [24]:
%%time
sgd_pred = cross_linear(SGDRegressor(random_state=RAND), '')

RMSE = 10903.221784725252
R2 = 0.6099110590855586
MAE = 7608.611081839651
CPU times: user 2min 40s, sys: 250 ms, total: 2min 40s
Wall time: 2min 58s


In [None]:
log_sgd_pred = cross_linear(SGDRegressor(random_state=RAND), 'log')

In [None]:
linear_pred = cross_linear(LinearRegression(), '')

RMSE = 11029.3804397552
R2 = 0.6008315752823272
MAE = 7624.284351785027


In [None]:
from sklearn.linear_model import Lasso

In [None]:
lasso_pred = cross_linear(Lasso(random_state=RAND), '')

### Часть 2, OrdinalEncoder для моделей на осневе деревьев




In [None]:
categorical_pipeline_ord = Pipeline([
    ('loc_category', LocationCategory('location_normalized', 3)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])
text_pipeline_ord = Pipeline([('clear', TextClear()),
                         # ('lemma', TextLemma(var.nlp)),
                         ('tfidf', TfidfVectorizer(stop_words='english', min_df=5)),
                         ('PCA', TruncatedSVD(n_components=800,  random_state=RAND))
                         ])

In [None]:
preprosessor_ord = make_column_transformer((text_pipeline_ord, text_features),
                                       (categorical_pipeline_ord, categorical_features))

In [None]:
features_ord = preprosessor_ord.fit_transform(df_train)
target_ord = df_train['salary_normalized']

In [None]:
valid_f_ord =  preprosessor_ord.transform(df_valid)

Несколько наборов параметров были отобраны на небольшой выборке.

In [None]:
def cross_linear_o(model_):
  m = model_.fit(features_ord[0:80_000], target_ord[0:80_000])
  pred = m.predict(valid_f_ord)
  metrics(valid_t, pred)
  return pred

In [None]:
ert_model = ExtraTreesRegressor(random_state=RAND, max_depth=None, bootstrap=True, n_estimators = 50)

In [None]:
gbr_model = GradientBoostingRegressor(random_state=RAND, n_estimators=50)

In [None]:
%%time
gbr_pred = cross_linear_o(gbr_model)

RMSE = 13418.828958871363
R2 = 0.4091419182528737
MAE = 9516.551299433126
CPU times: user 6min 10s, sys: 483 ms, total: 6min 11s
Wall time: 6min 23s


In [None]:
%%time
ert_pred = cross_linear_o(ert_model)

RMSE = 11928.522879273647
R2 = 0.5330963366531114
MAE = 8137.258840984294
CPU times: user 12min 35s, sys: 711 ms, total: 12min 36s
Wall time: 12min 33s
