In [1]:
import pandas as pd
from pathlib import Path
import sys
import numpy as np
from numba import jit
import multiprocessing as mp

PATH = '/Users/danil/Documents/github/jooble'
sys.path.append(str(PATH))

In [2]:
from src.jooble import prepare_csv, preprocess

# used functions

In [None]:
def extract_features(df):
    parsed_features = df['features'].str.split(',',
                                               expand=True).astype(np.int16)
    parsed_features.insert(0, 'id_job', df['id_job'])
    return parsed_features

In [None]:
def read_and_clean(path, separator="\t", chunksize=1000):

    batch_reader = pd.read_csv(filepath_or_buffer=path,
                               sep=separator,
                               chunksize=chunksize,
                               engine='c')

    with mp.Pool(mp.cpu_count()) as pool:
        result = pool.map(extract_features, batch_reader)
        data = pd.concat(result)

    return data

In [None]:
@jit(nopython=True)
def fit_z_score_normalizator(X, features_num):
    X_t = X.T
    means = []
    sigmas = []
    for i in range(features_num):
        means.append(np.mean(X_t[i]))
        sigmas.append(np.std(X_t[i]))
    return means, sigmas


@jit(nopython=True)
def fit_min_max_scaler(X, features_num):
    X_t = X.T
    min_v = []
    max_v = []
    for i in range(features_num):
        min_v.append(np.min(X_t[i]))
        max_v.append(np.max(X_t[i]))
    return min_v, max_v


class preprocess:
    def __init__(self, normalization='z-score'):

        self.normalization = normalization

    def get_features_num(self, X):

        self.factor = X[0].value_counts().index[0]
        if self.factor == 2:
            self.features_num = 256
        # in case of selecting special dtype, further we can select only specific column type
        # self.dtypes = int
        # here wa can add new factor check for features selection or checks
        # or redefine normalization
        # self.normalization = 'z-score'

    def fit_scaler(self, X):
        X = X.iloc[:, 2:2 + self.features_num].values

        if self.normalization == 'z-score':
            self.mean, self.sigma = fit_z_score_normalizator(
                X, self.features_num)
        elif self.normalization == 'min-max-scaler':
            self.min, self.max = fit_min_max_scaler(X, self.features_num)
        return

    def transform_with_scaler(self, X):

        X_to_transform = X.iloc[:, 2:2 + self.
                                features_num]  #.select_dtypes(include = self.dtypes)

        if self.normalization == 'z-score':
            X_to_transform = (X_to_transform - self.mean) / self.sigma

        elif self.normalization == 'min-max-scaler':
            X_to_transform = (X_to_transform - self.min) / (self.max -
                                                            self.min)

        X.iloc[:, 2:2 + self.features_num] = X_to_transform

        return X

    def add_max_feature_index(self, X):

        X['max_feature_2_index'] = X.iloc[:, 2:2 +
                                          self.features_num].values.argmax(
                                              axis=1)

        return X

    def add_max_feature_2_abs_mean_diff(self, X):

        max_values = X.iloc[:, 2:2 + self.features_num].values.max(axis=1)
        mean_values = X.iloc[:, 2:2 + self.features_num].values.mean(axis=0)

        if 'max_feature_2_index' in X.columns:
            indexes = X['max_feature_2_index']
        else:
            indexes = self.add_max_feature_index(X)['max_feature_2_index']

        X['max_feature_2_abs_mean_diff'] = np.abs(max_values -
                                                  mean_values[indexes])

        return X

# open data

In [3]:
train_features = prepare_csv(path = PATH + '/data/raw/train.tsv')

In [4]:
prep = preprocess()

prep.get_features_num(train_features)
prep.fit_scaler(train_features)

In [5]:
test_features = prepare_csv(PATH + '/data/raw/test.tsv')

test_features = prep.add_max_feature_index(test_features)

test_features = prep.add_max_feature_2_abs_mean_diff(test_features)

test_features = prep.transform_with_scaler(test_features)

test_features = test_features.rename(columns={i: f'feature_2_stand_{i}' for i in range(1, 1 + prep.features_num)})

test_features = test_features.drop(columns=[0])

test_features.head()

Unnamed: 0,id_job,feature_2_stand_1,feature_2_stand_2,feature_2_stand_3,feature_2_stand_4,feature_2_stand_5,feature_2_stand_6,feature_2_stand_7,feature_2_stand_8,feature_2_stand_9,...,feature_2_stand_249,feature_2_stand_250,feature_2_stand_251,feature_2_stand_252,feature_2_stand_253,feature_2_stand_254,feature_2_stand_255,feature_2_stand_256,max_feature_2_index,max_feature_2_abs_mean_diff
0,-9168029089769934451,0.769135,0.234307,0.720399,0.696881,-0.576915,0.80313,0.258285,0.706847,0.808236,...,0.829606,0.772527,0.120503,0.802841,-0.50201,0.184573,0.79824,0.731307,161,154.17
1,-9167993139315005259,-0.496788,0.234307,-0.162567,0.092944,0.584295,-0.39882,0.294103,0.030519,-0.419534,...,-0.189921,0.021303,0.438698,0.20112,0.419363,0.468833,-0.005895,-0.275757,161,154.17
2,-9167993136660569470,-0.502214,0.234307,-0.447395,-0.09139,0.419332,-0.13172,0.272612,-0.22539,-0.333323,...,-0.358414,-0.323721,0.379037,-0.028107,0.436516,0.305383,-0.277769,-0.890851,161,154.17
3,-9167993126042826314,-0.688485,0.232522,-0.874636,-0.564352,0.54548,-0.622516,0.458866,-0.758096,-0.814918,...,-0.895308,-0.703004,0.546657,-1.03416,0.480624,0.497259,-0.890443,-1.017487,161,154.17
4,-9167914043308884846,-1.004966,-0.204754,-1.263901,-1.144035,0.053826,-0.919665,0.532891,-1.72428,-1.575957,...,-1.035243,-1.767441,0.421652,-1.505349,0.284588,0.520947,-1.380583,-0.348121,203,21.32


In [10]:
test_features.head()

Unnamed: 0,id_job,feature_2_stand_1,feature_2_stand_2,feature_2_stand_3,feature_2_stand_4,feature_2_stand_5,feature_2_stand_6,feature_2_stand_7,feature_2_stand_8,feature_2_stand_9,...,feature_2_stand_249,feature_2_stand_250,feature_2_stand_251,feature_2_stand_252,feature_2_stand_253,feature_2_stand_254,feature_2_stand_255,feature_2_stand_256,max_feature_2_index,max_feature_2_abs_mean_diff
0,-9168029089769934451,0.769135,0.234307,0.720399,0.696881,-0.576915,0.80313,0.258285,0.706847,0.808236,...,0.829606,0.772527,0.120503,0.802841,-0.50201,0.184573,0.79824,0.731307,161,154.17
1,-9167993139315005259,-0.496788,0.234307,-0.162567,0.092944,0.584295,-0.39882,0.294103,0.030519,-0.419534,...,-0.189921,0.021303,0.438698,0.20112,0.419363,0.468833,-0.005895,-0.275757,161,154.17
2,-9167993136660569470,-0.502214,0.234307,-0.447395,-0.09139,0.419332,-0.13172,0.272612,-0.22539,-0.333323,...,-0.358414,-0.323721,0.379037,-0.028107,0.436516,0.305383,-0.277769,-0.890851,161,154.17
3,-9167993126042826314,-0.688485,0.232522,-0.874636,-0.564352,0.54548,-0.622516,0.458866,-0.758096,-0.814918,...,-0.895308,-0.703004,0.546657,-1.03416,0.480624,0.497259,-0.890443,-1.017487,161,154.17
4,-9167914043308884846,-1.004966,-0.204754,-1.263901,-1.144035,0.053826,-0.919665,0.532891,-1.72428,-1.575957,...,-1.035243,-1.767441,0.421652,-1.505349,0.284588,0.520947,-1.380583,-0.348121,203,21.32


# save

In [287]:
test_feautres.to_csv(PATH + '/data/processed/test_proc.tsv', sep ='\t', index = False)

In [288]:
pd.read_csv(PATH + '/data/processed/test_proc.tsv', sep ='\t')

Unnamed: 0,id_job,feature_2_stand_1,feature_2_stand_2,feature_2_stand_3,feature_2_stand_4,feature_2_stand_5,feature_2_stand_6,feature_2_stand_7,feature_2_stand_8,feature_2_stand_9,...,feature_2_stand_249,feature_2_stand_250,feature_2_stand_251,feature_2_stand_252,feature_2_stand_253,feature_2_stand_254,feature_2_stand_255,feature_2_stand_256,max_feature_2_index,max_feature_2_abs_mean_diff
0,-9168029089769934451,0.768653,0.234160,0.719948,0.696445,-0.576554,0.802627,0.258123,0.706405,0.807730,...,0.829087,0.772043,0.120428,0.802338,-0.501696,0.184457,0.797741,0.730849,161,154.17
1,-9167993139315005259,-0.496477,0.234160,-0.162465,0.092886,0.583929,-0.398570,0.293919,0.030500,-0.419272,...,-0.189802,0.021290,0.438424,0.200994,0.419100,0.468539,-0.005891,-0.275585,161,154.17
2,-9167993136660569470,-0.501899,0.234160,-0.447115,-0.091333,0.419070,-0.131637,0.272442,-0.225248,-0.333114,...,-0.358190,-0.323519,0.378799,-0.028090,0.436243,0.305192,-0.277595,-0.890293,161,154.17
3,-9167993126042826314,-0.688054,0.232376,-0.874089,-0.563999,0.545139,-0.622126,0.458579,-0.757622,-0.814408,...,-0.894747,-0.702563,0.546315,-1.033512,0.480323,0.496947,-0.889886,-1.016850,161,154.17
4,-9167914043308884846,-1.004337,-0.204625,-1.263110,-1.143318,0.053792,-0.919089,0.532557,-1.723201,-1.574970,...,-1.034594,-1.766334,0.421388,-1.504407,0.284409,0.520621,-1.379719,-0.347903,203,21.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-9163569547596238325,-0.077177,0.234160,-0.263674,0.075918,0.344721,0.095256,0.324942,-0.467948,-0.229131,...,0.141266,-0.113210,0.520762,-0.231720,0.507262,0.433029,-0.059467,-0.143000,161,154.17
96,-9163525656344858443,-0.044645,0.232376,-0.655858,-0.445227,0.661510,-0.542046,0.532557,-1.905878,-1.159038,...,-1.091675,-0.223255,0.475334,-0.578527,0.529302,0.475641,-0.824830,-1.034930,161,154.17
97,-9163472054053566485,-1.315198,0.218107,-2.031664,-1.986846,0.066722,-3.651813,0.379829,-3.205495,-4.400342,...,-4.773373,-2.519533,0.106232,-1.848032,0.152167,0.437764,-1.942261,-0.576912,161,154.17
98,-9163470125725648647,0.849983,0.235944,0.799018,0.754619,0.173396,0.926084,0.513466,0.755989,0.944393,...,0.923269,0.825843,0.512244,0.865973,0.441141,0.653192,0.874277,0.875487,1,172.12
