# Notebook to preprocess the data

In [1]:
import json
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
import sys
# Extend the module search path with two directories relative to the notebook's
# working directory. Presumably this is what lets the `utils` import in the
# next cell resolve -- TODO confirm where `utils` actually lives.
sys.path.append('../../configs/')
sys.path.append('../../')

In [3]:
# with this line I have access to original headers as lists
from utils import load_train_datasets, load_test_datasets, print_logs

In [4]:
# Locations used throughout the notebook. All of them are relative paths and
# end with "/" because they are used as plain string prefixes (file names are
# appended directly, without os.path.join).
path2train = "../../DataHackaton/train/"   # passed to load_train_datasets
path2test = "../../DataHackaton/test/"     # passed to load_test_datasets
path2target = "../targets/"                # where create_target writes its CSV
path2features = "../features/"             # extra feature CSVs in, final pickles out

## Read the data

In [5]:
# Load the raw train/test tables through the project helpers. The last value
# returned by each loader is discarded (`_`). Note that the train loader
# returns one more table (app_stages_df) than the test loader.
candidates_df, vacants_df, application_df, app_stages_df, stages_df,_ = load_train_datasets(path2train)
candidates_test_df, vacants_test_df, application_test_df, stages_test_df,_ = load_test_datasets(path2test)

# Pre-computed video-size features, indexed by their "id" column.
videosize_df = pd.read_csv(f"{path2features}videosize_train.csv").set_index("id")
videosize_test_df = pd.read_csv(f"{path2features}videosize_test.csv").set_index("id")

  mask |= (ar1 == a)


In [6]:
# Pre-computed profile sentiment features (semicolon-separated CSVs), indexed by "id".
train_profile_sentiment = pd.read_csv(f"{path2features}train_profile_sentiment.csv", sep=";").set_index("id")
test_profile_sentiment = pd.read_csv(f"{path2features}test_profile_sentiment.csv", sep=";").set_index("id")


In [7]:
# "fillna" variant of the profile sentiment features (semicolon-separated CSVs),
# indexed by "id". Presumably the same score with missing values imputed -- TODO confirm.
train_profile_sentiment_fillna = pd.read_csv(f"{path2features}train_profile_sentiment_fillna.csv", sep=";").set_index("id")
test_profile_sentiment_fillna = pd.read_csv(f"{path2features}test_profile_sentiment_fillna.csv", sep=";").set_index("id")

# Creación de la variable target

In [8]:
def check_afin(grupo):
    """Tell whether a group of stage rows contains both stage types 0 and 1.

    Parameters
    ----------
    grupo : pandas.DataFrame
        Rows of a single group; must expose a ``stage_type`` column.

    Returns
    -------
    int
        1 if both the value 0 and the value 1 occur in ``stage_type``,
        otherwise 0.
    """
    stage_types = grupo["stage_type"].values
    covers_both = all(code in stage_types for code in (0, 1))
    return 1 if covers_both else 0

In [9]:
def create_target(path_to_target, app_stages=None, stages=None):
    """Build the binary ``afin`` target per application and save it as CSV.

    An application gets ``afin == 1`` when it has rows with status
    ``"accepted"`` in both a stage of type 0 and a stage of type 1; every
    other application gets 0.

    Parameters
    ----------
    path_to_target : str
        Prefix the file name ``app_id_afin.csv`` is appended to (plain string
        concatenation, so it should end with a path separator).
    app_stages : pandas.DataFrame, optional
        Application/stage rows with ``application_id``, ``stage_id`` and
        ``status`` columns. Defaults to the notebook-level ``app_stages_df``.
    stages : pandas.DataFrame, optional
        Stage table whose index is matched against ``stage_id`` and which
        provides ``stage_type``. Defaults to the notebook-level ``stages_df``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``application_id`` with columns
        ``application_id`` and ``afin``.
    """
    # Fall back to the frames loaded earlier in the notebook so existing
    # calls of the form create_target(path) keep working unchanged.
    if app_stages is None:
        app_stages = app_stages_df
    if stages is None:
        stages = stages_df

    app_stage = app_stages.join(stages, on="stage_id")

    is_accepted = app_stage["status"] == "accepted"
    is_target_stage = app_stage["stage_type"].isin([0, 1])
    accepted_post_atr = app_stage[is_accepted & is_target_stage]

    # After the filter stage_type can only be 0 or 1, so "both present" is the
    # same as "two distinct values". This replaces the per-group Python apply
    # (and the tuple column selection on the groupby, which pandas >= 2.0
    # rejects) with a vectorised aggregation that also copes with an empty
    # selection.
    afines = (
        accepted_post_atr.groupby("application_id")["stage_type"]
        .nunique()
        .eq(2)
        .astype(int)
        .rename("afin")
        .reset_index()
    )

    unique_applications = app_stage[["application_id"]].drop_duplicates()
    # Outer merge is kept on purpose: it determines the row order of the result.
    target = unique_applications.merge(afines, on="application_id", how="outer")
    # Applications without any accepted type-0/1 stage have no match -> 0.
    target["afin"] = target["afin"].fillna(0)

    target.to_csv(f"{path_to_target}app_id_afin.csv")
    print(f"target created at {path_to_target}app_id_afin.csv")
    return target

# Creación de la tabla de features

### 1. Creación de features en candidates

In [10]:
def create_features_cantidates(candidates_df, videosize=None, profile_sentiment=None, profile_sentiment_fillna=None, show_logs=False):
    """Derive candidate-level features on a copy of ``candidates_df``.

    Parameters
    ----------
    candidates_df : pandas.DataFrame
        Candidate table. Must provide ``profile_description``, ``studies``,
        ``experiences``, ``birthdate``, ``available_to_move`` and
        ``has_video``; the optional merges below join on ``id``.
    videosize, profile_sentiment, profile_sentiment_fillna : pandas.DataFrame, optional
        Extra per-candidate tables, each left-merged on ``id`` when given.
        In ``profile_sentiment_fillna`` the column
        ``profile_description_sentiment`` is renamed with a ``_fillna``
        suffix before merging.
    show_logs : bool
        Forwarded to ``print_logs`` with every progress message.

    Returns
    -------
    pandas.DataFrame
        The copy with the merged columns plus ``longitud_descripcion``,
        ``cantidad_estudios``, ``cantidad_experiences``, ``birth_year``,
        ``tiene_video`` and an integer ``available_to_move``.
    """
    print_logs("Computing Features on candidates: ...", show_logs)

    enriched = candidates_df.copy()
    join_on_id = dict(left_on="id", right_on="id", how="left")

    # Optional external features, each attached with a left join on "id".
    if videosize is not None:
        enriched = enriched.merge(videosize, **join_on_id)
        print_logs(f"computed videosize,  {enriched.shape}", show_logs)

    if profile_sentiment is not None:
        enriched = enriched.merge(profile_sentiment, **join_on_id)
        print_logs(f"computed profile sentiment,  {enriched.shape}", show_logs)

    if profile_sentiment_fillna is not None:
        # Rename first so this column does not clash with the plain sentiment one.
        renamed_fillna = profile_sentiment_fillna.rename(
            columns={"profile_description_sentiment": "profile_description_sentiment_fillna"}
        )
        enriched = enriched.merge(renamed_fillna, **join_on_id)
        print_logs(f"computed profile sentiment fillna,  {enriched.shape}", show_logs)

    # Length (in characters) of the free-text profile description.
    description = enriched["profile_description"]
    enriched["longitud_descripcion"] = description.str.len()
    print_logs(f"computed longitud_descripcion  {enriched.shape}", show_logs)

    # Number of entries in the JSON-encoded "studies" field.
    # NOTE: json.loads raises on values that are not str/bytes (e.g. NaN).
    decoded_studies = enriched["studies"].apply(json.loads)
    enriched["cantidad_estudios"] = decoded_studies.apply(len)
    print_logs(f"computed cantidad_estudios  {enriched.shape}", show_logs)

    # Number of entries in the JSON-encoded "experiences" field.
    decoded_experiences = enriched["experiences"].apply(json.loads)
    enriched["cantidad_experiences"] = decoded_experiences.apply(len)
    print_logs(f"computed cantidad_experiences  {enriched.shape}", show_logs)

    # Year of birth; unparseable dates become NaT and therefore NaN.
    birthdates = pd.to_datetime(enriched["birthdate"], errors="coerce")
    enriched["birth_year"] = birthdates.dt.year
    print_logs(f"computed birth_year  {enriched.shape}", show_logs)

    # Missing availability counts as "not available" (0).
    availability = enriched["available_to_move"].fillna(0)
    enriched["available_to_move"] = availability.astype(int)
    print_logs(f"changed: available to move  {enriched.shape}", show_logs)

    # 1 when has_video holds any non-null value, else 0.
    enriched["tiene_video"] = enriched.has_video.notna().astype(int)

    return enriched

### 2. Creación de features en vacants

In [11]:
def create_features_vacants(vacants_df, feature_cols, show_logs=False):
    """Select the vacancy columns used as features and add ``publish_year``.

    Parameters
    ----------
    vacants_df : pandas.DataFrame
        Vacancy table. Must contain every column in ``feature_cols`` and a
        ``publish_date`` column.
    feature_cols : list of str
        Columns of ``vacants_df`` to keep.
    show_logs : bool
        Forwarded to ``print_logs`` with every progress message.

    Returns
    -------
    pandas.DataFrame
        New frame (``vacants_df`` is left untouched) with ``feature_cols``
        plus ``publish_year``. ``titles_and_studies`` is lower-cased when it
        is among the selected columns.
    """
    print_logs("Computing Features on vacants: ...", show_logs)

    # Copy the selection explicitly: adding a column to a bare `df[cols]`
    # selection is what triggers pandas' SettingWithCopyWarning. Copying only
    # the selected columns also avoids duplicating the whole input frame.
    vacants_ft = vacants_df[feature_cols].copy()

    if "titles_and_studies" in vacants_ft.columns:
        vacants_ft["titles_and_studies"] = vacants_ft["titles_and_studies"].str.lower()

    # Taken from the original frame, so it works even when publish_date is not
    # one of the selected feature columns; unparseable dates become NaN.
    publish_dates = pd.to_datetime(vacants_df["publish_date"], errors="coerce")
    vacants_ft["publish_year"] = publish_dates.dt.year

    print_logs(f"shape of vacants_features:  {vacants_ft.shape}", show_logs)

    return vacants_ft

### 3. Creación de features mixtas

In [12]:
def create_features_mixtas(features, show_logs=False):
    """Add features that combine application, candidate and vacancy columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    features : pandas.DataFrame
        Merged table. Must contain ``publish_date``, ``created_at``,
        ``birth_year``, ``education_level``, ``education_level_vac``,
        ``salary``, ``min_salary`` and ``max_salary``.
    show_logs : bool
        Forwarded to ``print_logs`` with every progress message.

    Returns
    -------
    pandas.DataFrame
        ``features`` with ``publish_date``/``created_at`` converted to
        datetimes, both education columns converted to ordered categoricals,
        and the new columns ``total_hours2apply``, ``hours2apply``,
        ``age_when_applying``, ``cumple_educacion``, ``cumple_salario``,
        ``education_level_number``, ``education_level_vac_number`` and
        ``education_difference``.
    """
    print_logs("Computing mixed features: ...", show_logs)

    # ---- 1. Date-based features -------------------------------------------
    print_logs("setting publish_date to datetime", show_logs)
    features["publish_date"] = pd.to_datetime(features.publish_date, errors="coerce")

    print_logs("setting created at to datetime. It may require some minutes", show_logs)
    features["created_at"] = pd.to_datetime(features.created_at, errors="coerce")

    print_logs("dates set to datetime :D", show_logs)

    # Time between the vacancy being published and the application being created.
    elapsed = features["created_at"] - features["publish_date"]

    features["total_hours2apply"] = elapsed.dt.total_seconds() / 3600
    print_logs(f"computed total_hours2apply  {features.shape}", show_logs)

    # `.dt.seconds` is only the seconds component of the timedelta (whole days
    # are excluded), so this is the sub-day remainder expressed in hours.
    features["hours2apply"] = elapsed.dt.seconds / 3600
    print_logs(f"computed hours2apply  {features.shape}", show_logs)

    # Difference of calendar years, not an exact age.
    application_year = features["created_at"].dt.year
    features["age_when_applying"] = application_year - features.birth_year
    print_logs(f"computed age_when_applying  {features.shape}", show_logs)

    # ---- 2. Education and salary features ---------------------------------
    # Exact match of the raw education labels (done before they become categoricals).
    same_education = features["education_level_vac"] == features["education_level"]
    features["cumple_educacion"] = same_education.astype(int)
    print_logs(f"computed cumple_educacion  {features.shape}", show_logs)

    # Candidate salary inside the vacancy's [min_salary, max_salary] range (inclusive).
    salary_in_range = features["salary"].between(features["min_salary"], features["max_salary"])
    features["cumple_salario"] = salary_in_range.astype(int)
    print_logs(f"computed cumple_salario  {features.shape}", show_logs)

    # Education labels from lowest to highest; the position is the numeric level.
    ordered_education = [
        'Básica primaria',
        'Bachillerato (grados 6°, 7° u 8°)',
        'Bachillerato (grados 9°, 10° y 11°)',
        'Bachillerato completo',
        'Técnico',
        'Tecnólogo',
        'Profesional',
        'Especialización/ Maestría',
        'Doctorado',
    ]

    def _as_ordered_level(column):
        # Labels outside `ordered_education` become missing in the categorical.
        return pd.Categorical(features[column], categories=ordered_education, ordered=True)

    features["education_level"] = _as_ordered_level("education_level")
    print_logs(f"computed education_level  {features.shape}", show_logs)

    print_logs(f"computing education_level_vac  {features.shape}", show_logs)
    features["education_level_vac"] = _as_ordered_level("education_level_vac")

    # Categorical codes use -1 for missing values; turn those into NaN.
    for source_col in ("education_level", "education_level_vac"):
        number_col = f"{source_col}_number"
        features[number_col] = features[source_col].cat.codes.replace(-1, np.nan)
        print_logs(f"computed {number_col}  {features.shape}", show_logs)

    # Positive when the candidate's level is above the vacancy's level.
    level_gap = features["education_level_number"] - features["education_level_vac_number"]
    features["education_difference"] = level_gap
    print_logs(f"computed education_difference  {features.shape}", show_logs)

    return features
    

### 4. Creación de los datasets

In [13]:
def create_dataset(train_or_test, cands, vacs, app, videosize, prof_sent, prof_sent_fillna):
    """Assemble the modelling table: one row per application with all features.

    Parameters
    ----------
    train_or_test : str
        ``"train"`` additionally builds the target (written under the
        notebook-level ``path2target``) and merges it in; any other value
        returns the features only.
    cands, vacs, app : pandas.DataFrame
        Candidate, vacancy and application tables. ``app`` must provide
        ``vacant_id``, ``candidate_id`` and ``created_at``, and an ``id``
        that ``reset_index`` turns into a column.
    videosize, prof_sent, prof_sent_fillna : pandas.DataFrame or None
        Optional per-candidate tables forwarded to
        ``create_features_cantidates``.

    Returns
    -------
    pandas.DataFrame
        Feature table; for ``"train"`` it is right-merged with the target on
        ``application_id``, so every application in the target is kept.
    """
    vacants_feature_cols = [
        "min_salary",
        "max_salary",
        "salary_type",
        "education_level_vac",
        "experience_and_positions",
        "knowledge_and_skills",
        "titles_and_studies",
        "number_of_quotas",
        "publish_date",
    ]
    # Identifying/raw columns removed once the derived features exist.
    cols2delete = [
        "email",
        "first_name",
        "last_name",
        "phone",
        "profile_description",
        "has_video",
        "studies",
        "experiences",
        "psy_tests",
        "identification_number",
        "country_birth",
        "birthdate",
        "civil_status",
        "title_or_profession",
        "created_at",
        "publish_date",
    ]

    candidate_feats = create_features_cantidates(cands, videosize, prof_sent, prof_sent_fillna, True)
    vacant_feats = create_features_vacants(vacs, vacants_feature_cols, True)

    # Base table: one row per application, with its id exposed as "application_id".
    base = (
        app[["vacant_id", "candidate_id", "created_at"]]
        .reset_index()
        .rename(columns={"id": "application_id"})
    )

    # Default (inner) merges: applications without a matching candidate or
    # vacancy are dropped here.
    with_candidates = base.merge(candidate_feats, left_on="candidate_id", right_on="id")
    with_vacants = with_candidates.merge(vacant_feats, left_on="vacant_id", right_on="id")

    features = create_features_mixtas(with_vacants, True).drop(columns=cols2delete)

    if train_or_test != "train":
        return features

    target = create_target(path2target)
    return features.merge(target, left_on="application_id", right_on="application_id", how="right")

### Correr las funciones

In [14]:
# Build the training table; the "train" mode also creates and merges the target.
train_dataset = create_dataset("train", candidates_df, vacants_df, application_df, videosize_df, train_profile_sentiment, train_profile_sentiment_fillna)

Computing Features on candidates: ...
computed videosize,  (548364, 23)
computed profile sentiment,  (548364, 24)
computed profile sentiment fillna,  (548364, 25)
computed longitud_descripcion  (548364, 26)
computed cantidad_estudios  (548364, 27)
computed cantidad_experiences  (548364, 28)
computed birth_year  (548364, 29)
changed: available to move  (548364, 29)
Computing Features on vacants: ...
shape of vacants_features:  (11693, 10)
Computing mixed features: ...
setting publish_date to datetime
setting created at to datetime. It may require some minutes
dates set to datetime :D
computed total_hours2apply  (2120287, 45)
computed hours2apply  (2120287, 46)
computed age_when_applying  (2120287, 47)
computed cumple_educacion  (2120287, 48)
computed cumple_salario  (2120287, 49)
computed education_level  (2120287, 49)
computing education_level_vac  (2120287, 49)
computed education_level_number  (2120287, 50)
computed education_level_vac_number  (2120287, 51)
computed education_differen

  


target created at ../targets/app_id_afin.csv


In [15]:
# Build the test table with the same feature pipeline (no target is attached).
test_dataset = create_dataset("test", candidates_test_df, vacants_test_df, application_test_df, videosize_test_df, test_profile_sentiment, test_profile_sentiment_fillna)

Computing Features on candidates: ...
computed videosize,  (119380, 23)
computed profile sentiment,  (119380, 24)
computed profile sentiment fillna,  (119380, 25)
computed longitud_descripcion  (119380, 26)
computed cantidad_estudios  (119380, 27)
computed cantidad_experiences  (119380, 28)
computed birth_year  (119380, 29)
changed: available to move  (119380, 29)
Computing Features on vacants: ...
shape of vacants_features:  (11111, 10)
Computing mixed features: ...
setting publish_date to datetime
setting created at to datetime. It may require some minutes
dates set to datetime :D
computed total_hours2apply  (664554, 45)
computed hours2apply  (664554, 46)
computed age_when_applying  (664554, 47)
computed cumple_educacion  (664554, 48)
computed cumple_salario  (664554, 49)
computed education_level  (664554, 49)
computing education_level_vac  (664554, 49)
computed education_level_number  (664554, 50)
computed education_level_vac_number  (664554, 51)
computed education_difference  (6645

In [16]:
# Quick visual check of the first training rows (note the trailing `afin` target column).
train_dataset.head()

Unnamed: 0,application_id,vacant_id,candidate_id,gender,identification_type,city,education_level,salary,without_experience,without_studies,...,publish_year,total_hours2apply,hours2apply,age_when_applying,cumple_educacion,cumple_salario,education_level_number,education_level_vac_number,education_difference,afin
0,33,2,6,,0.0,Medellin,,,False,False,...,2017.0,0.043611,0.043611,21.0,0,0,,,,1.0
1,34,2,7,,0.0,Medellin,,,False,False,...,2017.0,0.043611,0.043611,28.0,0,0,,,,1.0
2,35,2,8,,0.0,Cravo Norte,Bachillerato completo,,False,False,...,2017.0,0.043611,0.043611,26.0,0,0,3.0,,,1.0
3,36,2,1,,1.0,Puerto Narino,Bachillerato completo,,False,False,...,2017.0,0.043611,0.043611,26.0,0,0,3.0,,,0.0
4,37,2,9,,1.0,Medellin,Bachillerato completo,,False,False,...,2017.0,0.043611,0.043611,23.0,0,0,3.0,,,0.0


In [17]:
# Quick visual check of the first test rows.
test_dataset.head()

Unnamed: 0,application_id,vacant_id,candidate_id,gender,identification_type,city,education_level,salary,without_experience,without_studies,...,number_of_quotas,publish_year,total_hours2apply,hours2apply,age_when_applying,cumple_educacion,cumple_salario,education_level_number,education_level_vac_number,education_difference
0,21,1,3,female,0.0,Bogota,Profesional,,False,False,...,,2017.0,17.701667,17.701667,25.0,0,0,6.0,,
1,22,1,4,male,0.0,Abriaqui,Profesional,,False,False,...,,2017.0,17.701944,17.701944,25.0,0,0,6.0,,
2,23,1,5,male,0.0,Medellin,Técnico,,False,False,...,,2017.0,17.701944,17.701944,21.0,0,0,4.0,,
3,30,2,3,female,0.0,Bogota,Profesional,,False,False,...,,2017.0,0.043333,0.043333,25.0,0,0,6.0,,
4,31,2,4,male,0.0,Abriaqui,Profesional,,False,False,...,,2017.0,0.043611,0.043611,25.0,0,0,6.0,,


### Guardar los datasets

In [18]:
# Persist both tables as pickles under path2features (pickle keeps the dtypes,
# e.g. the ordered categoricals, which a CSV round-trip would lose).
train_dataset.to_pickle(path2features+"train_dataset.pkl")
test_dataset.to_pickle(path2features+"test_dataset.pkl")