# Input Data Sources

This notebook shows an example of how to load raw data from a CSV file and how to define metadata. Other solutions are possible, including loading data and metadata from a databases or from binary files, depending on an organization's information system.

In [10]:
%load_ext autoreload
%autoreload 2

# global imports
import pandas as pd
import numpy as np
from ast import literal_eval
# local imports
from findhr.preprocess.metadata import JSONMetadata, validate_schema

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Load CDS (Candidate Data Source) DataFrame from a CSV file. 
# Here, the CSV file is a sample of the interaction log synthetic data generated in the FINDHR project.
# The converters argument is specific of an organization's information system, because it depends on the source format of raw data.
# Only candidate_id is mandatory, all other columns are features of the candidate available in the organization's information system.

df_CDS = pd.read_csv('../../../data/upf_gen/candidates.csv', index_col=0,
            converters={'professional_experience_c': literal_eval,
                        'education_background_c': literal_eval,
                        'skills_c': literal_eval})
df_CDS.head()

Unnamed: 0,id_c,education_background_c,professional_experience_c,skills_c,gender_c,agg_perceived_foreign_c
5,5,[{'institution': 'Complutense University Of Ma...,"[{'institution': 'Stylo Milano', 'start_date':...","[Communications, Social Integration, Microsoft...",Man,No
6,6,[{'institution': 'Coronel Rosales Agricultural...,"[{'institution': 'Securitas Direct', 'start_da...","[Refinancing, Economy, Microsoft Excel, Collec...",Man,No
10,10,[{'institution': 'Complutense University Of Ma...,"[{'institution': 'Carrefour Express', 'start_d...","[Entrepreneurship, Literacy, Web Design, Adobe...",Woman,No
11,11,"[{'institution': 'Les Ribera De Los Molinos', ...","[{'institution': 'Decimas Sl', 'start_date': '...","[Consulting, Sap Crm, Collections, Automation,...",Woman,No
15,15,[{'institution': 'Escuela Politcnica Superior ...,"[{'institution': 'Reintegrate', 'start_date': ...","[Microsoft Word, Biofuels, English, Entreprene...",Man,No


In [25]:
# Load JDS (Job Data Source) DataFrame from a CSV file. 
# Here, the CSV file is a sample of the interaction log synthetic data generated in the FINDHR project.
# The converters argument is specific of an organization's information system, because it depends on the source format of raw data.
# Only job_id is mandatory, all other columns are features of the job available in the organization's information system.

df_JDS = pd.read_csv('../../../data/upf_gen/jobs.csv', index_col=0,
                 converters={'experience_reqs_duration_j': literal_eval,
                             'experience_reqs_role_j': literal_eval,
                             'education_reqs_j': literal_eval,
                             'skills_j': literal_eval
                             })
df_JDS.head()

Unnamed: 0,id_j,education_reqs_j,experience_reqs_role_j,experience_reqs_duration_j,skills_j,gender_j,agg_perceived_foreign_j
0,1,[Bachelor's Degree in Information Society Tech...,"[Data Analyst, Analyst, Embedded Software Engi...",12,"[English, Teamwork, Microsoft Office, Python (...",Man,No
1,2,[Graduated In Administration And Management Cf...,"[Administrative, Administrative Assistant, Ass...",36,"[English, Management, Planning, Microsoft Exce...",Man,No
2,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No
3,4,[Graduated In Administration And Management Cf...,"[Project Manager, Project Leader]",12,"[Organization, Management, Teamwork, Planning,...",Man,No
4,5,"[Law Bachelor, Degree In Law, Higher Degree In...",[Consultant],12,"[Punctuality, Organization, Accounting, Englis...",Man,No


In [41]:
# Create fictious ADS (Application Data Source) DataFrame (not present in the interaction log synthetic data).
# first 10 candidates apply to first job position, second 10 candidates apply to second job position
# Only job_id, candidate_id and *one* target feature are mandatory.
TOP_K = 10
df_ADS = pd.read_csv('../../../data/upf_gen/ADS.csv', index_col=0)
# ranking is based on descending score
df_ADS['ranking'] = df_ADS.groupby('id_j')['score'].rank('dense', ascending=False).astype(int)
# shortlisted is top10 based on descending score
df_ADS['shortlisted'] = (df_ADS['ranking'] <= TOP_K).astype(int)
df_ADS

Unnamed: 0,id_c,id_j,score,ranking,shortlisted
0,6,3,0.000000,55,0
1,11,3,0.492754,25,0
3,15,3,0.453089,36,0
4,17,3,0.049689,49,0
5,19,3,0.080268,46,0
...,...,...,...,...,...
1537,2204,1,0.550725,13,0
1538,2211,1,0.550725,13,0
1539,2213,1,0.478261,35,0
1540,2214,1,0.478261,35,0


In [35]:
# Metadata for CDS is a dictionary mapping column names to JSONMetadata object
md_CDS = {
    'id_c': JSONMetadata(schema={'type': 'number'}),
    'education_background_c' : JSONMetadata(schema={'type': 'array',
                                                  'items': {'type': 'object',
                                                            'properties': {'institution' : {'type': "string"},
                                                                           'end_date': {'type': "string"},
                                                                           'degree': {'type': "string"},
                                                                           'duration': {'type': "string"}
                                                                           }
                                                            }
                                                  },
                                           attr_type='object',
                                           attr_usage='default'
                                                            ),
    'professional_experience_c':  JSONMetadata(schema={'type': 'array',
                                                                       'items': {'type': 'object',
                                                                              'properties': {'institution' : {'type': "string"},
                                                                                   'end_date': {'type': "string"},
                                                                                   'role': {'type': "string"},
                                                                                   'duration': {'type': "string"}
                                                                                   }
                                                                              }
                                                                    },
                                                             attr_type='object',
                                                             attr_usage='default'
                                                               ),
    'skills_c': JSONMetadata(schema={'type': "array",
                                     'items': {'type': 'string'}
                                     },
                             attr_type='object',
                             attr_usage='default'
                             ),
    'gender_c': JSONMetadata(schema={'enum': ['Man', 'Woman', 'Any']},
                             attr_type='category',
                             attr_usage='sensitive'),
    'agg_perceived_foreign_c': JSONMetadata(schema={'enum': ['No', 'Yes', 'Any']},
                                            attr_type='category',
                                            attr_usage='sensitive'),
                                            }

In [36]:
# Metadata for JDS is a dictionary mapping column names to JSONMetadata object.
md_JDS = {
    'id_j': JSONMetadata(schema={'type': 'number'}),
    'education_reqs_j': JSONMetadata(schema={'type': "array",
                                     'items': {'type': 'string'}
                                     },
                                     attr_type='object',
                                     attr_usage='default'
                             ),
    'experience_reqs_role_j': JSONMetadata(schema={'type': "array",
                                                   'items': {'type': 'string'}
                                                   },
                                           attr_type='object',
                                           attr_usage='default'
                             ),
    'experience_reqs_duration_j': JSONMetadata(schema={'type': 'number'},
                                               attr_type='object',
                                               attr_usage='default'),
    'skills_j': JSONMetadata(schema={'type': "array",
                                     'items': {'type': 'string'}
                                     },
                             attr_type='object',
                             attr_usage='default'
                             ),
    'gender_j': JSONMetadata(schema={'enum': ['Man', 'Woman', 'Any']},
                             attr_type='category',
                             attr_usage='sensitive'),
    'agg_perceived_foreign_j': JSONMetadata(schema={'enum': ['No', 'Yes', 'Any']},
                                            attr_type='category',
                                            attr_usage='sensitive'),
}

In [37]:
# Metadata for ADS is a dictionary mapping column names to JSONMetadata object
md_ADS = {
    'score': JSONMetadata(schema={'type': 'number'}, attr_type='numeric', attr_usage='target'),
    'ranking': JSONMetadata(schema={'type': 'integer'}, attr_type='ordinal', attr_usage='target'),
    'shortlisted': JSONMetadata(schema={'type': 'integer'}, attr_type='category', attr_usage='target'),
}

In [38]:
# Uncomment to Validate JSON schema.
# print('{} Validating schema for df_CDS {}'.format('-' * 20, '-' * 20))
# validate_schema(df_CDS, md_CDS)
# print('{} Validating schema for df_JDS {}'.format('-' * 20, '-' * 20))
# validate_schema(df_JDS, md_JDS)
# print('{} Validating schema for df_ADS {}'.format('-' * 20, '-' * 20))
# validate_schema(df_ADS, md_ADS)

In [39]:
# Join metadata (assume no clash of column names).
md_all =  {**md_CDS, **md_JDS, **md_ADS}
md_all

{'id_c': 
 	SCHEMA = {'type': 'number'}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'education_background_c': 
 	SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'degree': {'type': 'string'}, 'duration': {'type': 'string'}}}}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'professional_experience_c': 
 	SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'role': {'type': 'string'}, 'duration': {'type': 'string'}}}}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'skills_c': 
 	SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'gender_c': 
 	SCHEMA = {'enum': ['Man', 'Woman', 'Any']}
 	ATTR_TYPE = category
 	ATTR_USAGE = sensitive
 	KNOWLEDGE_BASE = None,
 'agg_perceived_f

In [40]:
# Join raw data.
df_all = pd.merge(df_JDS, df_ADS, on='id_j')
df_all = pd.merge(df_CDS, df_all, on='id_c')
df_all.head()

Unnamed: 0,id_c,education_background_c,professional_experience_c,skills_c,gender_c,agg_perceived_foreign_c,id_j,education_reqs_j,experience_reqs_role_j,experience_reqs_duration_j,skills_j,gender_j,agg_perceived_foreign_j,score,ranking,shortlisted
0,5,[{'institution': 'Complutense University Of Ma...,"[{'institution': 'Stylo Milano', 'start_date':...","[Communications, Social Integration, Microsoft...",Man,No,5,"[Law Bachelor, Degree In Law, Higher Degree In...",[Consultant],12,"[Punctuality, Organization, Accounting, Englis...",Man,No,0.0,138,0
1,6,[{'institution': 'Coronel Rosales Agricultural...,"[{'institution': 'Securitas Direct', 'start_da...","[Refinancing, Economy, Microsoft Excel, Collec...",Man,No,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No,0.0,89,0
2,10,[{'institution': 'Complutense University Of Ma...,"[{'institution': 'Carrefour Express', 'start_d...","[Entrepreneurship, Literacy, Web Design, Adobe...",Woman,No,5,"[Law Bachelor, Degree In Law, Higher Degree In...",[Consultant],12,"[Punctuality, Organization, Accounting, Englis...",Man,No,0.492754,55,0
3,11,"[{'institution': 'Les Ribera De Los Molinos', ...","[{'institution': 'Decimas Sl', 'start_date': '...","[Consulting, Sap Crm, Collections, Automation,...",Woman,No,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No,0.492754,35,0
4,15,[{'institution': 'Escuela Politcnica Superior ...,"[{'institution': 'Reintegrate', 'start_date': ...","[Microsoft Word, Biofuels, English, Entreprene...",Man,No,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No,0.453089,49,0
