In [None]:
#!pip install langdetect
#!pip install pyLDAvis
#!pip install spacy==3.0.5
#!pip install spacy-spanish-lemmatizer==0.6
#!spacy download es_core_news_sm
#!python3 -m spacy_spanish_lemmatizer download wiki
#!pip install google_trans_new

In [1]:
import langdetect  # language detection
import google_trans_new as gt
import nltk  # natural language processing
from nltk.corpus import words
from nltk.stem.wordnet import WordNetLemmatizer
import pyLDAvis  # plotting
import pyLDAvis.sklearn  # plotting
import sklearn  # machine learning
import numpy  # arrays and matrices
import pandas  # dataframes
import regex  # regular expressions
import string
from unicodedata import normalize
import spacy
import spacy_spanish_lemmatizer
from html import unescape
from numbers import Number
from joblib import Parallel, delayed
from pytictoc import TicToc

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [2]:
# Silence warnings (added because of the noise coming from pyLDAvis).
# NOTE: 'ignore' with no category or module filter hides every Python warning
# raised after this point, not only the pyLDAvis ones.
import warnings
warnings.filterwarnings('ignore')

# Plotting backend: render matplotlib figures inline in the notebook.
import matplotlib.pyplot
%matplotlib inline

# Timer used through its tic()/toc() calls to time the long-running cells.
tictoc = TicToc()

# Translator client from google_trans_new; do_language_translate() uses it
# to translate English text into Spanish.
translator = gt.google_translator()

  and should_run_async(code)


In [3]:
# Download the NLTK corpora requested by this notebook. nltk.download()
# reports "already up-to-date" when a package is present locally.
nltk.download("words")
nltk.download("wordnet")
nltk.download("stopwords")

# Spanish spaCy model, with its lemmatizer component replaced by the one
# registered as "spanish_lemmatizer" (spacy_spanish_lemmatizer package).
nlp = spacy.load("es_core_news_sm")
nlp.replace_pipe("lemmatizer", "spanish_lemmatizer")

# The stopword list is read from the project's own CSV ("word" column).
# Only real strings are kept; any non-string cell (presumably NaN from blank
# rows - verify against the CSV) is discarded.
stopwords = {
    word
    for word in pandas.read_csv("stopwords/stopwords.csv").word.tolist()
    if isinstance(word, str)
}
print(f'Cantidad de stopwords: {len(stopwords)}')

[nltk_data] Downloading package words to /home/alexander/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/alexander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alexander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cantidad de stopwords: 608


In [4]:
# Read the raw vacancies workbook and show a quick structural summary.
path = 'argos_vacantes.xlsx'
df = pandas.read_excel(io=path, sheet_name="Hoja1")
print(
    "SHAPE:\n{shape}\n".format(shape=df.shape),
    "COLUMN NAMES:\n{names}\n".format(names=list(df.columns)),
    sep="\n",
)
df.head()

SHAPE:
(15, 5)

COLUMN NAMES:
['Encargado', 'Vacante', 'Descripción vacante', 'Requerimientos', 'idioma']



Unnamed: 0,Encargado,Vacante,Descripción vacante,Requerimientos,idioma
0,Tomas,ESTUDIANTE DE PRACTICA EN COMUNICACION SOCIAL ...,"Redactar, editar y publicar información releva...",Comunicación social y periodismo,español
1,Tomas,TERRITORY SALES MANAGER JOB (grupoargos.com),The Territory Sales Manager is an individual c...,English Office,ingles
2,Tomas,SUPERVISION OMM REDUCCION DE PERDIDAS (grupoar...,"Programar, supervisar la ejecución de las acci...",Tecnología electrica,español
3,Tomas,PRODUCTION MANAGER II JOB (grupoargos.com),To manage the production department so that al...,Experience in cement and process manufacturing...,ingles
4,Tomas,ESTRUCTURACION PROYECTOS INTEGRALES (grupoargo...,Liderar la estructuración e implementación de ...,"Ingenieria, Arquitectura, Administracion \n...",español


In [6]:
# Pull the two free-text columns out as plain Python lists and preview the
# first entry of each.
descriptions = df['Descripción vacante'].tolist()
print("DESCRIPTIONS:\n{lines}\n".format(lines=descriptions[:1]))
requirements = df['Requerimientos'].tolist()
# The label used to read "DESCRIPTIONS" here as well, which mislabelled the
# requirements preview.
print("REQUIREMENTS:\n{lines}\n".format(lines=requirements[:1]))

DESCRIPTIONS:
['Redactar, editar y publicar información relevante en los diferentes medios de difusión. Diseñar piezas gráficas para diferentes eventos, realización de campañas de expectativa']

DESCRIPTIONS:
['Comunicación social y periodismo']



In [5]:
# detect the language of the text: keep Spanish, translate English to Spanish, drop anything else
def do_language_translate(txt):
    """Return ``txt`` in Spanish, or an empty string when that is not possible.

    The language is detected with ``langdetect.detect``:

    * ``'es'`` - the text is returned unchanged;
    * ``'en'`` - the text is translated to Spanish with the module-level
      ``translator`` and the translation is returned;
    * anything else, or a failed detection - ``''`` is returned.

    Errors raised by ``translator.translate`` are not caught here.

    NOTE(review): langdetect documents its detection as non-deterministic
    unless ``DetectorFactory.seed`` is set - confirm whether a seed is
    needed for reproducible runs.
    """
    try:
        lg = langdetect.detect(txt)
    except Exception:
        # Detection is best-effort: text without usable features (or a
        # non-string value) is treated as "unknown language". Catching
        # Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate.
        lg = 'none'

    if lg == 'es':
        return txt
    if lg == 'en':
        return translator.translate(txt, lang_src='en', lang_tgt='es')
    return ''

def do_clean_html(txt):
    """Strip HTML markup from ``txt``.

    The value is converted to ``str`` and HTML-unescaped; afterwards every
    tag (``<...>``) and every entity reference still present is replaced by
    a single space.
    """
    tag_or_entity = regex.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    unescaped = unescape(str(txt))
    return tag_or_entity.sub(' ', unescaped)

def do_clean_stopwords(txt):
    """Remove stopwords from ``txt``.

    The text is split on single spaces; a token is dropped when its
    lower-cased form is in the module-level ``stopwords`` set. The surviving
    tokens keep their original casing and are joined back with single spaces.
    """
    kept = []
    for token in txt.split(" "):
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

# define function to clean the text
def do_clean_text(txt):
    """Normalise ``txt`` into lower-case words separated by spaces.

    Steps, in order:

    1. lower-case the text;
    2. drop ``http://`` / ``https://`` prefixes and literal ``www.``;
    3. replace punctuation with spaces - ASCII punctuation plus the Spanish
       inverted marks, the middle dot and typographic quotes/ellipsis;
    4. replace every word that contains a digit with a space;
    5. strip diacritics from every letter except ``ñ``;
    6. replace newlines with spaces;
    7. shrink any character repeated three or more times to two.

    Returns the text with leading/trailing spaces removed. Inner runs of
    spaces are not fully collapsed (step 7 leaves up to two).
    """
    txt = txt.lower()

    # URL prefixes. The dot is escaped so that only a literal "www." is
    # removed (an unescaped "." would also eat e.g. "wwwx").
    txt = regex.sub(r"https?://", "", txt)
    txt = regex.sub(r"www\.", "", txt)

    # Punctuation that string.punctuation (ASCII only) does not cover.
    # The previous pattern for "¿¡·" was anchored between a literal "[" and
    # "]%" and therefore never matched, leaving tokens such as "¿como".
    txt = regex.sub(r"[¿¡·‘’“”…«»]", " ", txt)
    # ASCII punctuation; this already includes quotes, brackets, braces and "_".
    txt = regex.sub(r"[%s]" % regex.escape(string.punctuation), " ", txt)

    # Words containing at least one digit.
    txt = regex.sub(r"\w*\d\w*", " ", txt)

    # Decompose (NFD) and drop combining marks, except the tilde that turns
    # "n" into "ñ".
    txt = regex.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1",
        normalize("NFD", txt),
        flags=regex.I,
    )
    # Recompose so "ñ" is a single code point again; left decomposed, the
    # combining tilde is not a word character and splits tokens in
    # regex-based tokenizers.
    txt = normalize("NFC", txt)

    txt = regex.sub(r"\n", " ", txt)
    # (A substitution on "&lt;...&gt;" used to follow here; it could never
    # match because "&" and ";" are removed with the punctuation above.)

    # Collapse characters repeated more than twice in a row down to two.
    txt = regex.sub(r"(.)\1{2,}", r"\1\1", txt)

    return txt.strip(' ')

# define function to perform lemmatization
def do_lemmatizing(txt):
    """Return ``txt`` with every token replaced by its lemma.

    The text is parsed with the module-level spaCy pipeline ``nlp``; the
    ``lemma_`` of each token is joined with single spaces.
    """
    doc = nlp(txt)
    return " ".join(token.lemma_ for token in doc)

def do_clean_blanks(tokens):
    """Trim tokens and discard the short ones.

    Each token is converted to ``str`` and stripped of surrounding spaces;
    only tokens with at least three characters after stripping are kept.
    Returns a new list in the original order.
    """
    kept = []
    for raw in tokens:
        token = str(raw).strip(' ')
        if len(token) >= 3:
            kept.append(token)
    return kept

def do_split_tokens(txt):
    """Split ``txt`` on single spaces and return the list of pieces.

    Consecutive spaces yield empty strings in the result.
    """
    tokens = txt.split(sep=' ')
    return tokens

# define function to execute pipeline pre-processing
def do_prepare_pipeline(element, pipeline):
    """Run ``element`` through every transform of ``pipeline``, in order.

    Parameters
    ----------
    element : object
        Value to process. Numbers are converted to ``str`` first. Missing
        values (``None`` or float NaN) are not processed at all.
    pipeline : iterable of callables
        Each callable receives the previous callable's result.

    Returns
    -------
    The final result, or ``None`` when the input is missing or the result
    is empty/falsy (``''``, ``[]``, ...).
    """
    # Missing cells must not be stringified to 'None' / 'nan' and fed to the
    # transforms. NaN is the only float that is not equal to itself.
    if element is None or (isinstance(element, float) and element != element):
        return None

    inout_data = str(element) if isinstance(element, Number) else element
    for transform in pipeline:
        inout_data = transform(inout_data)
    return inout_data if inout_data else None

In [7]:
# Ordered text pre-processing steps; each step receives the output of the
# previous one (see do_prepare_pipeline). Lemmatization is currently disabled.
pipeline = [
    do_clean_html,
    do_language_translate,
    do_clean_stopwords,
    #do_lemmatizing,
    do_clean_text,
    do_split_tokens,
    do_clean_blanks,
]

# Smoke test on the first description. Pass the string itself: the
# one-element slice `descriptions[:1]` is a list, which only worked because
# do_clean_html() calls str() on its input and the list's repr characters
# happened to be stripped later.
print(do_prepare_pipeline(descriptions[0], pipeline))

['redactar', 'editar', 'publicar', 'informacion', 'relevante', 'medios', 'difusion', 'diseñar', 'piezas', 'graficas', 'eventos', 'realizacion', 'campañas', 'expectativa']


In [8]:
# Run the pre-processing pipeline over every vacancy description in
# parallel worker jobs, timing the whole batch.
clean_one = delayed(do_prepare_pipeline)
tictoc.tic()
clean_descriptions = Parallel(n_jobs=24)(clean_one(txt, pipeline) for txt in descriptions)
tictoc.toc('Elapsed time')

Elapsed time 4.427058 seconds.


In [9]:
# Run the pre-processing pipeline over every vacancy's requirements text in
# parallel worker jobs, timing the whole batch.
clean_one = delayed(do_prepare_pipeline)
tictoc.tic()
clean_requirements = Parallel(n_jobs=24)(clean_one(txt, pipeline) for txt in requirements)
tictoc.toc('Elapsed time')

Elapsed time 2.409752 seconds.


In [10]:
# Join each token list back into a single space-separated sentence. Entries
# that are not lists (do_prepare_pipeline returns None for empty results)
# become ''. isinstance() replaces the former check on the type's repr.
clean_sentences1 = [" ".join(tokens) if isinstance(tokens, list) else '' for tokens in clean_descriptions]
print("CLEAN DESCRIPTIONS:\n{lines}\n".format(lines=clean_sentences1[:1]))

clean_sentences2 = [" ".join(tokens) if isinstance(tokens, list) else '' for tokens in clean_requirements]
print("CLEAN REQUIREMENTS:\n{lines}\n".format(lines=clean_sentences2[:1]))

CLEAN DESCRIPTIONS:
['redactar editar publicar informacion relevante medios difusion diseñar piezas graficas eventos realizacion campañas expectativa']

CLEAN REQUIREMENTS:
['comunicacion social periodismo']



In [11]:
# Save a backup of the cleaned data: a copy of the raw frame plus the two
# cleaned text columns.
clean_df = df.copy()
clean_df['new_description'] = clean_sentences1
clean_df['new_requirements'] = clean_sentences2
# No `encoding=` argument: DataFrame.to_excel deprecated it in pandas 1.5 and
# removed it in 2.0, where passing it raises TypeError.
clean_df.to_excel("argos_vacantes_clean.xlsx", sheet_name="Data")
clean_df.head()

Unnamed: 0,Encargado,Vacante,Descripción vacante,Requerimientos,idioma,new_description,new_requirements
0,Tomas,ESTUDIANTE DE PRACTICA EN COMUNICACION SOCIAL ...,"Redactar, editar y publicar información releva...",Comunicación social y periodismo,español,redactar editar publicar informacion relevante...,comunicacion social periodismo
1,Tomas,TERRITORY SALES MANAGER JOB (grupoargos.com),The Territory Sales Manager is an individual c...,English Office,ingles,gerente ventas territorio papel contribuyente ...,oficina ingles
2,Tomas,SUPERVISION OMM REDUCCION DE PERDIDAS (grupoar...,"Programar, supervisar la ejecución de las acci...",Tecnología electrica,español,programar supervisar ejecucion acciones plan r...,tecnologia electrica
3,Tomas,PRODUCTION MANAGER II JOB (grupoargos.com),To manage the production department so that al...,Experience in cement and process manufacturing...,ingles,administrar departamento produccion secciones ...,experiencia fabricacion cemento procesos indus...
4,Tomas,ESTRUCTURACION PROYECTOS INTEGRALES (grupoargo...,Liderar la estructuración e implementación de ...,"Ingenieria, Arquitectura, Administracion \n...",español,liderar estructuracion implementacion proyecto...,ingenieria arquitectura administracion especia...


In [14]:
# Cleaned descriptions of the vacancies whose 'idioma' is 'ingles'.
clean_df.loc[clean_df['idioma'] == 'ingles', 'new_description'].tolist()

['gerente ventas territorio papel contribuyente individual gestion territorio responsabilidades tecnicas basicas responsabilidades generales funcion desarrollar mantener estrategia empresarial diseñada obtener proteger participacion mercado cuentas territorio brindar servicio tecnico basico clientes argos liderar esfuerzos marketing ventas territorio ventas monitorear calidad rendimiento argos productos competitivos territorio crea estrategia comercial clara cuentas canales clave por ejemplo aumentar participacion mamposteria penetracion cuenta proteger base existente etc desarrolla objetivos ingresos productos realistas alcanzables entiende ofrecer clientes enfasis clave canal paquete identifica entiende paisaje competitivo desarrolla planes maximizar potencial cuentas utiliza practicas gestion cuentas implementa estrategias distrito decir logro precio promueve productos argos territorio enfasis segmentos mamposteria estuco proporciona asistencia tecnica basica utiliza recursos neces

In [15]:
# Cleaned requirements of the vacancies whose 'idioma' is 'ingles'.
clean_df.loc[clean_df['idioma'] == 'ingles', 'new_requirements'].tolist()

['oficina ingles',
 'experiencia fabricacion cemento procesos industria relacionada ingles',
 'grado asociados relacionados campo construccion negocio prefiere licenciatura conocimiento laboral concreto agregado productos especiales problemente aplicaciones microsoft especificamente excel word powerpoint conocimientos tecnicos certificaciones capacitacion experiencia ventas construccion exposicion gestion proyectos ventas estimacion años experiencia relacionada campo ventas construccion ingles',
 'grado asociado fabricacion cemento pantallas hmi aspen pabellon foco plantas sap fabricacion cemento ingles',
 'licenciatura investigacion marketing economia matematicas aplicadas estadisticas disciplinas relacionadas ciencia investigacion mercado analitica finanzas negocio analisis negocio inteligencia negocios big data crm excel avanzado powerpoint avanzado power avanzado marketing analisis negocio inteligencia negocios datos ingles nativo avanzado',
 'oficina ventas construccion ingles']