In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split


from unicodedata import normalize

In [2]:
avisos_detalle = pd.read_csv('datos_navent_fiuba/d15_fiuba_6_avisos_detalle.csv')

avisos_detalle2 = pd.read_csv('datos_navent_fiuba/fiuba_6_avisos_detalle.csv')

avisos_detalle = pd.merge(avisos_detalle,avisos_detalle2,how='outer')

avisos_detalle2 = pd.read_csv('datos_navent_fiuba/h15_fiuba_6_avisos_detalle.csv')

avisos_detalle = pd.merge(avisos_detalle,avisos_detalle2,how='outer')

avisos_detalle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25563 entries, 0 to 25562
Data columns (total 11 columns):
idaviso                 25563 non-null int64
idpais                  25563 non-null int64
titulo                  25563 non-null object
descripcion             25563 non-null object
nombre_zona             25563 non-null object
ciudad                  137 non-null object
mapacalle               1854 non-null object
tipo_de_trabajo         25563 non-null object
nivel_laboral           25563 non-null object
nombre_area             25563 non-null object
denominacion_empresa    25556 non-null object
dtypes: int64(2), object(9)
memory usage: 2.3+ MB


In [3]:
train = pd.read_csv('modelo_final.csv')
train.columns

Index(['idaviso', 'idpostulante', 'sepostulo'], dtype='object')

In [4]:
avisos_detalle_txt = pd.merge(train, avisos_detalle, on = 'idaviso')
avisos_detalle_txt = avisos_detalle_txt[['idaviso','descripcion']]

In [5]:
avisos_detalle_txt.sample(50)

Unnamed: 0,idaviso,descripcion
515777,1112019285,"<p style=""""><a></a><span style="""">Una importan..."
891026,1112255878,<p>IMPORTANTE EMPRESA DE RUBRO GASTRONOMICO SE...
754823,1111662226,<p>En Farmacia Central Oeste estamos buscando ...
58087,1112441146,"<p>Prosegur Alarmas, empresa multinacional líd..."
542632,1112407510,"<p><span style="""">Importante empresa en consta..."
149913,1112461857,<p>Importante empresa de transporte de pasajer...
619887,1111373567,<p>GEDCO S.A. es una empresa líder en la gesti...
119254,1112312626,<p>Grupo Cosmos Recursos Humanos es una compañ...
500224,1112437699,"<p style="""">En <strong style="""">Prosumia</stro..."
69203,1112345052,"<p>Nos encontramos en la búsqueda de, Enfermer..."


In [6]:
avisos_detalle_txt.sort_values('idaviso',ascending = False)
avisos_detalle_txt.drop_duplicates('idaviso',inplace= True)

In [7]:
def textClean(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", 
        normalize( "NFD", text), 0, re.I)
    text = normalize( 'NFC', text)
    text = re.sub(r"[,!.\/'+-=]", " ", text)
    text = text.lower().split()
    stops = {'para', 'de', 'en', 'presentarse', 'entrevista', 
             'y', 'a','trabajar','o','as','zona','te','estamos','al'
            'buscando','del','importante','hs','horario','tu','como',
            'al','con','por','nueva','la','e','os', 'es','su','un','una'}
    text = [w for w in text if not w in stops]    
    text = " ".join(text)
    text = text.replace("."," ").replace(","," ").replace("("," ").replace(")"," ")
    text = text.replace("/"," ").replace("-"," ").replace("!"," ").replace("¡"," ")
    text = text.replace("\""," ").replace("*"," ").replace(">"," ")
    return(text)

In [8]:
avisos_detalle_txt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24950 entries, 0 to 992320
Data columns (total 2 columns):
idaviso        24950 non-null int64
descripcion    24950 non-null object
dtypes: int64(1), object(1)
memory usage: 584.8+ KB


In [9]:
descripciones = avisos_detalle_txt['descripcion']

len(descripciones)

24950

In [10]:
trainText = []
for it in avisos_detalle_txt['descripcion']:
    newT = textClean(it)
    trainText.append(newT)


In [11]:
text = pd.DataFrame(trainText,columns=['descripcion'])
text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24950 entries, 0 to 24949
Data columns (total 1 columns):
descripcion    24950 non-null object
dtypes: object(1)
memory usage: 195.0+ KB


In [12]:
avisos_detalle_txt.reset_index(inplace=True)
avisos_detalle_txt['descripcion'] = text['descripcion']

In [13]:
avisos_detalle_txt.drop(columns=['index'],axis=1,inplace=True)

In [14]:
avisos_detalle_txt.head()

Unnamed: 0,idaviso,descripcion
0,1112258266,analista tecnico funcional seniornos encontram...
1,1112260855,solutix esta busqueda developers java ssr sr i...
2,1112366633,it resources s se encuentra busqueda selectora...
3,1112399003,lugar titrae talento compañia global consultor...
4,1112295735,empresa lider mercado seleccionara pasante com...


In [15]:
final = pd.merge(avisos_detalle_txt,train, on = 'idaviso',how='outer')

In [16]:
final.dropna(axis=0)
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
idaviso         1000000 non-null int64
descripcion     957242 non-null object
idpostulante    1000000 non-null object
sepostulo       1000000 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 38.1+ MB


In [17]:
final.sample(20)

Unnamed: 0,idaviso,descripcion,idpostulante,sepostulo
88009,1112320477,nuestra sucursal alto rosario seleccionaremos ...,xkpaZVP,1.0
348074,1112365964,empresa dedicada servicio tecnico alta tecnolo...,wVY593p,1.0
476622,1112513185,sumate nuestro equipo planeta zenok incorporar...,5mlYR2v,0.0
624493,1112320527,broker seguros se encuentra busqueda asesor te...,96XOlMY,1.0
453474,1112438799,empresa se encuentra busqueda analista interfa...,ERxoaN,1.0
538018,1112410652,cia indumentaria incorpora staff permanente ve...,Dr6w4rX,1.0
95065,1112443758,sofrecom argentina empresa grupo orange casa m...,PmqoeVb,0.0
807856,1112343774,compañia marketing nos encontramos busqueda te...,pzMbYx3,0.0
777932,1112381476,par sanatorio buscamos medico guardia asistir ...,xkP56Dz,0.0
946810,1112415796,garbarino nos encontramos busqueda asistente c...,ak4o655,1.0


In [18]:
test_s = 0.25
random_s = 0
features= [c for c in df.columns.values if c  not in ['idaviso','idpostulante','sepostulo']]
numeric_features= [c for c in df.columns.values if c  not in ['idaviso','idpostulante','sepostulo','descripcion']]
target = 'sepostulo'

x_train, x_test, y_train, y_test = train_test_split(x[features], y[target], test_size=test_s, random_state=random_s)

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin


class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


numeric = Pipeline([
                ('selector', NumberSelector(key='descripcion')),),
                ('', )
            ])

text = Pipeline([
                ('descripcion', cust_txt_col('descripcion')),
                ('tfidf', TfidfVectorizer(sublinear_tf=True,ngram_range=(1,3), max_df=0.5)),
                ('tsvd3', TruncatedSVD(n_iter=25, random_state=12))
        ])

In [26]:

from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)