In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split


from unicodedata import normalize

In [2]:
modelo = pd.read_csv('modelo_final.csv')
modelo.columns

  interactivity=interactivity, compiler=compiler, result=result)


Index(['idaviso', 'idpostulante', 'rango_edad', 'sexo', 'nivel_estudios',
       'esta_estudiando', 'descripcion', 'tipo_de_trabajo', 'nivel_laboral',
       'nombre_zona', 'nombre_area', 'sepostulo'],
      dtype='object')

In [3]:
avisos_detalle_txt = modelo[['idaviso','descripcion']]

In [4]:
avisos_detalle_txt.sample(50)

Unnamed: 0,idaviso,descripcion
761550,1112325200,"<p><span style="""">Incorporamos jóvenes con exp..."
688349,1112349943,<p><strong>Importante empresa de la Zona de Ca...
362550,1112480684,<p>Desarrollador <strong>SQL / PL-SQL Ssr</str...
139971,1112349092,<p>Seleccionamos un Sr. Sys. Admin. para impor...
846251,1112292851,<p>Nos orientamos a personas con capacidad par...
414405,1112404144,"<p>Nuestro cliente, compañía de indumentaria c..."
701354,1112393056,<p>Nos encontramos en la búsqueda de vendedore...
421964,1112387021,"<p style=""""><strong><span style="""">Adecco Offi..."
191476,1112242390,<p>VICTOR GULLO SRL - EMPRESA DE MATERIALES P...
957293,1112326501,<p><strong>Requisitos mínimos</strong></p><p>B...


In [5]:
avisos_detalle_txt.sort_values('idaviso',ascending = False)
avisos_detalle_txt.drop_duplicates('idaviso',inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
def textClean(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", 
        normalize( "NFD", text), 0, re.I)
    text = normalize( 'NFC', text)
    text = re.sub(r"[,!.\/'+-=]", " ", text)
    text = text.lower().split()
    stops = {'para', 'de', 'en', 'presentarse', 'entrevista', 
             'y', 'a','trabajar','o','as','zona','te','estamos','al'
            'buscando','del','importante','hs','horario','tu','como',
            'al','con','por','nueva','la','e','os', 'es','su','un','una'}
    text = [w for w in text if not w in stops]    
    text = " ".join(text)
    text = text.replace("."," ").replace(","," ").replace("("," ").replace(")"," ")
    text = text.replace("/"," ").replace("-"," ").replace("!"," ").replace("¡"," ")
    text = text.replace("\""," ").replace("*"," ").replace(">"," ")
    return(text)

In [7]:
avisos_detalle_txt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25288 entries, 0 to 275363
Data columns (total 2 columns):
idaviso        25288 non-null int64
descripcion    25288 non-null object
dtypes: int64(1), object(1)
memory usage: 592.7+ KB


In [8]:
descripciones = avisos_detalle_txt['descripcion']

len(descripciones)

25288

In [9]:
trainText = []
for it in avisos_detalle_txt['descripcion']:
    newT = textClean(it)
    trainText.append(newT)


In [10]:
text = pd.DataFrame(trainText,columns=['descripcion'])
text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25288 entries, 0 to 25287
Data columns (total 1 columns):
descripcion    25288 non-null object
dtypes: object(1)
memory usage: 197.6+ KB


In [11]:
avisos_detalle_txt.reset_index(inplace=True)
avisos_detalle_txt['descripcion'] = text['descripcion']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
avisos_detalle_txt.drop(columns=['index'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
avisos_detalle_txt.head()

Unnamed: 0,idaviso,descripcion
0,1112448580,laboratorio ubicado san cristobal busqueda tec...
1,1112241594,buscamos cajeros bancoseran responsables reali...
2,1112206367,empresa alimenticia se encuentra busqueda anal...
3,1112290296,compañia financiera local nos encontramos busq...
4,1112397487,adecco office esta especializada el reclutamie...


In [14]:
final = pd.merge(avisos_detalle_txt,modelo, on = 'idaviso',how='outer')

In [15]:
final.dropna(axis=0)
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
idaviso            1000000 non-null int64
descripcion_x      1000000 non-null object
idpostulante       1000000 non-null object
rango_edad         1000000 non-null float64
sexo               1000000 non-null int64
nivel_estudios     1000000 non-null float64
esta_estudiando    1000000 non-null float64
descripcion_y      1000000 non-null object
tipo_de_trabajo    1000000 non-null int64
nivel_laboral      1000000 non-null int64
nombre_zona        1000000 non-null object
nombre_area        1000000 non-null object
sepostulo          1000000 non-null float64
dtypes: float64(4), int64(4), object(5)
memory usage: 106.8+ MB


In [16]:
final.drop(columns=['descripcion_y'],axis=1,inplace=True)
final.rename(columns={'descripcion_x':'descripcion'},inplace=True)
final.to_csv("modelo_final_tfidf.csv", encoding = "utf-8", index = False)