In [933]:
import json
import os
import pandas as pd
import numpy as np

Read English job ads

In [934]:
# Location of the raw job-ad dump.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
ads_json_path = '/Users/cwestrup/thesis/data/sanoma/oikotie/jobs-en-2015-12-04T11:45:53.720971.json'
df = pd.read_json(ads_json_path)

Select name and job_description

In [935]:
# Keep only the ad name and its description, under the column names used
# throughout the rest of the notebook.
df = df[['name', 'job_description']].rename(
    columns={'name': 'title', 'job_description': 'content'}
)
df.head()

Unnamed: 0,title,content
0,"Sandbox Software Engineer (ATP), Backend autom...",Coders needed<br />\nWe are looking for passio...
1,"Channel Manager, Cyber Security Services Nordics",F-Secure offers an exciting and challenging po...
2,Experienced Software Engineer in Test (ATP),Coders needed<br />\nWe’re looking for softwar...
3,"Experienced Software Engineer (ATP), Network S...",We’re looking for software engineers who can d...
4,Big Data & Machine Learning Software Engineer ...,We are looking for passionate Software Enginee...


Write out all jobs with names and full description text

In [936]:
# Label the row index so it is written to the CSV under the header 'ad_id'.
df.index = df.index.rename('ad_id')
ads_csv_path = './data/ads.csv'
df.to_csv(ads_csv_path)

### Creating text chunks (paragraphs)

Select job descriptions

In [937]:
# Single-column frame holding only the description text; the 'ad_id' index
# is carried along.
df_chunks = df[['content']]

Replace '\n' with an HTML `<br>` tag

In [938]:
# Literal newlines are turned into '<br>' tags so that the following cell
# treats them like any other HTML line break.
to_spaces = '\n'
df_chunks = df_chunks.replace(to_replace=to_spaces, value='<br>', regex=True)

Replace html 'br' variants with 'BREAK'

In [939]:
# Any run of one or more <br> spellings becomes a single paragraph marker.
br_variants = '(<br />|<br/>|<br>|</br>){1,}'
df_chunks = df_chunks.replace(br_variants, '##BREAK##', regex=True)

Add break after html p tags

In [940]:
# A closing paragraph tag (either spelling) is rewritten as '</p>' followed
# by a paragraph marker, so every paragraph end becomes a split point.
closing_p = '</p>|</ p>'
df_chunks = df_chunks.replace(closing_p, '</p>##BREAK##', regex=True)

Collapse consecutive occurrences of the BREAK tag into a single one

In [941]:
# Directly adjacent markers collapse into one.
repeated_breaks = '(##BREAK##)+'
df_chunks = df_chunks.replace(repeated_breaks, '##BREAK##', regex=True)

Strip out all html tags

In [942]:
# Delete every remaining '<...>' tag as well as the '&nbsp;' entity.
# NOTE(review): '&nbsp;' is removed rather than replaced by a space, which
# may join the words on either side of it - confirm this is intended.
html_residue = '<[^<]+?>|&nbsp;'
df_chunks = df_chunks.replace(html_residue, '', regex=True)

Split rows by BREAK tags and stack up to a new DataFrame

In [943]:
# Split every description at the paragraph markers and reshape the result
# into one row per chunk.
#
# `expand=True` makes `str.split` return the wide frame directly (one column
# per chunk position), instead of building a separate Series for every row
# through `.apply(pd.Series)`, which is very slow on large inputs.
# `stack()` then moves the chunk position into a second index level, giving
# an (ad_id, position) MultiIndex, and `to_frame(name=...)` names the single
# resulting column.
df_chunks = (
    df_chunks['content']
    .str.split('##BREAK##', expand=True)
    .stack()
    .to_frame(name='content')
)

Remove empty rows:

In [944]:
# Element count before filtering; with the single 'content' column this is
# the number of chunks.
df_chunks.size

178594

In [945]:
# Mark empty chunks as missing and drop them.
#
# The results are assigned back instead of using `inplace=True` on the
# column selection: a chained in-place call acts on an intermediate object,
# which triggers a warning on recent pandas and leaves `df_chunks`
# unchanged under Copy-on-Write, so the empty rows would never be removed.
df_chunks['content'] = df_chunks['content'].replace('', np.nan)
df_chunks = df_chunks.dropna(subset=['content'])

In [946]:
# Element count after the empty chunks have been dropped.
df_chunks.size

138166

Rename the indices and columns for the csv

In [947]:
# Move both index levels produced by stack() into ordinary columns and
# replace the index with a fresh 0..n-1 range.
df_chunks = df_chunks.reset_index()

In [948]:
# The new running index identifies each chunk in the exported CSV.
df_chunks.index.name = 'chunk_id'
# Columns after reset_index(): ad index, chunk position within the ad, text.
# The position column is only a by-product of stack(), so it is named
# 'drop' and removed again.
df_chunks.columns = ['ad_id', 'drop', 'content']
# `axis` must be passed by keyword: the positional form `drop('drop', 1)`
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
df_chunks = df_chunks.drop('drop', axis=1)

In [949]:
# Preview the first 40 chunks to check the split visually.
df_chunks.head(40)

Unnamed: 0_level_0,ad_id,content
chunk_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,Coders needed
1,0,We are looking for passionate Software Enginee...
2,0,What’s in it for me?
3,0,"As part of the ATP team, you’ll be pioneering ..."
4,0,How to stand out as an applicant?
5,0,The following experience is an absolute must:
6,0,• Extensive prior experience with hypervisors ...
7,0,• We value prior experience integrating the ab...
8,0,"• Understanding of quality practices, continuo..."
9,0,• We value experience working with public or p...


In [950]:
# Export the chunks; 'chunk_id' (the index) becomes the first CSV column.
chunks_csv_path = './data/chunks.csv'
df_chunks.to_csv(chunks_csv_path)

Import into MongoDB with:

     mongoimport -d thesis -c ads --type csv --file ./data/ads.csv --headerline
     mongoimport -d thesis -c chunks --type csv --file ./data/chunks.csv --headerline