## NLP Analysis of Data Science Jobs in Ireland


In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os 

In [2]:
# directory that holds the job posting text files
DATA_PATH = './data'
# slice start for the posting body: the lines before it are title, company, location
FIRST_ITEMS = 3
# negative slice bound marking the final 8 lines of a posting (the metadata block)
LAST_ITEMS = -8
# read one sample posting into `content`
with open(f"{DATA_PATH}/1_Research Scientist, Machine LearningDeep Learning_Dataminr.txt", "r") as file:
    content = file.read()
    

In [3]:
def read_file(file):
    """
        Reads one file from the DATA_PATH directory.

        file: name of the file, relative to DATA_PATH
        Returns: list of lines (the content split on newline characters),
                 or None when the file is ".DS_Store" or cannot be read;
                 callers must check for None before indexing the result
    """
    # ".DS_Store" is not a job posting: skip it instead of trying to read it
    if file == ".DS_Store":
        return None
    try:
        with open(f"{DATA_PATH}/{file}", 'r') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # best effort: report the unreadable file and signal it with None;
        # unrelated errors (e.g. programming mistakes) are no longer hidden
        print(f"unable to read: {file} ")
        return None
    return content.split('\n')

In [4]:
def get_title(sentences):
    """
        Pick the header fields of a posting: the first three entries of
        `sentences`, in order.

        returns: tuple (title, company, location)
    """
    return sentences[0], sentences[1], sentences[2]

In [5]:
def job_metadata(sentences, last_items=None):
    """
        Extract the trailing metadata block of a posting.

        The trailing lines alternate label, value, label, value, ...
        (e.g. "Seniority level" followed by its value).

        sentences: list of lines of one posting
        last_items: negative slice start selecting the trailing metadata
                    lines; defaults to the notebook-level LAST_ITEMS
        returns: a list of (label, value) tuples in their original order;
                 a trailing label without a value is dropped
    """
    if last_items is None:
        last_items = LAST_ITEMS
    data = sentences[last_items:]
    # even positions hold the labels, odd positions the matching values
    return list(zip(data[::2], data[1::2]))
            

In [6]:
def get_text(sentences):
    """
        Return the slice sentences[FIRST_ITEMS:LAST_ITEMS], i.e. the lines
        between the leading header entries and the trailing metadata entries.
    """
    body_window = slice(FIRST_ITEMS, LAST_ITEMS)
    return sentences[body_window]

In [7]:
# empty frame with one column per field collected for a posting
job_df = pd.DataFrame(
    columns=[
        "ids",
        "title",
        "company",
        "location",
        "text",
        "seniority_level",
        "industry",
        "employment_type",
        "job_function",
    ]
)

In [8]:
# Parse every posting in DATA_PATH into one record and build the frame once.
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call, so rows are collected in a plain list instead.
records = []
for file in os.listdir(DATA_PATH):
    if file.endswith('.DS_Store'):
        continue

    sentences = read_file(file)
    if sentences is None:
        # read_file printed the failure and returned None; skip this file
        continue

    # the id is the part of the file name before the first underscore
    ids = file.split("_")[0]
    title, company, location = get_title(sentences)
    details = get_text(sentences)
    # list of (label, value) pairs; only the values are stored below
    meta_data = job_metadata(sentences)

    records.append(
        {"ids": ids,
         "title": title,
         "company": company,
         "location": location,
         "text": details,
         "seniority_level": meta_data[0][1],
         "industry": meta_data[1][1],
         "employment_type": meta_data[2][1],
         "job_function": meta_data[3][1]}
    )

# reuse the column layout of the existing (empty) job_df so the column order
# is unchanged; re-running this cell rebuilds the frame instead of duplicating rows
job_df = pd.DataFrame(records, columns=job_df.columns)
    
    

In [9]:
# show the assembled frame as this cell's output
job_df

Unnamed: 0,ids,title,company,location,text,seniority_level,industry,employment_type,job_function
0,56,Software Engineer,Smartbox Group,"Dublin, County Dublin, Ireland","[Department: Technology, , Reporting to: Softw...",Associate,"Leisure, Travel & Tourism Hospitality",Full-time,Engineering Information Technology
1,21,Data Analyst,Johnson & Johnson,"Cork, County Cork, Ireland","[Data Analyst, Technical Operations (Global), ...",Entry level,Medical Device Financial Services Pharmaceuticals,Full-time,Information Technology
2,34,Senior Data Engineer,Vela Games,"Dublin, County Dublin, Ireland",[Vela Games is looking for a talented Senior D...,Associate,Design Entertainment Computer Games,Full-time,Engineering
3,106,Data Analyst,Dabster,"Cork, County Cork, Ireland","[Permanent Position, , Cork , Ireland, , Dutie...",Entry level,Information Technology & Services Computer Sof...,Full-time,Information Technology
4,51,AIB - Data Engineer – Data and Analytics,AIB,Ireland,"[Posted by, Caroline Donlon 2nd, Senior Busine...",Mid-Senior level,Banking,Full-time,Information Technology
...,...,...,...,...,...,...,...,...,...
136,66,Data Analyst (Marketing),Azon Recruitment Group,"Dublin, County Dublin, Ireland","[Posted by, Laura Murphy 2nd, Recruitment Cons...",Entry level,Financial Services Information Technology & Se...,Full-time,Analyst Information Technology Marketing
137,71,Data Engineer,Verizon Connect,"Dublin City, County Dublin, Ireland","[Posted by, Ian Cummins 2nd, Senior Technical ...",Mid-Senior level,Information Technology & Services,Full-time,Information Technology
138,17,Data Engineer - AI Analytics - Cork,Jefferson Frank,"Cork, County Cork, Ireland","[Data Engineer, , Cork - Flexible Remote Worki...",Entry level,Information Technology & Services Staffing & R...,Full-time,Information Technology
139,29,Data Scientist - Remote,Reperio Human Capital,"Cork, County Cork, Ireland","[(Artificial Intelligence), , Exciting new opp...",Entry level,Information Technology & Services Computer Sof...,Full-time,Engineering Information Technology


In [10]:
# make sure the output folder exists: to_csv does not create missing directories
os.makedirs("./text_to_df", exist_ok=True)
# write the frame without the positional index column
job_df.to_csv("./text_to_df/job_df.csv", index=False)