In [1]:
import pandas as pd
import re
import json

# data cleaning for resumes.

* a. Trim and clean the text of special characters.
* b. Using breaks, commas or spaces, attempt to separate the annotations into individual skills

## Read data

In [34]:
resume_data = pd.read_json("Entity Recognition in Resumes.json", lines=True, orient="records")
resume_data.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [44]:
resume_data = resume_data.dropna(axis=1).dropna(axis=1)

In [20]:
resume_data["annotation"][0][:2]

[{'label': ['Skills'],
  'points': [{'start': 1295,
    'end': 1621,
    'text': '\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'}]},
 {'label': ['Skills'],
  'points': [{'start': 993,
    'end': 1153,
    'text': 'C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),\nDatabase Management System (Less than 1 year), Java (Less than 1 year)'}]}]

In [22]:
txt = resume_data["content"][0]
txt

"Abhishek Jha\nApplication Development Associate - Accenture\n\nBengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a\n\n• To work for an organization which provides me the opportunity to improve my skills\nand knowledge for my individual and company's growth in best possible ways.\n\nWilling to relocate to: Bangalore, Karnataka\n\nWORK EXPERIENCE\n\nApplication Development Associate\n\nAccenture -\n\nNovember 2017 to Present\n\nRole: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries\nfor the Bot which will be triggered based on given input. Also, Training the bot for different possible\nutterances (Both positive and negative), which will be given as\ninput by the user.\n\nEDUCATION\n\nB.E in Information science and engineering\n\nB.v.b college of engineering and technology -  Hubli, Karnataka\n\nAugust 2013 to June 2017\n\n12th in Mathematics\n\nWoodbine modern school\n\nApril 2011 to March 2013\n\n10th\n\nKendriya Vidyalaya\n\

In [23]:
txt[1295:1622]

'\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'

## a. Trim and clean the text of special characters.

In [32]:
def clean_text(txt):
    "Trim and clean the text of special characters"
    txt = re.sub("[^a-zA-Z0-9]+", " ", txt)
    txt = re.sub("\s+", " ", txt)
    txt = txt.strip()
    return txt


In [33]:
clean_text(txt)

'Abhishek Jha Application Development Associate Accenture Bengaluru Karnataka Email me on Indeed indeed com r Abhishek Jha 10e7a8cb732bc43a To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company s growth in best possible ways Willing to relocate to Bangalore Karnataka WORK EXPERIENCE Application Development Associate Accenture November 2017 to Present Role Currently working on Chat bot Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input Also Training the bot for different possible utterances Both positive and negative which will be given as input by the user EDUCATION B E in Information science and engineering B v b college of engineering and technology Hubli Karnataka August 2013 to June 2017 12th in Mathematics Woodbine modern school April 2011 to March 2013 10th Kendriya Vidyalaya April 2001 to March 2011 SKILLS C Less than 1 year Database Less than 1 year Databa

In [47]:
resume_data["clean_content"] = resume_data["content"].apply(clean_text)

## b. Using breaks, commas or spaces, attempt to separate the annotations into individual skills

In [92]:
def identify_skills(case):
    """identify skills in one point in annotation"""
    skill_set = [y.strip() for y in re.split("[\n,;]", case) if y.strip()!=""]
    for i, skill in enumerate(skill_set):
        if len(skill.split(":")) == 2:
            skill_set[i] = skill.split(":")[1]    # remove headers
        skill_set[i] = re.sub("\(.+\)", "", skill_set[i])    # remove additional information inside brackets
        skill_set[i] = clean_text(skill_set[i])
        skill_set[i] = re.sub("[0-9]+", "", skill_set[i])    # remove numerics
        skill_set[i] = skill_set[i].lower()
    skill_set = [x.strip() for x in skill_set if x.strip()!=""]
    return list(set(skill_set))


In [121]:
annotation = resume_data["annotation"][1]
skill_annots = [x for x in annotation if 'Skills' in x['label']]
testcase2 = skill_annots[0]['points'][0]['text']
identify_skills(testcase2)

['linux',
 'microsoft windows',
 'additional information',
 'database',
 'mysql',
 'php',
 'ms access',
 'net',
 'oracle g',
 'microsoft',
 'c',
 'xml',
 'access',
 'ms sql server',
 'java',
 'windows windows server',
 'html']

In [118]:
def process_all_skills(annotation):
    """identify points with skill label and process each set"""
    skill_annots = [x for x in annotation if 'Skills' in x['label']]
    skill_set = []
    for skill_point in skill_annots:
        for point in skill_point["points"]:
            skill_text = point["text"]
            temp = identify_skills(skill_text)
            skill_set.extend(temp)
    return list(set(skill_set))


In [119]:
process_all_skills(resume_data["annotation"][0])

['computer networks',
 'linux',
 'oracle peoplesoft',
 'tolerant and flexible to different situations',
 'mac',
 'windows',
 'honest and hard working',
 'team player',
 'database',
 'machine learning',
 'internet of things',
 'non technical skills',
 'polite and calm',
 'c',
 'database management',
 'java',
 'database management system']

In [114]:
resume_data["skills"] = resume_data["annotation"].apply(process_all_skills)

In [117]:
resume_data.head()

Unnamed: 0,content,annotation,clean_content,skills
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",Abhishek Jha Application Development Associate...,"[computer networks, linux, oracle peoplesoft, ..."
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",Afreen Jamadar Active member of IIIT Committee...,"[linux, microsoft windows, mysql, database, ph..."
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",Akhil Yadav Polemaina Hyderabad Telangana Emai...,"[teradata, cobol, mainframe, jcl, servicenow]"
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",Alok Khandai Operational Analyst SQL DBA Engin...,"[ms reporting services, business, sas, sql dba..."
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",Ananya Chavan lecturer oracle tutorials Mumbai...,"[filezilla, sem, servlet, ms access, spring s..."


In [122]:
resume_data.to_pickle("resume_data.pkl")