In [82]:
import re
import spacy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json

In [104]:
#Function to extract names from the string using spacy
def extract_name(string):
    names = []
    nlp = spacy.load('en')
    doc = nlp(string)
    ent_list = ['PERSON','GPE']
    for ent in doc.ents:
        if(ent.label_ in ent_list):
            #print(ent.text, ent.label_)
            names.append(ent.text)
    return np.unique(names)

In [105]:
def extract_org(string):
    org = []
    nlp = spacy.load('en')
    doc = nlp(string)
    ent_list = ['NORP','ORG','LOC','PRODUCT']
    for ent in doc.ents:
        if(ent.label_ in ent_list):
            #print(ent.text, ent.label_)
            org.append(ent.text)
    return np.unique(org)

In [85]:
#Function to extract Phone Numbers from string using regular expressions
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = np.unique(r.findall(string))
    return [re.sub(r'\D', '', number) for number in phone_numbers]

In [86]:
#Function to extract Email address from a string using regular expressions
def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return np.unique(r.findall(string))

In [87]:
def extract_twitter(string):
    r = re.compile(r'twitter.com/([^\s:]+)')
    return r.findall(string)

In [88]:
def extract_linkedin(string):
    r = re.compile(r'linkedin.com/([^\s:]+)')
    return r.findall(string)

In [89]:
def extract_facebook(string):
    r = re.compile(r'facebook.com/([^\s:]+)')
    return r.findall(string)

In [90]:
#load email body
with open('courtnay samples/sample4.txt', 'r') as fp:
    content = fp.read()

In [91]:
#Removing commas in the resume for an effecient check
soup = BeautifulSoup(content, 'html.parser')
content = soup.get_text()
string = content.replace(',',' ')
string = string.replace('\n',' ')
string = string.replace('\r',' ')
string = string.replace('\\r\\n',' ')
#Converting all the charachters in lower case
string1 = string.lower()

In [115]:
names= extract_name(string1)
org = extract_org(string1).tolist()
emails = extract_email_addresses(string1).tolist()
phone_num = extract_phone_numbers(string1)
twitter = extract_twitter(string1)
linkedin = extract_linkedin(string1)
facebook = extract_facebook(string1)

In [116]:
contact_list = []
name_track = []
email_track = []
phone_track = []
twitter_track = []
linkedin_track = []
facebook_track = []
other_contacts = []

In [117]:
for i in names:
    for j in emails:
        if i in j:
            contact = {'name':i,'email':j}
            contact_list.append(contact)
            name_track.append(i)
            email_track.append(j)

In [118]:
contact_list

[{'name': 'courtenay', 'email': 'courtenay.farquharson@gmail.com'},
 {'name': 'courtenay', 'email': 'courtenay@parserr.com'}]

In [119]:
for i in names:
    if i not in name_track:
        contact = {'name':i,'email':''}
        other_contacts.append(contact)
for i in emails:
    if i not in email_track:
        contact = {'name':'','email':i}
        other_contacts.append(contact)

In [120]:
other_contacts

[{'name': 'ali', 'email': ''},
 {'name': 'mohammed', 'email': ''},
 {'name': '', 'email': 'asmsmtap@microsoft.com'},
 {'name': '', 'email': 'asmsmtbp@microsoft.com'},
 {'name': '', 'email': 'asmsmtbp@microsoft.com.'},
 {'name': '', 'email': 'farqy1@hotmail.com'},
 {'name': '', 'email': 'support@mail.support.microsoft.com'},
 {'name': '', 'email': 'v-elmoha@microsoft.com'},
 {'name': '', 'email': 'v-moimt@microsoft.com'},
 {'name': '', 'email': 'v-naskai@microsoft.com'},
 {'name': '', 'email': 'v-sajhin@microsoft.com'},
 {'name': '', 'email': 'v-vijaba@microsoft.com'}]

In [121]:
response = {'contact': contact_list,
           'other_contact': other_contacts,
           'organisation': org,
           'phone number': phone_num,
           'twitter handle': twitter,
           'linkedin handle': linkedin,
           'facebook handle': facebook}

In [122]:
response

{'contact': [{'name': 'courtenay', 'email': 'courtenay.farquharson@gmail.com'},
  {'name': 'courtenay', 'email': 'courtenay@parserr.com'}],
 'other_contact': [{'name': 'ali', 'email': ''},
  {'name': 'mohammed', 'email': ''},
  {'name': '', 'email': 'asmsmtap@microsoft.com'},
  {'name': '', 'email': 'asmsmtbp@microsoft.com'},
  {'name': '', 'email': 'asmsmtbp@microsoft.com.'},
  {'name': '', 'email': 'farqy1@hotmail.com'},
  {'name': '', 'email': 'support@mail.support.microsoft.com'},
  {'name': '', 'email': 'v-elmoha@microsoft.com'},
  {'name': '', 'email': 'v-moimt@microsoft.com'},
  {'name': '', 'email': 'v-naskai@microsoft.com'},
  {'name': '', 'email': 'v-sajhin@microsoft.com'},
  {'name': '', 'email': 'v-vijaba@microsoft.com'}],
 'organisation': ["b'----------",
  'message\\xe2\\x80\\x9d',
  'v-elmoha@microsoft.com',
  'v-naskai@microsoft.com'],
 'phone number': ['1190504250'],
 'twitter handle': [],
 'linkedin handle': [],
 'facebook handle': []}

In [123]:
json_resp = json.dumps(response)

In [124]:
json_resp

'{"contact": [{"name": "courtenay", "email": "courtenay.farquharson@gmail.com"}, {"name": "courtenay", "email": "courtenay@parserr.com"}], "other_contact": [{"name": "ali", "email": ""}, {"name": "mohammed", "email": ""}, {"name": "", "email": "asmsmtap@microsoft.com"}, {"name": "", "email": "asmsmtbp@microsoft.com"}, {"name": "", "email": "asmsmtbp@microsoft.com."}, {"name": "", "email": "farqy1@hotmail.com"}, {"name": "", "email": "support@mail.support.microsoft.com"}, {"name": "", "email": "v-elmoha@microsoft.com"}, {"name": "", "email": "v-moimt@microsoft.com"}, {"name": "", "email": "v-naskai@microsoft.com"}, {"name": "", "email": "v-sajhin@microsoft.com"}, {"name": "", "email": "v-vijaba@microsoft.com"}], "organisation": ["b\'----------", "message\\\\xe2\\\\x80\\\\x9d", "v-elmoha@microsoft.com", "v-naskai@microsoft.com"], "phone number": ["1190504250"], "twitter handle": [], "linkedin handle": [], "facebook handle": []}'