In [1]:
import re
import spacy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json

In [2]:
#Function to extract names from the string using spacy
def extract_name(string, nlp=None):
    """Return the unique PERSON entities that spaCy finds in ``string``.

    Parameters
    ----------
    string : str
        Text to scan.
    nlp : callable, optional
        A loaded spaCy pipeline (any callable mapping text to an object with
        an ``ents`` attribute). When omitted, the ``'en'`` model is loaded on
        the first call and cached on the function, so later calls reuse it.

    Returns
    -------
    numpy.ndarray
        De-duplicated entity texts, as returned by ``np.unique``.
    """
    if nlp is None:
        nlp = getattr(extract_name, '_nlp', None)
        if nlp is None:
            # Loading a model is expensive; do it once rather than per call.
            # NOTE(review): the 'en' shortcut only resolves on older spaCy
            # releases; newer ones need a full name such as 'en_core_web_sm'.
            nlp = extract_name._nlp = spacy.load('en')
    doc = nlp(string)
    ent_list = ['PERSON']
    names = [ent.text for ent in doc.ents if ent.label_ in ent_list]
    return np.unique(names)

In [3]:
def extract_org(string, nlp=None):
    """Return the unique organisation/place-like entities found in ``string``.

    Entities labelled ``GPE``, ``NORP``, ``ORG``, ``LOC`` or ``PRODUCT`` by
    spaCy are kept.

    Parameters
    ----------
    string : str
        Text to scan.
    nlp : callable, optional
        A loaded spaCy pipeline (any callable mapping text to an object with
        an ``ents`` attribute). When omitted, the ``'en'`` model is loaded on
        the first call and cached on the function, so later calls reuse it.

    Returns
    -------
    numpy.ndarray
        De-duplicated entity texts, as returned by ``np.unique``.
    """
    if nlp is None:
        nlp = getattr(extract_org, '_nlp', None)
        if nlp is None:
            # Loading a model is expensive; do it once rather than per call.
            # NOTE(review): the 'en' shortcut only resolves on older spaCy
            # releases; newer ones need a full name such as 'en_core_web_sm'.
            nlp = extract_org._nlp = spacy.load('en')
    doc = nlp(string)
    ent_list = ['GPE', 'NORP', 'ORG', 'LOC', 'PRODUCT']
    org = [ent.text for ent in doc.ents if ent.label_ in ent_list]
    return np.unique(org)

In [4]:
#Function to extract Phone Numbers from string using regular expressions
def extract_phone_numbers(string):
    """Return the distinct phone numbers in ``string`` as digit-only strings.

    Three shapes are recognised: ``ddd?ddd?dddd``, ``(ddd) ddd?dddd`` and
    ``ddd?dddd``, where ``?`` is an optional ``-``, ``.`` or whitespace.

    Separators are stripped *before* de-duplicating, so the same number
    written two ways (``555-123-4567`` / ``555.123.4567``) is reported once.

    Returns
    -------
    list of str
        Sorted, unique digit strings; empty when nothing matches.
    """
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    digits_only = (re.sub(r'\D', '', number) for number in r.findall(string))
    return sorted(set(digits_only))

In [5]:
#Function to extract Email address from a string using regular expressions
def extract_email_addresses(string):
    """Return the unique e-mail addresses found in ``string``.

    The pattern allows ``.`` and ``-`` anywhere after the ``@``, so an address
    at the end of a sentence would otherwise be captured with its trailing
    full stop; trailing ``.``/``-`` characters are stripped from each match.

    Returns
    -------
    numpy.ndarray
        De-duplicated addresses, as returned by ``np.unique``.
    """
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    cleaned = [address.rstrip('.-') for address in r.findall(string)]
    return np.unique(cleaned)

In [6]:
def extract_twitter(string):
    """Return whatever follows ``twitter.com/`` in ``string``, one entry per match.

    Each captured value runs up to the next whitespace character or colon.
    The dot in the host name is escaped so that only a literal ``.`` matches.
    """
    r = re.compile(r'twitter\.com/([^\s:]+)')
    return r.findall(string)

In [7]:
def extract_linkedin(string):
    """Return whatever follows ``linkedin.com/`` in ``string``, one entry per match.

    Each captured value runs up to the next whitespace character or colon.
    The dot in the host name is escaped so that only a literal ``.`` matches.
    """
    r = re.compile(r'linkedin\.com/([^\s:]+)')
    return r.findall(string)

In [8]:
def extract_facebook(string):
    """Return whatever follows ``facebook.com/`` in ``string``, one entry per match.

    Each captured value runs up to the next whitespace character or colon.
    The dot in the host name is escaped so that only a literal ``.`` matches.
    """
    r = re.compile(r'facebook\.com/([^\s:]+)')
    return r.findall(string)

In [9]:
# Load the e-mail body: read the whole sample file into `content` as one string.
with open('courtnay samples/sample4.txt', 'r') as fp:
    content = fp.read()

In [10]:
# Flatten the text for more efficient matching: commas, line breaks and
# literal "\r\n" escape sequences are each replaced by a single space.
# HTML stripping is currently disabled:
#soup = BeautifulSoup(content, 'html.parser')
#content = soup.get_text()
string = (
    content
    .replace(',', ' ')
    .replace('\n', ' ')
    .replace('\r', ' ')
    .replace('\\r\\n', ' ')
)
# Asterisks are dropped as well; the result is what the extractors consume.
string1 = string.replace('*', '')
# Converting all the characters to lower case is currently disabled:
#string1 = string.lower()

In [12]:
# Run every extractor over the cleaned text.
# `names` keeps the array returned by extract_name; `org` and `emails` are
# converted to plain lists with .tolist().
names= extract_name(string1)
org = extract_org(string1).tolist()
emails = extract_email_addresses(string1).tolist()
#emails = extract_email_addresses(string1)
phone_num = extract_phone_numbers(string1)
# Social-media handles: whatever follows the site's host name in the text.
twitter = extract_twitter(string1)
linkedin = extract_linkedin(string1)
facebook = extract_facebook(string1)

In [13]:
# Start every accumulator as its own, independent empty list.
(
    contact_list,
    name_track,
    email_track,
    phone_track,
    twitter_track,
    linkedin_track,
    facebook_track,
    other_contacts,
) = ([] for _ in range(8))

In [14]:
# Pair every occurrence of each e-mail address with a person name found just
# before it and, when possible, a phone number found nearby.
contact_name = []
contact_num = []
contact_email = []
for email in emails:
    # The address is literal text, not a pattern: escape it so that '.', '+'
    # and similar characters are not interpreted as regex operators.
    for match in re.finditer(re.escape(email), string1):
        start_pos = match.start()
        end_pos = match.end()
        # Clamp the window at 0: a negative slice start would wrap around to
        # the end of the text and return the wrong (usually empty) span.
        name = extract_name(string1[max(0, start_pos - 50):start_pos])
        if len(name) == 0:
            # No person name in the 50 characters before the address: skip.
            continue
        # Prefer a number in the 100 characters before the address, then fall
        # back to the 150 characters after it.
        numbers = extract_phone_numbers(string1[max(0, start_pos - 100):start_pos])
        if len(numbers) == 0:
            numbers = extract_phone_numbers(string1[end_pos:end_pos + 150])
        # NOTE(review): str(name) stores the array's repr (e.g. "['A B']") and
        # 'null' is a string, not None; both are kept so the output format
        # stays unchanged.
        contact_name.append(str(name))
        contact_email.append(email)
        contact_num.append(numbers if len(numbers) > 0 else 'null')

In [15]:
# Assemble the three parallel lists into one table, one row per matched
# e-mail occurrence.
contact_list_new = pd.DataFrame({
    'contact_name': contact_name,
    'contact_email': contact_email,
    'contact_num': contact_num,
})

In [16]:
# Preview the table with one row per e-mail address; the result is only
# displayed here, `contact_list_new` itself is not modified.
contact_list_new.drop_duplicates(subset='contact_email')

Unnamed: 0,contact_name,contact_email,contact_num
0,['Courtenay Farquharson'],courtenay.farquharson@gmail.com,[1190504250]
3,['Courtenay Farquharson'],courtenay@parserr.com,
4,['Elyaas Mohammed'],v-elmoha@microsoft.com,
7,['Mohammed Imtiyaz Ali'],v-moimt@microsoft.com,
8,['Nasmin Kaisar'],v-naskai@microsoft.com,[1190504250]
13,['Saikumar Jhingade'],v-sajhin@microsoft.com,[1190504250]


In [17]:
# One row per e-mail address. reset_index() renumbers the rows and keeps the
# previous row labels in an 'index' column.
contact_list = (
    contact_list_new
    .drop_duplicates(subset='contact_email')
    .reset_index()
)

In [18]:
# Record names and e-mail addresses that were NOT paired into a contact row.
name_track = contact_list['contact_name']
email_track = contact_list['contact_email']
# `x in series` tests the Series *index labels*, not its values, so membership
# has to be checked against plain Python collections built from the values.
paired_names = name_track.tolist()
paired_emails = set(email_track)
for i in names:
    # contact_name holds the string form of a name array (e.g. "['A B']"), so
    # look for the name inside each stored string.
    # NOTE(review): a substring test also treats a name as paired when it is
    # part of a longer paired name.
    if not any(i in paired for paired in paired_names):
        contact = {'name':i,'email':''}
        other_contacts.append(contact)
for i in emails:
    if i not in paired_emails:
        contact = {'name':'','email':i}
        other_contacts.append(contact)

In [20]:
# Rebind contact_list to a plain dict keyed by row position; every record
# also carries the pre-reset row label under 'index'.
contact_list = (
    contact_list_new
    .drop_duplicates(subset='contact_email')
    .reset_index()
    .to_dict(orient='index')
)

In [21]:
# Gather every extracted field into a single payload.
response = {
    'contact': contact_list,
    #'other_contact': other_contacts,
    'organisation': org,
    'phone number': phone_num,
    'twitter handle': twitter,
    'linkedin handle': linkedin,
    'facebook handle': facebook,
}

In [22]:
# Display the assembled payload.
response

{'contact': {0: {'index': 0,
   'contact_name': "['Courtenay Farquharson']",
   'contact_email': 'courtenay.farquharson@gmail.com',
   'contact_num': ['1190504250']},
  1: {'index': 3,
   'contact_name': "['Courtenay Farquharson']",
   'contact_email': 'courtenay@parserr.com',
   'contact_num': 'null'},
  2: {'index': 4,
   'contact_name': "['Elyaas Mohammed']",
   'contact_email': 'v-elmoha@microsoft.com',
   'contact_num': 'null'},
  3: {'index': 7,
   'contact_name': "['Mohammed Imtiyaz Ali']",
   'contact_email': 'v-moimt@microsoft.com',
   'contact_num': 'null'},
  4: {'index': 8,
   'contact_name': "['Nasmin Kaisar']",
   'contact_email': 'v-naskai@microsoft.com',
   'contact_num': ['1190504250']},
  5: {'index': 13,
   'contact_name': "['Saikumar Jhingade']",
   'contact_email': 'v-sajhin@microsoft.com',
   'contact_num': ['1190504250']}},
 'organisation': ['Azure',
  'Azure Announcements',
  'Azure Subscription Management Support',
  'Azure Support',
  'Billing & Payment',
  'C

In [23]:
# Serialise the payload to a JSON string (the integer row keys of 'contact'
# come out as strings).
json_resp = json.dumps(response)

In [24]:
# Display the serialised JSON string.
json_resp

'{"contact": {"0": {"index": 0, "contact_name": "[\'Courtenay Farquharson\']", "contact_email": "courtenay.farquharson@gmail.com", "contact_num": ["1190504250"]}, "1": {"index": 3, "contact_name": "[\'Courtenay Farquharson\']", "contact_email": "courtenay@parserr.com", "contact_num": "null"}, "2": {"index": 4, "contact_name": "[\'Elyaas Mohammed\']", "contact_email": "v-elmoha@microsoft.com", "contact_num": "null"}, "3": {"index": 7, "contact_name": "[\'Mohammed Imtiyaz Ali\']", "contact_email": "v-moimt@microsoft.com", "contact_num": "null"}, "4": {"index": 8, "contact_name": "[\'Nasmin Kaisar\']", "contact_email": "v-naskai@microsoft.com", "contact_num": ["1190504250"]}, "5": {"index": 13, "contact_name": "[\'Saikumar Jhingade\']", "contact_email": "v-sajhin@microsoft.com", "contact_num": ["1190504250"]}}, "organisation": ["Azure", "Azure Announcements", "Azure Subscription Management Support", "Azure Support", "Billing & Payment", "Courtenay", "Elyaas", "MINDTREE", "Microsoft", "Mic