# This is the API that takes the input text from the UI and returns a list of employee IDs.

In [17]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
from autocorrect import spell
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used below: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words('english').
# The second positional argument of nltk.download is the download directory,
# not another package name, so each resource gets its own call.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to stopwords...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def preprocess(text):
    """Turn free text into a list of spell-corrected, stemmed tokens.

    Processing steps, in order:
      1. replace every non-alphabetic character with a space,
      2. lower-case the text,
      3. tokenize it with ``word_tokenize``,
      4. drop English stop words,
      5. spell-correct each remaining token with ``spell`` and stem the
         result with ``PorterStemmer``.

    Args:
        text: the raw input string.

    Returns:
        A list of processed tokens, in their original order.
    """
    from nltk.stem import PorterStemmer

    # Keep letters only, then normalise case before tokenizing.
    letters_only = re.sub('[^A-Za-z]', ' ', text)
    tokens = word_tokenize(letters_only.lower())

    # Build the stop-word set once: the lookup is then O(1) per token and the
    # corpus is not re-read for every word.
    stop_words = set(stopwords.words('english'))

    # Filter into a new list rather than calling list.remove() while iterating:
    # removing during iteration skips the element that follows each removal,
    # which let consecutive stop words slip through.
    stemmer = PorterStemmer()
    return [stemmer.stem(spell(token))
            for token in tokens
            if token not in stop_words]

def stem_tokenize(text):
    """Tokenize ``text`` and return the Porter stem of each token, in order."""
    from nltk.stem import PorterStemmer
    stem = PorterStemmer().stem
    return list(map(stem, word_tokenize(text)))


In [53]:
def predictTopKProject(text, topK=5, vectorizer='countVectorizer.pkl',
                       embeddedProject='embeddedProject.csv',
                       employeeSimMatrix='employee_similarity_matrix.csv',
                       projectSimMatrix='project_similarity_matrix.csv'):
    """Rank the stored projects by their correlation with ``text``.

    The text is vectorized with a pickled vectorizer and compared, via the
    Pearson correlation coefficient, against every row of the project
    embedding table. Project IDs are returned best match first.

    Args:
        text: the input passed to the vectorizer's ``transform``.
        topK: number of project IDs to return, or the string ``'all'`` to
            return every project ID.
        vectorizer: path to the pickled vectorizer object; it must provide
            ``transform(...)`` whose result has ``toarray()``.
        embeddedProject: path to the CSV of project embeddings, one row per
            project, with a ``pID`` column used as the index.
        employeeSimMatrix: unused; kept for interface compatibility.
        projectSimMatrix: unused; kept for interface compatibility.

    Returns:
        A list of project IDs (values of the ``pID`` index) sorted by
        decreasing correlation. Projects whose correlation is undefined
        (NaN, e.g. a constant vector) are ranked last.
    """
    # Honour the caller-supplied path (it was previously hard-coded here).
    projectEmbedding = pd.read_csv(embeddedProject, index_col='pID')

    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only point `vectorizer` at a trusted, locally produced pickle.
    with open(vectorizer, 'rb') as f1:
        fittedVectorizer = pickle.load(f1)

    # NOTE(review): the original computed preprocess(text) but never used the
    # result; the vectorizer is applied to the raw text, exactly as before.
    # Presumably the pickled vectorizer carries its own tokenizer - confirm.
    textVector = np.asarray(fittedVectorizer.transform([text]).toarray()).ravel()

    # A constant vector has zero variance, so its correlation is NaN; silence
    # the resulting divide warnings because NaN is handled explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = np.array(
            [np.corrcoef(textVector, projectEmbedding.iloc[i, :])[0][1]
             for i in range(projectEmbedding.shape[0])],
            dtype=float)

    # Stable descending sort; argsort places NaN at the end, which keeps the
    # ranking deterministic (sorting NaN keys with sorted() is not).
    order = np.argsort(-scores, kind='stable')
    pIds = [projectEmbedding.index[ii] for ii in order]

    if topK == 'all':
        return pIds
    return pIds[0:topK]

def getEmployeeIDForPid(pId, projectTablePath = r'../data/project_M25_matched.txt'):
    """Collect the team members of the given projects.

    Args:
        pId: a sequence of project IDs; each one is converted to ``str``
            before being looked up.
        projectTablePath: path to a '|'-separated table with a ``pID`` column
            (used as the index) and a ``ProjectTeam`` column holding
            comma-separated employee IDs.

    Returns:
        ``None`` when ``pId`` is empty; otherwise a list with the employee IDs
        of every requested project, concatenated in the order of ``pId``
        (duplicates are not removed).
    """
    if len(pId) == 0:
        return None

    table = pd.read_csv(projectTablePath, sep='|', index_col='pID')
    # Compare IDs as strings regardless of how pandas parsed the column.
    table.index = table.index.map(str)

    members = []
    for project_id in pId:
        team = table.loc[str(project_id)]['ProjectTeam']
        members.extend(team.split(','))
    return members

In [54]:
def buildTeam(text):
    """Suggest employee IDs for the project described by ``text``.

    Looks up the best-matching project IDs with ``predictTopKProject``,
    prints them, and returns the employee IDs that
    ``getEmployeeIDForPid`` reports for those projects. Returns an empty
    list when no project ID was found.
    """
    matchedProjects = predictTopKProject(text)
    print(matchedProjects)
    # Nothing matched: no team to look up.
    if len(matchedProjects) == 0:
        return []
    return getEmployeeIDForPid(matchedProjects)

In [56]:
# Example run: build a team for a sample project description.
inputString = 'Tax, Payment and Compliance Solution meeting'
buildTeam(inputString)

[2, 95, 9, 48, 28]


['12069', '12107', '12113', '12220', '12167']

In [57]:
import sys

In [61]:
# Scratch check: number of entries in sys.argv for the current process.
len(sys.argv)

3