In [1]:
import pandas as pd
import re 
import spacy 
import os 
import numpy as np
import altair as alt
import textdistance
from sklearn.preprocessing import MultiLabelBinarizer
# Switch Altair to its 'json' data transformer.
# NOTE(review): presumably chosen to keep chart data out of the notebook file — confirm against the Altair docs.
alt.data_transformers.enable('json')
# Allow up to 100 characters per cell when pandas renders a frame.
pd.options.display.max_colwidth = 100

In [2]:
# Load the spaCy pipeline named "en_core_web_sm".
# NOTE(review): `nlp` is not referenced by any other cell visible in this section — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Load the job-title table from the shared clean-data folder.
df_work_title = pd.read_csv("../../../Glentel Inc/HR Analytics - Documents/Capstone Data/ubc_mds_team_share/06_clean_data/work_title.csv")

# Normalise the headers: trim, lower-case, spaces -> underscores, parentheses removed.
# regex=False makes every replacement a literal match, so '(' and ')' are never
# interpreted as regular-expression syntax, whatever the installed pandas default is.
df_work_title.columns = (
    df_work_title.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Drop the saved index column, the employee name, every `*_jobtitle` column,
# `no_work_title` and the seventh title slot; the displayed result keeps
# employee_code plus work1..work6 title labels.
df_work_title = df_work_title.drop(
    columns=[
        'unnamed:_0', 'employee_name', 'administrative_jobtitle', 'assistant_manager_jobtitle',
        'blue_collar_jobtitle', 'cashier_jobtitle', 'cook_jobtitle',
        'customer_service_representative_jobtitle', 'driver_jobtitle',
        'education_jobtitle', 'financial_services_jobtitle',
        'fitness_sports_jobtitle', 'manager_jobtitle', 'no_work_title',
        'other_jobtitle', 'sales_associate_jobtitle', 'server_jobtitle',
        'technicians_jobtitle', 'telemarketers_jobtitle', 'work7_title_label',
    ]
)

In [9]:
# Inspect the trimmed title table (employee_code plus the work1..work6 title labels).
df_work_title

Unnamed: 0,employee_code,work1_title_label,work2_title_label,work3_title_label,work4_title_label,work5_title_label,work6_title_label
0,BDP,sales associate,sales associate,sales associate,not_specified,not_specified,not_specified
1,F6Q,sales associate,customer service representative,not_specified,not_specified,not_specified,not_specified
2,MRN,sales associate,cashier,not_specified,not_specified,not_specified,not_specified
3,MXO,manager,fitness/sports,administrative,sales associate,not_specified,not_specified
4,N0Y,sales associate,sales associate,customer service representative,fitness/sports,not_specified,not_specified
...,...,...,...,...,...,...,...
284,MUD,technicians,education,manager,administrative,not_specified,not_specified
285,L46,financial services,sales associate,administrative,not_specified,not_specified,not_specified
286,N87,other_jobtitle,manager,customer service representative,not_specified,not_specified,not_specified
287,ND9,cashier,other_jobtitle,sales associate,sales associate,not_specified,not_specified


In [10]:
# Show a single row to check the remaining column layout.
df_work_title.head(1)

Unnamed: 0,employee_code,work1_title_label,work2_title_label,work3_title_label,work4_title_label,work5_title_label,work6_title_label
0,BDP,sales associate,sales associate,sales associate,not_specified,not_specified,not_specified


In [11]:
# Load the work-experience table from the shared clean-data folder.
df_work_exp = pd.read_csv("../../../Glentel Inc/HR Analytics - Documents/Capstone Data/ubc_mds_team_share/06_clean_data/work_experience.csv")

# Normalise the headers: trim, lower-case, spaces -> underscores, parentheses removed.
# regex=False makes every replacement a literal match, so '(' and ')' are never
# interpreted as regular-expression syntax, whatever the installed pandas default is.
df_work_exp.columns = (
    df_work_exp.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Remove the employer-name column of every work slot (work1..work7).
df_work_exp = df_work_exp.drop(
    columns=[
        'work1_company', 'work2_company', 'work3_company', 'work4_company',
        'work5_company', 'work6_company', 'work7_company',
    ]
)

In [12]:
# Join the experience table with the title table on employee_code
# (no `how` is given, so pandas' default join type applies).
df_work_exp_time = df_work_exp.merge(df_work_title, on='employee_code')

In [13]:
# Keep employee_code followed by (length, title label) for work slots 1..6.
slot_title_cols = [f'work{slot}_title_label' for slot in range(1, 7)]
slot_ordered_cols = ['employee_code'] + [
    col
    for slot in range(1, 7)
    for col in (f'work{slot}_length', f'work{slot}_title_label')
]

# Fill missing values with something of the right type for each column.
# Title labels get the 'not_specified' placeholder already used in the data;
# filling them with the integer 0 would make the later substring checks
# (`'service' in title`) raise TypeError. Every other column keeps the
# original 0 fill.
df_work_exp_time = (
    df_work_exp_time[slot_ordered_cols]
    .fillna(dict.fromkeys(slot_title_cols, 'not_specified'))
    .fillna(0)
)

In [14]:
# List the distinct job-title labels that occur in the second work slot.
df_work_exp_time['work2_title_label'].unique()

array(['sales associate', 'customer service representative', 'cashier',
       'fitness/sports', 'telemarketers', 'manager', 'other_jobtitle',
       'assistant manager', 'education', 'technicians', 'not_specified',
       'server', 'blue collar', 'administrative', 'financial services',
       'driver'], dtype=object)

In [15]:
# Replace the length of every work slot (1..6) with its absolute value,
# so no negative durations remain.
length_columns = [f'work{slot}_length' for slot in range(1, 7)]
for length_column in length_columns:
    df_work_exp_time[length_column] = np.abs(df_work_exp_time[length_column])

In [16]:
# Per-employee totals of the work-slot lengths, split into three experience buckets.
n_employees = df_work_exp_time.shape[0]
sales_exp = np.zeros(n_employees)
customer_ser_exp = np.zeros(n_employees)
leader_ship_exp = np.zeros(n_employees)

# Accumulate one work slot at a time over whole columns. This avoids the
# row-by-row `df[col][i]` label lookups, which only work when the frame has a
# default 0..n-1 index.
for j in range(1, 7):
    # astype(str) keeps the substring tests safe if a label is not a string.
    titles = df_work_exp_time['work' + str(j) + '_title_label'].astype(str)
    months = df_work_exp_time['work' + str(j) + '_length'].to_numpy(dtype=float)

    # Sales: exact label match only.
    is_sales = (titles == 'sales associate').to_numpy()

    # Customer service: any label containing 'service' or 'server'.
    # NOTE(review): 'financial services' contains 'service' and is therefore
    # counted here too — confirm that is intended.
    is_customer_service = (
        titles.str.contains('service', regex=False)
        | titles.str.contains('server', regex=False)
    ).to_numpy()

    # Leadership: any label containing 'manager' (this includes 'assistant manager').
    is_leadership = titles.str.contains('manager', regex=False).to_numpy()

    sales_exp += np.where(is_sales, months, 0.0)
    customer_ser_exp += np.where(is_customer_service, months, 0.0)
    leader_ship_exp += np.where(is_leadership, months, 0.0)

In [17]:
# Attach the three accumulated totals as new columns, in this order.
df_work_exp_time = df_work_exp_time.assign(
    sales_exp_months=sales_exp,
    customer_serv_exp_months=customer_ser_exp,
    leader_ship_exp_months=leader_ship_exp,
)

In [18]:
# The per-slot length and title columns have been summarised into the three
# totals, so remove them for work slots 1..6.
per_slot_columns = [
    f'work{slot}_{field}'
    for slot in range(1, 7)
    for field in ('length', 'title_label')
]
df_work_exp_time = df_work_exp_time.drop(columns=per_slot_columns)

In [19]:
# Write the per-employee experience totals to the shared clean-data folder.
# NOTE(review): no `index=` argument is passed, so the row index is presumably written as an extra
# unnamed first column (cf. the 'unnamed:_0' column dropped after loading work_title.csv) —
# confirm downstream readers expect it before changing.
df_work_exp_time.to_csv('../../../Glentel Inc/HR Analytics - Documents/Capstone Data/ubc_mds_team_share/06_clean_data/sales_custom_exp.csv')