In [1]:
#Import necessary libraries
import pandas as pd 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download as nltk_download
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras import Model
from tensorflow.keras import Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [2]:
#Load data into memory
# Read the SOC table, the CIP table and the CIP-to-SOC crosswalk, in that order;
# all three CSV files are decoded as cp1252.
socs, cips, soc_to_cip = (
    pd.read_csv(csv_path, encoding='cp1252')
    for csv_path in ('data/soc.csv', 'data/cip.csv', 'data/cip_to_soc.csv')
)

In [3]:
# Preview the first rows of the SOC table (columns: SOC Title, SOC Definition).
socs.head()

Unnamed: 0,SOC Title,SOC Definition
0,Chief Executives,Determine and formulate policies and provide o...
1,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
2,Legislators,"Develop, introduce, or enact laws and statutes..."
3,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
4,Marketing Managers,"Plan, direct, or coordinate marketing policies..."


In [4]:
# Preview the first rows of the CIP table (columns: Title, Definition).
cips.head()

Unnamed: 0,Title,Definition
0,3-D Modeling and Design Technology/Technician.,A program that prepares individuals to apply t...
1,Abdominal Radiology Fellowship Program.,A fellowship training program that prepares di...
2,Accent Reduction/Modification.,A program that focuses on accent modification ...
3,Accounting and Business/Management.,An integrated or combined program in accountin...
4,Accounting and Computer Science.,A program that combines accounting with comput...


In [5]:
# Preview the first rows of the crosswalk (columns: CIP Title, SOC Title).
soc_to_cip.head()

Unnamed: 0,CIP Title,SOC Title
0,"Agriculture, General.",Animal Scientists
1,"Agriculture, General.",Food Scientists and Technologists
2,"Agriculture, General.",Soil and Plant Scientists
3,"Agriculture, General.",Agricultural Technicians
4,"Agriculture, General.","Agricultural Sciences Teachers, Postsecondary"


In [6]:
#Creating a dictionary containing list of careers given CIP
# Keys are CIP titles; each value is the list of SOC titles paired with that CIP
# title in the crosswalk, in first-seen order and without repeats.
cip_dictionary = {}
for row_label, crosswalk_row in soc_to_cip.iterrows():
    linked_titles = cip_dictionary.setdefault(crosswalk_row['CIP Title'], [])
    soc_title = crosswalk_row['SOC Title']
    if soc_title not in linked_titles:
        linked_titles.append(soc_title)


In [None]:
#Creating dataset for training
# One labelled row is produced for every (CIP, SOC) pair whose CIP title appears
# in the crosswalk dictionary. The label is 1 when the crosswalk links the two
# titles and 0 otherwise. Rows are emitted CIP-major, in table order.
dataset = []

# Extract the SOC columns once; re-running socs.iterrows() for every CIP row
# rebuilds a Series per pair and dominates the run time of this cell.
soc_pairs = list(zip(socs['SOC Title'], socs['SOC Definition']))

#Iterate through each soc code and cip code
for cip_title, cip_desc in zip(cips['Title'], cips['Definition']):
    #Only pull in cip codes that appear in the crosswalk
    if cip_title not in cip_dictionary:
        continue

    # Set membership keeps the inner loop cheap; the check above is done once
    # per CIP instead of once per pair.
    linked_socs = set(cip_dictionary[cip_title])
    for soc_title, soc_desc in soc_pairs:
        #Flag the pairing as a match (1) or not a match (0)
        dataset.append([
            cip_desc,
            soc_desc,
            1 if soc_title in linked_socs else 0
        ])

#Turn matches into data frame
df = pd.DataFrame(dataset, columns = ['CIP Desc','SOC Desc','Match'])

In [8]:
#Create dataset for creating word embeddings
# The corpus is every CIP definition followed by every SOC definition.
statements = cips['Definition'].tolist() + socs['SOC Definition'].tolist()

#Tokenize text for word embedding model
# Run the text through the Keras Tokenizer and map the ids back to words, so the
# word2vec corpus is made of exactly the tokens the Tokenizer produces.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(statements)
text_sequences = tokenizer.texts_to_sequences(statements)
index_to_word = tokenizer.index_word
tokenized_statements = [
    [index_to_word[token_id] for token_id in sequence]
    for sequence in text_sequences
]

#Create word embeddings
# min_count=1 keeps every token, so each Tokenizer word gets a vector.
w2v_model = Word2Vec(
    sentences=tokenized_statements,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
w2v_model.save('word2vec.model')

In [9]:
#Create pretrained embedding layer for model
# vocab lists the Tokenizer's words in word_index order. Row 0 of weight_matrix
# is left as zeros, and row k (k >= 1) holds the word2vec vector of vocab[k - 1].
vocab = list(tokenizer.word_index)
vocab_size = len(vocab) + 1
weight_matrix = np.zeros((vocab_size, w2v_model.vector_size))
for row_number, word in enumerate(vocab, start=1):
    weight_matrix[row_number] = w2v_model.wv[word]

In [19]:
# Two-branch text-matching model: each raw string input is vectorized, embedded
# with the frozen word2vec weights and encoded by a bidirectional LSTM. The two
# encodings are concatenated and passed to a dense head with a sigmoid output.
#
# TextVectorization (integer output) reserves id 0 for padding and id 1 for
# out-of-vocabulary tokens, so vocab[i] is emitted as id i + 2. weight_matrix
# stores vocab[i] in row i + 1, so a zero row is prepended to line the rows up
# with the emitted ids:
#   row 0 -> padding (zeros), row 1 -> OOV (zeros), row i + 2 -> vocab[i].
# The embedding table needs len(vocab) + 2 rows, which equals vocab_size + 1.
num_token_ids = vocab_size + 1
embedding_dim = weight_matrix.shape[1]
aligned_weights = np.vstack([np.zeros((1, embedding_dim)), weight_matrix])


def _build_text_branch(text_input):
    """Encode one string input into a fixed-size vector.

    Args:
        text_input: a Keras string Input of shape (1,).

    Returns:
        The flattened output of a Bidirectional(LSTM(20)) applied to the
        frozen pretrained embeddings of the vectorized text.
    """
    token_ids = TextVectorization(
        max_tokens = num_token_ids,
        pad_to_max_tokens = True,
        vocabulary = vocab
    )(text_input)
    embedded = Embedding(
        num_token_ids,
        embedding_dim,
        weights = [aligned_weights],
        trainable = False
    )(token_ids)
    encoded = Bidirectional(LSTM(20))(embedded)
    return Flatten()(encoded)


x1_input = Input(shape=(1,), dtype=tf.string)
x1 = _build_text_branch(x1_input)

x2_input = Input(shape=(1,), dtype=tf.string)
x2 = _build_text_branch(x2_input)

c = Concatenate()([x1, x2])
c = Dense(1000, activation = 'relu')(c)
c = Dropout(0.1)(c)
c = Dense(1, activation = 'sigmoid')(c)

keras_model = Model([x1_input, x2_input], c)
keras_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
text_vectorization_9 (TextVecto (None, None)         0           input_10[0][0]                   
__________________________________________________________________________________________________
text_vectorization_10 (TextVect (None, None)         0           input_11[0][0]                   
____________________________________________________________________________________________

In [39]:
#Splitting data for training and testing
# Build a class-balanced train/test split shaped for the two-input model:
#   train_x / test_x are [cip_texts, soc_texts], one (n, 1) array of strings per
#   model input; train_y / test_y are (n, 1) float32 arrays of 0/1 labels.
# Within each split the matching pairs come first, then the non-matching pairs.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42  # fixed so the shuffle and the negative sample are repeatable

#Shuffle the positives
matches = df[df['Match'] == 1].sample(frac = 1, random_state = SPLIT_SEED)

#Down-sample the negatives to the number of positives (keep all if there are fewer)
non_matches = df[df['Match'] == 0]
non_matches = non_matches.sample(
    n = min(len(matches), len(non_matches)),
    random_state = SPLIT_SEED
)


def _split_rows(frame):
    """Return (train_rows, test_rows): the first TRAIN_FRACTION of rows, rounded up, then the rest."""
    n_train = int(np.ceil(len(frame) * TRAIN_FRACTION))
    return frame.iloc[:n_train], frame.iloc[n_train:]


def _to_model_arrays(frame):
    """Return ([cip_texts, soc_texts], labels) for the rows of frame.

    Each text array has shape (n, 1) and holds the per-row description strings;
    labels has shape (n, 1) and dtype float32.
    """
    features = [
        frame[column].to_numpy(dtype = object).reshape(-1, 1)
        for column in ('CIP Desc', 'SOC Desc')
    ]
    labels = frame['Match'].to_numpy(dtype = 'float32').reshape(-1, 1)
    return features, labels


match_train, match_test = _split_rows(matches)
non_match_train, non_match_test = _split_rows(non_matches)

train_x, train_y = _to_model_arrays(pd.concat([match_train, non_match_train]))
test_x, test_y = _to_model_arrays(pd.concat([match_test, non_match_test]))

In [41]:
# Configure training: binary cross-entropy loss, Adam with its default settings,
# and AUC plus binary accuracy reported as metrics.
keras_model.compile(
    loss = BinaryCrossentropy(),
    optimizer = Adam(),
    metrics = [AUC(), BinaryAccuracy()],
)

9400


In [None]:
# Train for 10 epochs; the test split is passed as validation data.
keras_model.fit(train_x, train_y, epochs = 10, validation_data = (test_x, test_y))