In [1]:
import os
import numpy as np
import pandas as pd
#from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
#from keras.models import Sequential
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
#from keras.metrics import categorical_accuracy

df = pd.read_csv('merged_rates/rates_analysis.csv').drop('Unnamed: 0', axis=1).rename(columns={'responsibility':'functional_responsibility'})
df.head()

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,functional_responsibility,...,2021_2022_min,2022_2023_min,2023_2024_min,2024_2025_min,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,...,481.62,,,,0.029002,0.028998,0.028993,,,
1,Program Manager,389.42,400.71,412.33,424.29,,,,Bachelor's,Program Managers plan and manage projects to c...,...,74.81,74.81,74.81,,0.028992,0.028999,0.029006,,,
2,Project Manager,252.85,260.18,267.73,275.49,,,,Bachelor's,"The Project Manager manages, plans and coordin...",...,67.63,67.63,67.63,134.58,0.02899,0.029018,0.028984,,,
3,Task Manager,190.81,196.34,202.03,207.89,,,,Bachelor's,Task Managers apply their broad management ski...,...,207.89,,,,0.028982,0.02898,0.029006,,,
4,Subject Matter Expert 1,225.85,232.4,239.14,246.08,,,,Bachelor's,The Subject Matter Expert 1 has industry exper...,...,246.08,,,,0.029002,0.029002,0.029021,,,


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8005 entries, 0 to 8004
Data columns (total 41 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   role                       8004 non-null   object 
 1   2018_2019                  7882 non-null   float64
 2   2019_2020                  7977 non-null   float64
 3   2020_2021                  8005 non-null   float64
 4   2021_2022                  7961 non-null   float64
 5   2022_2023                  381 non-null    float64
 6   2023_2024                  272 non-null    float64
 7   2024_2025                  28 non-null     float64
 8   education                  607 non-null    object 
 9   functional_responsibility  8005 non-null   object 
 10  yoe                        607 non-null    object 
 11  company                    8004 non-null   object 
 12  socio_economic             471 non-null    object 
 13  program                    0 non-null      float

In [3]:
# Get rid of one Null row for role
df = df[df.role.notnull()]

## Preprocess data and remove stopwords

There are 'roles' located in the 'responsibility' column.  They must be removed to better match not on the roles themselves but on the responsibilities.  

One problem with the job responsibilities is that we noticed that several roles are cut off in the middle of the explanation, which may hinder optimized NLP analysis.

Also, the job responsibilities must be preprocessed for normalization, removing special characters, numbers, multiple spaces, shorten words with ing, etc.  This will increase the odds of having matching words within responsibilities for a better role match.

In [4]:
import re

# Take out role in functional responsibility
without_role = []

for r,fr in zip(df.role, df['functional_responsibility']):
    without_role.append(fr.replace(str(r),''))


# Preprocess data for normalization
preprocessed = []

for row in without_role:
    phrase = re.sub('[\n]+',' ', row) # Substitute new line characters for spaces
    phrase = phrase.lower() # Lowercase all values
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase) # remove special characters
    phrase = re.sub('ing', ' ', phrase) # remove ing
    phrase = re.sub(r'[0-9]+', ' ', phrase) # remove numbers
    phrase = re.sub(' +', ' ', phrase) # remove double spaces
    preprocessed.append(phrase)
    
    
# Assign new data to a column and strip spaces
df['responsibility'] = preprocessed
df['responsibility'] = df.responsibility.str.strip()
df.head(1)

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,functional_responsibility,...,2022_2023_min,2023_2024_min,2024_2025_min,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change,responsibility
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,...,,,,0.029002,0.028998,0.028993,,,,the has overall accountability for business so...


Contractions are shortened words, e.g., don’t and can’t. Expanding such words to “do not” and “can not” helps to standardize text.

In [5]:
# Try and think of ways to expand contractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# Apply contraction
contracted = []
for row in df.responsibility:
    contracted.append(decontracted(row))
    
# reassign contraction
df['responsibility'] = contracted
df.head(1)

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,functional_responsibility,...,2022_2023_min,2023_2024_min,2024_2025_min,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change,responsibility
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,...,,,,0.029002,0.028998,0.028993,,,,the has overall accountability for business so...


Stopwords are the most common words in any natural language. For the purpose of analyzing text data and building NLP models, these stopwords might not add much value to the meaning of the document. Generally, the most common words used in a text are “the”, “is”, “in”, “for”, “where”, “when”, “to”, “at” etc.

In [6]:
# Encountering an error with nltk not being defined in order to download stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')

In [7]:
# Printed and copy stopwords instead
print(len(stopwords.words('english')),'\n', stopwords.words('english'))

179 
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'th

In [8]:
# Place stop words in list
my_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 
                'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 
                'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 
                "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 
                'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
                'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 
                'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 
                'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
                'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
                'through', 'during', 'before', 'after', 'above', 'below', 'to', 
                'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
                'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 
                'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 
                'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 
                'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 
                "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 
                've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', 
                "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
                'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', 
                "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 
                'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Create stopword removal function
def remove_mystopwords(sentence):
    text_tokens = sentence.split(" ")
    tokens_filtered= [word for word in text_tokens if not word in my_stopwords]
    return (" ").join(tokens_filtered)

# remove stopwords
df['responsibility'] = df.responsibility.apply(remove_mystopwords)
df['responsibility'] = df.responsibility.str.strip()
df.responsibility[:10]

0    overall accountability business solution progr...
1    plan manage projects control overall project s...
2    manages plans coordinates activities projects ...
3    apply broad management skills specialized func...
4    industry experience relevant subject matter in...
5    industry experience relevant subject matter in...
6    industry experience relevant subject matter in...
7    provides thought leadership related current fu...
8    provides thought leadership related current fu...
9    may develop run tests implement maintain opera...
Name: responsibility, dtype: object

Lemmatization is the process of converting a word to its base form, e.g., “caring” to “care”. We use spaCy’s lemmatizer to obtain the lemma, or base form, of the words. Sample code:

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm') # load spacy model, can be "en_core_web_sm"

# Lemmatization example
text = """he kept eating while we are talking"""
doc = nlp(text)
mytokens = [word.lemma_ if word.lemma_ != "-PrON-" else word.lower_ for word in doc]
print(mytokens) 

['-PRON-', 'keep', 'eat', 'while', '-PRON-', 'be', 'talk']


In [10]:
# Apply lemmatization in order to get the base form of a word
lemmatized = []

for text in df['responsibility']:
    doc = nlp(text)
    # Lemmatizing each token
    mytokens = [word.lemma_ if word.lemma_ != "-PrON-" else word.lower_ for word in doc]
    join_list = (' ').join(mytokens) # Join list of words into a sentence
    lemmatized.append(join_list) # Append to a list
    
df['responsibility'] = lemmatized
df.responsibility[:10]

0    overall accountability business solution progr...
1    plan manage project control overall project sc...
2    manages plan coordinate activity project indiv...
3    apply broad management skill specialized funct...
4    industry experience relevant subject matter in...
5    industry experience relevant subject matter in...
6    industry experience relevant subject matter in...
7    provide think leadership relate current future...
8    provide think leadership relate current future...
9    may develop run test implement maintain operat...
Name: responsibility, dtype: object

___

After initial preprocessing, I've noticed that many of the different level roles have the same job responsibility which will hurt separating role levels.  This is only from the CIOSP3 data.

## Cosine Similarity

In [11]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn


# Instaniate our lookup hash table
group_lookup = {}


# Write a function for cleaning strings and returning an array of ngrams
def ngrams_analyzer(string):
    string = re.sub(r'[,-./]', r'', string)
    ngrams = zip(*[string[i:] for i in range(5)])  # N-Gram length is 5
    return [''.join(ngram) for ngram in ngrams]


def find_group(row, col):
    # If either the row or the col string have already been given
    # a group, return that group. Otherwise return none
    if row in group_lookup:
        return group_lookup[row]
    elif col in group_lookup:
        return group_lookup[col]
    else:
        return None


def add_vals_to_lookup(group, row, col):
    # Once we know the group name, set it as the value
    # for both strings in the group_lookup
    group_lookup[row] = group
    group_lookup[col] = group


def add_pair_to_lookup(row, col):
    # in this function we'll add both the row and the col to the lookup
    group = find_group(row, col)  # first, see if one has already been added
    if group is not None:
        # if we already know the group, make sure both row and col are in lookup
        add_vals_to_lookup(group, row, col)
    else:
        # if we get here, we need to add a new group.
        # The name is arbitrary, so just make it the row
        add_vals_to_lookup(row, row, col)


# Construct vectorizer for building the TF-IDF matrix
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)

# Grab the column you'd like to group, filter out duplicate values
# and make sure the values are Unicode
vals = df['role'].unique().astype('U')

# Build the matrix!!!
tf_idf_matrix = vectorizer.fit_transform(vals)

cosine_matrix = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), vals.size, 0.8)

# Build a coordinate matrix
coo_matrix = cosine_matrix.tocoo()

# for each row and column in coo_matrix
# if they're not the same string add them to the group lookup
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        add_pair_to_lookup(vals[row], vals[col])

df['Group'] = df['role'].map(group_lookup).fillna(df['role'])

In [12]:
df.head()

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,functional_responsibility,...,2023_2024_min,2024_2025_min,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change,responsibility,Group
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,...,,,0.029002,0.028998,0.028993,,,,overall accountability business solution progr...,Senior Program Manager
1,Program Manager,389.42,400.71,412.33,424.29,,,,Bachelor's,Program Managers plan and manage projects to c...,...,74.81,,0.028992,0.028999,0.029006,,,,plan manage project control overall project sc...,Program Manager
2,Project Manager,252.85,260.18,267.73,275.49,,,,Bachelor's,"The Project Manager manages, plans and coordin...",...,67.63,134.58,0.02899,0.029018,0.028984,,,,manages plan coordinate activity project indiv...,Project Manager
3,Task Manager,190.81,196.34,202.03,207.89,,,,Bachelor's,Task Managers apply their broad management ski...,...,,,0.028982,0.02898,0.029006,,,,apply broad management skill specialized funct...,Task Manager
4,Subject Matter Expert 1,225.85,232.4,239.14,246.08,,,,Bachelor's,The Subject Matter Expert 1 has industry exper...,...,,,0.029002,0.029002,0.029021,,,,industry experience relevant subject matter in...,Subject Matter Expert 1


In [13]:
group_len = len(df.Group.value_counts())
df.Group.value_counts()

Subject Matter Expert 1                   179
Computer Systems Analyst                  168
Database Specialist IV                    166
Database Management Specialist            163
Business Process Reengineering Analyst    163
                                         ... 
Staff Scientist                             1
Web Tech Administrator                      1
Computer Systems Programmer                 1
Eng Senior: Program Manager                 1
Principal Systems Analyst                   1
Name: Group, Length: 404, dtype: int64

In [14]:
role_len = len(df.role.value_counts())
df.role.value_counts()

Program Manager                              60
Network Administrator                        56
Help Desk Specialist                         56
Web Designer                                 56
Database Administrator                       56
                                             ..
Expert Technologist I                         1
Consultant 1                                  1
Customer Service and Support Technician 2     1
Business Analyst                              1
Administrative Support Specialist 2           1
Name: role, Length: 675, dtype: int64

In [15]:
# Percentage difference
per_diff = (abs(role_len - group_len)) / ((role_len + group_len) / 2)
per_diff

0.5023169601482854

In [16]:
# Percentage change
per_change = (group_len - role_len) / role_len
per_change

-0.4014814814814815

In [17]:
df[df.Group == 'Subject Matter Expert 1'].role.value_counts()

Subject Matter Expert Level II     54
Subject Matter Expert Level I      54
Subject Matter Expert Level III    54
Subject Matter Expert III           3
Subject Matter Expert I             3
Subject Matter Expert II            3
Subject Matter Expert IV            3
Subject Matter Expert 1             1
Subject Matter Expert 2             1
Subject Matter Expert VI            1
Subject Matter Expert V             1
Subject Matter Expert 3             1
Name: role, dtype: int64

In [18]:
# See if many of these roles have the same responsibility in various companies
resp_check = df[df.Group == 'Subject Matter Expert 1']
resp_check.drop_duplicates(subset='company')
# Which they do

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,functional_responsibility,...,2023_2024_min,2024_2025_min,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change,responsibility,Group
4,Subject Matter Expert 1,225.85,232.4,239.14,246.08,,,,Bachelor's,The Subject Matter Expert 1 has industry exper...,...,,,0.029002,0.029002,0.029021,,,,industry experience relevant subject matter in...,Subject Matter Expert 1
379,Subject Matter Expert IV,242.59,242.59,242.59,242.59,,,,PhD,"Provides executive-level technical, managerial...",...,,,0.0,0.0,0.0,,,,provide executive level technical managerial a...,Subject Matter Expert 1
457,Subject Matter Expert IV,320.47,327.68,335.05,,,,,Bachelor’s Degree,Serve as subject matter expert in a functional...,...,,,0.022498,0.022491,,,,,serve subject matter expert functional technic...,Subject Matter Expert 1
584,Subject Matter Expert I,49.39,50.33,51.29,52.26,,,,Associate's,Use Oracle-based system in a Client/Server env...,...,,,0.019032,0.019074,0.018912,,,,use oracle base system client server environme...,Subject Matter Expert 1
6384,Subject Matter Expert Level I,165.2,168.5,171.87,175.31,,,,,Subject Matter Expert Level I Provide technica...,...,,,0.019976,0.02,0.020015,,,,provide technical knowledge analysis highly sp...,Subject Matter Expert 1
6385,Subject Matter Expert Level I,175.02,180.42,185.94,191.69,,,,,Subject Matter Expert ? Level I Provide techni...,...,,,0.030854,0.030595,0.030924,,,,subject matter expert level provide technical ...,Subject Matter Expert 1
6386,Subject Matter Expert Level I,117.74,120.1,122.49,124.96,,,,,Subject Matter Expert ? Level I Provide techni...,...,,,0.020044,0.0199,0.020165,,,,subject matter expert level provide technical ...,Subject Matter Expert 1
6388,Subject Matter Expert Level I,165.24,170.2,175.32,180.6,,,,,Subject Matter Expert ? Level I Provide techni...,...,,,0.030017,0.030082,0.030116,,,,subject matter expert level provide technical ...,Subject Matter Expert 1
6389,Subject Matter Expert Level I,147.73,151.72,155.82,160.03,,,,,Subject Matter Expert ? Level I Provide techni...,...,,,0.027009,0.027023,0.027018,,,,subject matter expert level provide technical ...,Subject Matter Expert 1
6390,Subject Matter Expert Level I,117.04,119.96,122.96,126.04,,,,,Subject Matter Expert ? Level I Provide techni...,...,,,0.024949,0.025008,0.025049,,,,subject matter expert level provide technical ...,Subject Matter Expert 1


In [None]:
# Construct vectorizer for building the TF-IDF matrix
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)

# Grab the column you'd like to group, filter out duplicate values
# and make sure the values are Unicode
vals = df['responsibility'].unique().astype('U')

# Build the matrix!!!
tf_idf_matrix = vectorizer.fit_transform(vals)

cosine_matrix = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), vals.size, 0.8)

# Build a coordinate matrix
coo_matrix = cosine_matrix.tocoo()

# for each row and column in coo_matrix
# if they're not the same string add them to the group lookup
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        add_pair_to_lookup(vals[row], vals[col])

df['Group_2'] = df['responsibility'].map(group_lookup).fillna(df['responsibility'])

In [None]:
resp_len = len(df.responsibility.value_counts())
df['responsibility'].value_counts()

In [None]:
group2_len = len(df.Group_2.value_counts())
df.Group_2.value_counts()

In [None]:
# Percentage difference
per_diff = (abs(resp_len - group2_len)) / ((resp_len + group2_len) / 2)
per_diff

In [None]:
# Percentage change
per_change = (group2_len - resp_len) / resp_len
per_change

In [None]:
phrase = 'organizes direct network installation site survey assess document current site network configuration user requirement design optimize network topology direct lead preparation engineer plan site installation technical design package develop installation schedule prepare site installation test report coordinate post installation operation maintenance support'

In [None]:
# Look into the roles more individually to see what the actual roles are
df[df.Group_2.str.contains(phrase)].role.value_counts()

In [None]:
phrase = 'assist daily activity configuration operation system may mainframe mini client server base assist optimiz system operation resource utilization perform system capacity analysis plann provide assistance user access us business system commensurate experience education'

In [None]:
# Look into the roles more individually to see what the actual roles are
df[df.Group_2.str.contains(phrase)].role.value_counts()

In [None]:
# See if many of these roles have the same responsibility in various companies
resp_check = df[df.Group_2 == phrase]
resp_check.drop_duplicates(subset='company')
# Which they do

In [None]:
# Adjusted cosine cut off to 0.6

In [None]:
phrase = 'provide configuration management plann describe provision configuration identification change control configuration status account configuration audits regulate change process approve validate change incorporate product document relate software'

In [None]:
df[df.Group_2.str.contains(phrase)].role.value_counts()

In [None]:
# partial phrase
phrase = 'measurement technique'
# Note that the substring has to match a substring within the string, 2 words can't be in 2 different places
df[df.Group_2.str.contains(phrase)].role.value_counts()

In [None]:
# Create away to put in a series of words and have it return all responsibilities with it, then all roles
# Must turn haystack into df.role and loop through each one
import re

def find_string(needle, haystack):
    re_string = '.*'.join(needle)
    rEGEX = re.compile(re_string)
    return rEGEX.findall(haystack)

___
## Topic Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import nltk
nltk.download('stopwords')
import pandas as pd
import re
import math

In [None]:
# Tokenize text (split words into a list)
def split_line(text):
    # split the text
    words = text.split()
    return words

df['Tokenized'] = df.responsibility.apply(split_line)
df.head()

In [None]:
# Create bag of words and find most frequent words
def get_most_freq_words(str, n=None):
    vect = CountVectorizer().fit(str)
    bag_of_words = vect.transform(str)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    freq =sorted(freq, key = lambda x: x[1], reverse=True)
    return freq[:n]
  
get_most_freq_words([ word for responsibility in df.Tokenized for word in responsibility],10)

In [None]:
# Find number of topics


# build a dictionary where for each tweet, each word has its own id.
# We have 6882 tweets and 10893 words in the dictionary.
responsibility_dictionary = Dictionary(df.Tokenized)

# build the corpus i.e. vectors with the number of occurence of each word per tweet
responsibility_corpus = [responsibility_dictionary.doc2bow(responsibility) for responsibility in df.Tokenized]

# compute coherence
responsibility_coherence = []
for nb_topics in range(1,36):
    lda = LdaModel(responsibility_corpus, num_topics = nb_topics, id2word = responsibility_dictionary, passes=10)
    cohm = CoherenceModel(model=lda, corpus=responsibility_corpus, dictionary=responsibility_dictionary, coherence='u_mass')
    coh = cohm.get_coherence()
    responsibility_coherence.append(coh)

In [None]:
# visualize coherence (number of topics is where the plateau begins)
plt.figure(figsize=(10,5))
plt.plot(range(1,36),responsibility_coherence)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.show()

In [None]:
k = 15
responsibility_lda = LdaModel(responsibility_corpus, num_topics = k, id2word = responsibility_dictionary, passes=10)

def plot_top_words(lda=responsibility_lda, nb_topics=k, nb_words=10):
    top_words = [[word for word,_ in lda.show_topic(topic_id, topn=50)] for topic_id in range(lda.num_topics)]
    top_betas = [[beta for _,beta in lda.show_topic(topic_id, topn=50)] for topic_id in range(lda.num_topics)]

    gs  = gridspec.GridSpec(round(math.sqrt(k))+1,round(math.sqrt(k))+1)
    gs.update(wspace=0.5, hspace=0.5)
    plt.figure(figsize=(20,15))
    for i in range(nb_topics):
        ax = plt.subplot(gs[i])
        plt.barh(range(nb_words), top_betas[i][:nb_words], align='center',color='blue', ecolor='black')
        ax.invert_yaxis()
        ax.set_yticks(range(nb_words))
        ax.set_yticklabels(top_words[i][:nb_words])
        plt.title("Topic "+str(i))
        
  
plot_top_words()

In [None]:
# partial phrase
phrase = 'implement database security'
# Note that the substring has to match a substring within the string, 2 words can't be in 2 different places
df[df.Group_2.str.contains(phrase)].role.value_counts()

In [None]:
# partial phrase
phrase = 'infrastructure'
# Note that the substring has to match a substring within the string, 2 words can't be in 2 different places
df[df.Group_2.str.contains(phrase)].role.value_counts()

In [None]:
# This is not good as well because we poses an 'OR' statement but just drops duplicates
phrase = 'implement database security'
role_search = split_line(phrase)

df2 = df[df.responsibility.str.contains('|'.join(role_search))].drop_duplicates('responsibility')
df2

In [None]:
phrase = 'implement test run'
role_search = split_line(phrase)

for tokenized in df.Tokenized:
    if all(item in tokenized for item in role_search):
        print(tokenized)

In [None]:
phrase = 'implement test run'
role_search = split_line(phrase)

for responsibility in df.responsibility:
    if all(item in responsibility for item in role_search):
        print(responsibility)

In [None]:
phrase = 'implement test run'
role_search = split_line(phrase)

roles = []
for responsibility in df.responsibility:
    if all(item in responsibility for item in role_search):
        roles.append(responsibility)
        
df2 = df[df.responsibility.str.contains('|'.join(roles))].drop_duplicates('responsibility')
df2

In [None]:
phrase = 'analyze system'
role_search = split_line(phrase)

roles = []
for responsibility in df.responsibility:
    if all(item in responsibility for item in role_search):
        roles.append(responsibility)
        
df2 = df[df.responsibility.str.contains('|'.join(roles))].drop_duplicates('responsibility')
df2

In [None]:
df2.role.value_counts()

In [None]:
df2.company.value_counts()

In [None]:
df2.responsibility.nunique()

In [None]:
plot_top_words()

In [None]:
def find_role(phrase):
    global df2
    role_search = split_line(phrase)
    
    roles = []
    for responsibility in df.responsibility:
        if all(item in responsibility for item in role_search):
            roles.append(responsibility)
        
    df2 = df[df.responsibility.str.contains('|'.join(roles))].drop_duplicates('responsibility')
    df2 = df2.drop(['Group', 'Group_2', 'Tokenized'], axis=1)
    
    import sweetviz as sv

    my_report = sv.analyze(df2)
    # my_report.show_html() # Default arguments will generate to "SWEETVIZ_rEPOrT.html"
    return my_report.show_notebook()
    

In [None]:
df[df.company == 'Gunnison Consulting']

In [None]:
find_role('implement database security')

In [None]:
df2