IMPORTS

In [34]:
import re
import itertools
from collections import defaultdict
import csv

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy and nltk
import spacy
from nltk.corpus import stopwords

#visuals
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

#pandas for reading excel file
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


PREPARING THE DATA

In [35]:
df = pd.read_csv("allData.csv")
data = df["JOB DESCRIPTION"]


STOPWORDS

In [36]:
stopwords = stopwords.words("english")
stopwords.extend(['actual', 'additional', 'advancement_opportunitie', 'airline', 'also', 'always', 'andor', 'anywhere',
                  'applicable', 'applicant', 'application', 'apply', 'approximately', 'area', 'available', 'aviation', 'away',
                  'benefit', 'bonus', 'candidate', 'club', 'color', 'company', 'companypaid',
                  'compensation', 'complete', 'condition', 'consideration', 'covid', 'crew', 'current',
                  'date', 'degree', 'demonstrate', 'disability', 'do',  'duty',
                  'eg', 'employee', 'employer', 'employment', 'equity', 'especially', 'ethic', 'even', 'ever', 
                  'fixedwe', 'flight', 'fly', 'fom', 'gather', 'gender', 'get', 'gs', 'healthcare', 'hire', 'hiring', 'however', 'incentive',
                  'include', 'income', 'insurance', 'job', 'least', 'letter', 'little', 'look', 'make', 'manner', 'many', 'marital',
                  'match', 'medical', 'memeber', 'military', 'much', 'nd', 'offer', 'nearly', 'often', 'oh', 'on', 'opportunity',
                  'other', 'otherwise', 'package', 'parental', 'part', 'passport', 'pay', 'payment', 'perform', 'person', 'pilot', 'policy',
                  'position', 'primary', 'pregnancy', 'race', 'rd', 're', 'regard', 'relate', 'religion', 'resume', 'retire', 'retirement',
                  'salary', 'self', 'set', 'sex', 'skill', 'sometimes', 'special', 'state', 'st', 'still', 'stipend', 'submit', 'total', 'truly', 'type',
                  'unusual', 'use', 'veteran', 'well', 'whole', 'will', 'work'])

REMOVE PUNCTUATION

In [37]:
temp = []

for text in data:
    if isinstance(text, str):
        temp.append(text.replace('\n', ' '))
    else:
        pass


data = temp

data = [re.sub(r'[^\w\s]','', text) for text in data]

In [38]:
def removeStopwords(texts):
    returnList = []
    for text in texts:
        textArr = text.split(' ')
        string = " ".join([i for i in textArr if i.lower() not in stopwords])
        returnList.append(string)
        
    return returnList

stopwordsData = removeStopwords(data)

remove URLs

In [39]:
noURLData = [re.sub('http://\S+|https://\S+', '', text) for text in stopwordsData]

LEMMATIZE

In [40]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)

lemmatizedTexts = lemmatization(noURLData)

print(lemmatizedTexts[1])

mission provide airborne platform support specific research development program provide method validation field collect datum aircraft fly maintain manage professional staff test pilot certify maintenance technician administrative personnel first priority safety operation conduct use procedure equipment meet exceed requirement result various past airborne testing program missionspecific procedure develop procedure provide safe effective successful operation create support early airtoair collision avoidance research program sponsor need extensive airborne testing increase expand support variety program currently operate experimentally certificate commercially derivative airborne test bed range size small single engine large turboprop corporate jet essential member team responsible conducting mission safety test planning operate highly modify man unmanned aircraft purpose call evaluate handling quality performance modify test bed aircraft operate aircraft sensor datum collection flight t

LOWERCASE WORDS, REMOVE SHORT AND LONG WORDS

In [41]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

dataWords = gen_words(lemmatizedTexts)

print (dataWords[1])

['mission', 'provide', 'airborne', 'platform', 'support', 'specific', 'research', 'development', 'program', 'provide', 'method', 'validation', 'field', 'collect', 'datum', 'aircraft', 'fly', 'maintain', 'manage', 'professional', 'staff', 'test', 'pilot', 'certify', 'maintenance', 'technician', 'administrative', 'personnel', 'first', 'priority', 'safety', 'operation', 'conduct', 'use', 'procedure', 'equipment', 'meet', 'exceed', 'requirement', 'result', 'various', 'past', 'airborne', 'testing', 'program', 'missionspecific', 'procedure', 'develop', 'procedure', 'provide', 'safe', 'effective', 'successful', 'operation', 'create', 'support', 'early', 'airtoair', 'collision', 'avoidance', 'research', 'program', 'sponsor', 'need', 'extensive', 'airborne', 'testing', 'increase', 'expand', 'support', 'variety', 'program', 'currently', 'operate', 'experimentally', 'certificate', 'commercially', 'derivative', 'airborne', 'test', 'bed', 'range', 'size', 'small', 'single', 'engine', 'large', 'turb

CREATE BIGRAMS AND TRIGRAMS

In [42]:

bigram_phrases = gensim.models.Phrases(dataWords, min_count=2, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[dataWords], threshold=100, min_count=2)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

dataBigrams = make_bigrams(dataWords)
dataFinal = make_trigrams(dataBigrams)


#remove bigrams

print(dataFinal[1])

['mission', 'provide', 'airborne', 'platform', 'support', 'specific', 'research', 'development', 'program', 'provide', 'method', 'validation', 'field', 'collect', 'datum', 'aircraft', 'fly', 'maintain', 'manage', 'professional', 'staff', 'test', 'pilot', 'certify', 'maintenance', 'technician', 'administrative', 'personnel', 'first', 'priority', 'safety', 'operation', 'conduct', 'use', 'procedure', 'equipment', 'meet', 'exceed', 'requirement', 'result', 'various', 'past', 'airborne', 'testing', 'program', 'missionspecific', 'procedure', 'develop', 'procedure', 'provide', 'safe', 'effective', 'successful', 'operation', 'create', 'support', 'early', 'airtoair', 'collision', 'avoidance', 'research', 'program', 'sponsor', 'need', 'extensive', 'airborne', 'testing', 'increase', 'expand', 'support', 'variety', 'program', 'currently', 'operate', 'experimentally', 'certificate', 'commercially', 'derivative', 'airborne', 'test', 'bed', 'range', 'size', 'small', 'single_engine', 'large', 'turbopr

In [43]:
id2word = corpora.Dictionary(dataFinal)

texts = dataFinal
corpus = [id2word.doc2bow(text) for text in texts]

'''
length = (len(corpus))
print(len(id2word))
totalCount = defaultdict(int)

for wordID, wordCount in itertools.chain.from_iterable(corpus):
    totalCount[wordID] += wordCount

sortedList = sorted(totalCount.items(), key=lambda x: x[1], reverse=True)

num = len(sortedList) * 3 // 20

topWords = []

for i in range(num):
    index = sortedList[i][0]
    topWords.append(id2word[index])
print(len(topWords))
print(topWords)



with open('topWords.csv', 'w', newline = '') as file:
    writer = csv.writer(file)
    
    for word in topWords:
        writer.writerow([word])
#within each doc, different num of words and freqs
# go through each doc, and

#every word per document is sorted by ascending order
'''


"\nlength = (len(corpus))\nprint(len(id2word))\ntotalCount = defaultdict(int)\n\nfor wordID, wordCount in itertools.chain.from_iterable(corpus):\n    totalCount[wordID] += wordCount\n\nsortedList = sorted(totalCount.items(), key=lambda x: x[1], reverse=True)\n\nnum = len(sortedList) * 3 // 20\n\ntopWords = []\n\nfor i in range(num):\n    index = sortedList[i][0]\n    topWords.append(id2word[index])\nprint(len(topWords))\nprint(topWords)\n\n\n\nwith open('topWords.csv', 'w', newline = '') as file:\n    writer = csv.writer(file)\n    \n    for word in topWords:\n        writer.writerow([word])\n#within each doc, different num of words and freqs\n# go through each doc, and\n\n#every word per document is sorted by ascending order\n"

NUMBER OF TOPICS TESTING

In [44]:
def computeCoherenceValues(corpus, id2word, k):
    ldaTest = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=k, id2word=id2word, chunksize=100, passes= 20, alpha = 'auto', random_state= 100, update_every=1)
 
    
    # * different measures
    # * c_v, umass, c_uci
    
    coherenceTest = CoherenceModel(model=ldaTest, corpus=corpus, texts=texts, dictionary= id2word, coherence='c_uci')
    coherenceScore = coherenceTest.get_coherence()
    print('Completed ' + str(k) + ' topics')
    print('Coherence Score: ', round(coherenceScore, 5))
    print('\n')
    return coherenceScore

def computePerplexityValues(corpus, id2word, k):
    ldaTest = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=k, id2word=id2word, chunksize=100, passes= 20, alpha = 'auto', random_state= 100, update_every=1)
    perplexScore = ldaTest.log_perplexity(corpus)
    print('Completed ' + str(k) + ' topics')
    print('Perplexity Value: ', perplexScore)
    print('\n')
    return perplexScore

#coherence testing
if not True:
    topicNum = []
    coherenceScore = []
    for k in range(1, 26):
        # get the coherence score for the given parameters
        cv = computeCoherenceValues(corpus=corpus, id2word=id2word, k=k)
        topicNum.append(k)   
        coherenceScore.append(cv)
        
    plt.plot(topicNum, coherenceScore, marker = 'o')
    plt.xlabel('Number of Topics')
    plt.ylabel('Values')
    plt.title('C_UCI Coherence Values of Different Number of Topics')
    plt.savefig('test.png', bbox_inches = 'tight')
        
#perplexity testing
if not True:
    topicNum = []
    perplexityScore = []
    for k in range(1, 20):
        # get the coherence score for the given parameters
        pv = computePerplexityValues(corpus=corpus, id2word=id2word, k=k)
        topicNum.append(k)   
        perplexityScore.append(pv)
        
    plt.plot(topicNum, perplexityScore, marker = 'o')
    plt.xlabel('Number of Topics')
    plt.ylabel('Values')
    plt.title('Perplexity Values of Different Number of Topics')
    plt.savefig('PerplexityChart3.png', bbox_inches = 'tight')
        
        

LDA MODEL

In [45]:
ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=9, random_state=100, 
                                           update_every=1, chunksize= 100, passes=20, alpha="auto")

# coherenceModel = CoherenceModel(model=ldaModel, texts=texts, dictionary=id2word, coherence='c_v')
# print('Coherence value of LDA model: ', round(coherenceModel.get_coherence(), 5))


VISUALIZATION

In [46]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldaModel, corpus, id2word, mds="mmds", R=20)
vis

pyLDAvis.save_html(vis, 'LDAdemo.html')