In [2]:
import spacy
from pprint import pprint

import pandas as pd
import nltk
nltk.download('stopwords')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Merl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Location of the input CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
file_path = "D:/nlp/top_jobs_cs_20_21/part_1/part_1a/p1a.csv"
# Load the small English spaCy pipeline with the parser and NER components
# disabled; lemmatization() below only reads token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [6]:
# Read the input table from the CSV at file_path.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks
# like a row index that was written to the file — consider index_col=0; confirm.
df = pd.read_csv(file_path)

In [7]:
# Preview the first three rows of the loaded table.
df.head(3)

Unnamed: 0,vacancy_id,file_path,ocrd_text,clean_text,plain_accuracy,clean_accuracy
0,660381,D:/nlp/top_jobs_cs_20_21/part_1/part_1a/660381...,\n\n \n\n \n\n= Tee 8\n\nSenior Software En...,Tee 8 Senior Software Engineer PHP/Confluence...,0.765823,0.845161
1,660386,D:/nlp/top_jobs_cs_20_21/part_1/part_1a/660386...,\n\n \n\n \n\n= Tee 8\n\nSenior Software En...,Tee 8 Senior Software Engineer PHP/Confluence...,0.765823,0.845161
2,660403,D:/nlp/top_jobs_cs_20_21/part_1/part_1a/660403...,Rea\n\nunain WE'RE HIRING!\n\n \n\nTEST AUTOMA...,Rea unain WERE HIRING TEST AUTOMATION ENGINEER...,0.666667,0.73913


### Tokenize words

In [8]:
def sent_to_words(sentences):
    """Lazily tokenize each document in ``sentences``.

    Yields one result of ``gensim.utils.simple_preprocess`` per input item,
    in input order. Nothing is tokenized until the generator is consumed.
    """
    yield from map(gensim.utils.simple_preprocess, sentences)

### Stopword Removal

In [9]:
def remove_stopwords(texts):
    """Remove NLTK English stopwords from every document in ``texts``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. A document that is already a list or tuple of
        tokens is filtered as-is (tokens are assumed to be normalized
        already). Any other value is converted with ``str()`` and tokenized
        with ``simple_preprocess`` first.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # A set gives constant-time membership tests; the raw NLTK list would be
    # scanned linearly for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    filtered = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenized: do not round-trip through str(doc). That
            # re-tokenizes the list's repr and re-applies simple_preprocess's
            # token-length limits (max_len defaults to 15 in gensim), which
            # would silently discard long tokens such as joined bigrams.
            words = doc
        else:
            words = simple_preprocess(str(doc))
        filtered.append([word for word in words if word not in stop_words])
    return filtered

### Build Bigram Models

In [10]:
def make_bigrams(tokens, bigram):
    """Apply a trained phrase model to every tokenized document.

    ``bigram`` is wrapped in ``gensim.models.phrases.Phraser`` and each
    document in ``tokens`` is passed through it. Returns a list with one
    transformed document per input document, in input order.
    """
    phraser = gensim.models.phrases.Phraser(bigram)

    merged_docs = []
    for document in tokens:
        merged_docs.append(phraser[document])
    return merged_docs

In [11]:
def lemmatization(tokens, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and processed by the
    module-level spaCy pipeline ``nlp``. For every resulting token whose
    ``pos_`` is in ``allowed_postags`` the ``lemma_`` is kept.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    tokens : iterable of iterable of str
        Tokenized documents.
    allowed_postags : container of str
        Coarse part-of-speech tags to keep. The default is an immutable tuple
        so the shared default value cannot be modified between calls.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(sent) for sent in tokens)
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the Docs in input order, instead of one separate nlp(...) call per text.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

---

In [22]:
# Keep only the rows whose plain_accuracy is not 0. A boolean mask selects
# them directly and rebinds df to the filtered result, rather than building a
# filtered frame just to collect its index and dropping in place.
df = df.loc[df['plain_accuracy'] != 0]

In [23]:
# Tokenize every clean_text document, then filter out English stopwords.
tokens = list(sent_to_words(df["clean_text"].tolist()))
tokens_nostops = remove_stopwords(tokens)

In [24]:
# Fit the phrase model on the unfiltered tokens, then merge the detected
# bigrams in the stopword-filtered documents.
# NOTE(review): the model is trained on `tokens` but applied to
# `tokens_nostops` — confirm that this mismatch is intentional.
bigrams = gensim.models.Phrases(tokens, min_count=3, threshold=10)
tokens_bigrams = make_bigrams(tokens_nostops, bigrams)

In [25]:
# Lemmatize the bigram-merged documents, keeping lemmatization()'s default
# parts of speech (NOUN, ADJ, VERB, ADV).
tokens_lemmatized = lemmatization(tokens_bigrams)

In [26]:
# The lemmatized documents are the training texts.
texts = tokens_lemmatized

# Vocabulary built from the training texts.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))

In [27]:
# Fit a 5-topic LDA model on the bag-of-words corpus; random_state is fixed
# so repeated runs use the same seed.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=10,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [28]:
# Print the top keywords of every topic in the fitted model (5 topics here).
# NOTE(review): topic 4 in the output below consists of short fragments
# ("eee", "ae", "ee", ...) that look like OCR noise — consider filtering such
# tokens before training.
pprint(lda_model.print_topics())

[(0,
  '0.042*"experience" + 0.017*"knowledge" + 0.015*"development" + '
  '0.011*"information_technology" + 0.010*"service" + 0.010*"design" + '
  '0.009*"skill" + 0.009*"application" + 0.008*"work" + 0.008*"year"'),
 (1,
  '0.032*"experience" + 0.015*"software" + 0.013*"work" + 0.013*"knowledge" + '
  '0.012*"development" + 0.011*"skill" + 0.010*"team" + 0.008*"year" + '
  '0.007*"solution" + 0.007*"technology"'),
 (2,
  '0.011*"business" + 0.010*"system" + 0.010*"support" + 0.009*"work" + '
  '0.009*"team" + 0.008*"experience" + 0.007*"project" + 0.007*"management" + '
  '0.006*"requirement" + 0.005*"development"'),
 (3,
  '0.006*"work" + 0.006*"ability" + 0.006*"test" + 0.006*"design" + '
  '0.005*"team" + 0.004*"social_media" + 0.003*"content" + 0.003*"knowledge" + '
  '0.003*"experience" + 0.003*"ensure"'),
 (4,
  '0.042*"eee" + 0.016*"ae" + 0.014*"ee" + 0.013*"cee" + 0.013*"see" + '
  '0.011*"eae" + 0.008*"ea" + 0.007*"ec" + 0.007*"ce" + 0.007*"ree"')]
