In [2]:
import pandas as pd

In [24]:
import spacy
from pprint import pprint

In [3]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
import nltk
# Fetch the NLTK 'stopwords' corpus used by the stop-word filter further down;
# per the captured output this is skipped when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Merl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook runs on other machines.
file_path = "E:/ADB_Project/code/data/pipeline_sample.csv"
# Read the sample CSV into a DataFrame.
df = pd.read_csv(file_path)

### Tokenize words

In [6]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of sentences to tokenize.

    Yields:
        The ``simple_preprocess`` result for each sentence, in input order.
    """
    yield from map(gensim.utils.simple_preprocess, sentences)

### Stopword Removal

In [18]:
def remove_stopwords(texts):
    """Remove NLTK English stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with gensim's
    ``simple_preprocess`` before filtering, exactly as before; only the
    stop-word container changed.

    Args:
        texts: Iterable of documents (the notebook passes lists of tokens).

    Returns:
        A list containing, for each input document, the list of tokens that
        are not stop words.
    """
    # A frozenset gives O(1) membership checks; the raw NLTK list was being
    # scanned linearly once per token.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

### Apply Bigram Phrase Model

In [8]:
def make_bigrams(tokens, bigram):
    """Apply a phrase model to every document in ``tokens``.

    Args:
        tokens: Iterable of tokenized documents.
        bigram: Phrase model passed to ``gensim.models.phrases.Phraser``.

    Returns:
        A list with the ``Phraser``-transformed version of each document.
    """
    phraser = gensim.models.phrases.Phraser(bigram)

    transformed = []
    for doc in tokens:
        transformed.append(phraser[doc])
    return transformed

In [12]:
# Load the spaCy 'en_core_web_sm' pipeline with the parser and NER components
# disabled; the lemmatization helper below only reads `lemma_` and `pos_`.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(tokens, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Args:
        tokens: Iterable of token sequences; each one is joined with single
            spaces and run through the module-level ``nlp`` pipeline.
        allowed_postags: ``pos_`` values a token must have to be kept.

    Returns:
        A list with one list of lemmas per input document.
    """
    def _lemmas_for(sent):
        # Re-assemble the tokens into text so the pipeline can tag them.
        parsed = nlp(" ".join(sent))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas_for(sent) for sent in tokens]

---

In [14]:
# NOTE(review): this repeats the identical CSV load from the earlier cell and
# uses the same absolute, machine-specific path — one load should be enough.
file_path = "E:/ADB_Project/code/data/pipeline_sample.csv"
df = pd.read_csv(file_path)

In [19]:
# Tokenize every entry of the "clean" column into a list of tokens.
tokens = list(sent_to_words(list(df["clean"])))
# Drop stop words from each token list.
tokens_nostops = remove_stopwords(tokens)

# Fit the phrase model, then apply it to the stop-word-filtered documents.
# NOTE(review): the model is trained on `tokens` (stop words included) but
# applied to `tokens_nostops` — confirm this mismatch is intended.
bigrams = gensim.models.Phrases(tokens, min_count=3, threshold=10)
tokens_bigrams = make_bigrams(tokens_nostops, bigrams)

In [20]:
# Lemmatize the phrase-merged documents using the default allowed POS tags.
tokens_lemmatized = lemmatization(tokens_bigrams)

In [21]:
# Build the gensim Dictionary from the lemmatized documents.
id2word = corpora.Dictionary(tokens_lemmatized)

# The documents to convert are the lemmatized token lists themselves.
texts = tokens_lemmatized

# Convert each document to its bag-of-words form via the dictionary.
corpus = [id2word.doc2bow(text) for text in texts]

In [22]:
# Build a 5-topic LDA model over the bag-of-words corpus.
# random_state is fixed, presumably so repeated runs give the same topics.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=10,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [25]:
# Print the weighted keywords of each topic (the model above is built with
# num_topics=5, not 20).
pprint(lda_model.print_topics())

[(0,
  '0.015*"experience" + 0.010*"profile" + 0.009*"knowledge" + 0.009*"system" + '
  '0.009*"talent_search" + 0.006*"candidate" + 0.006*"client" + 0.006*"make" + '
  '0.006*"attractive_remuneration" + 0.006*"opportunity"'),
 (1,
  '0.022*"experience" + 0.012*"work" + 0.010*"skill" + 0.009*"knowledge" + '
  '0.007*"requirement" + 0.007*"ability" + 0.007*"test" + 0.007*"technology" + '
  '0.007*"business" + 0.006*"tool"'),
 (2,
  '0.015*"experience" + 0.010*"development" + 0.009*"work" + 0.008*"service" + '
  '0.007*"knowledge" + 0.006*"software" + 0.006*"cloud" + 0.006*"design" + '
  '0.006*"team" + 0.006*"solution"'),
 (3,
  '0.011*"experience" + 0.006*"work" + 0.005*"look" + 0.005*"server" + '
  '0.005*"system" + 0.004*"year" + 0.004*"holding" + 0.004*"good" + '
  '0.003*"design" + 0.003*"development"'),
 (4,
  '0.019*"experience" + 0.011*"team" + 0.009*"skill" + 0.007*"oracle" + '
  '0.007*"product" + 0.007*"design" + 0.006*"web" + 0.006*"work" + '
  '0.005*"development" + 0.005*"