In [1]:
import spacy
from pprint import pprint

import pandas as pd
import nltk
nltk.download('stopwords')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Merl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# spaCy English pipeline used for lemmatization; the dependency parser and
# the named-entity recognizer are disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

# CSV file that holds the corpus.
file_path = "D:/nlp/top_jobs_cs_20_21/part_1/part_1a/p1a.csv"

In [3]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of lowercase words.

    Generator: yields one token list per input sentence, produced by
    gensim's ``simple_preprocess``.
    """
    yield from (gensim.utils.simple_preprocess(text) for text in sentences)

In [4]:
def remove_stopwords(texts, stop_words=nltk.corpus.stopwords.words('english')):
    """Drop stopwords from every document.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each document is converted with ``str()`` and
        re-tokenized with ``simple_preprocess`` before filtering, so raw
        strings and already-tokenized lists are both accepted.
    stop_words : iterable of str
        Words to discard. Defaults to NLTK's English stopword list, which is
        loaded once, when this function is defined.

    Returns
    -------
    list of list of str
        One filtered token list per input document.
    """
    # A set gives constant-time membership checks; with the list every single
    # token triggered a linear scan over all stopwords.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

In [5]:
def make_bigrams(tokens):
    """Fit a bigram model on the corpus and apply it to every document.

    A gensim ``Phrases`` model is trained on ``tokens`` (min_count=3,
    threshold=10), wrapped in a ``Phraser`` and then used to transform each
    document. Returns one transformed document per input document.
    """
    phrase_model = gensim.models.Phrases(tokens, min_count=3, threshold=10)
    phraser = gensim.models.phrases.Phraser(phrase_model)

    transformed = []
    for document in tokens:
        transformed.append(phraser[document])
    return transformed

In [6]:
def lemmatization(tokens, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize every document, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    tokens : iterable of list of str
        Tokenized documents. Each one is re-joined with single spaces and
        processed by the module-level spaCy pipeline ``nlp``.
    allowed_postags : sequence of str
        Part-of-speech tags (compared against ``token.pos_``) to keep.
        The default list is only read, never mutated.

    Returns
    -------
    list of list of str
        Lemmas of the retained tokens, one list per input document, in input
        order.
    """
    texts = (" ".join(sent) for sent in tokens)
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the full pipeline once per document; results come back in the
    # same order as the inputs.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]

In [7]:
def prep_data(df):
    """Run the text-preparation pipeline on ``df["clean_text"]``.

    Steps, in order: tokenize, remove stopwords, apply bigrams, lemmatize.
    Returns the lemmatized token lists.
    """
    raw_texts = list(df["clean_text"])
    tokenized = list(sent_to_words(raw_texts))
    return lemmatization(make_bigrams(remove_stopwords(tokenized)))

In [60]:
def build_topic_models(tokens, num_topics=4, random_state=100, passes=10):
    """Train an LDA topic model on tokenized documents.

    Parameters
    ----------
    tokens : iterable of list of str
        Tokenized documents. Materialised into a list first because the data
        is traversed twice (dictionary, then bag-of-words corpus); a one-shot
        iterator would otherwise produce an empty corpus.
    num_topics : int
        Number of topics to fit (default 4).
    random_state : int
        Seed forwarded to ``LdaModel`` (default 100).
    passes : int
        Value forwarded to ``LdaModel``'s ``passes`` argument (default 10).

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The trained model.
    """
    tokens = list(tokens)

    # Create dictionary
    id2word = corpora.Dictionary(tokens)

    # Term Document Frequency from dictionary
    corpus = [id2word.doc2bow(doc) for doc in tokens]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        update_every=10,
        chunksize=100,
        passes=passes,
        alpha='auto',
        per_word_topics=True,
    )

    return lda_model

---

In [61]:
# Same value as the file_path assigned in the setup cell near the top of the
# notebook.
file_path = "D:/nlp/top_jobs_cs_20_21/part_1/part_1a/p1a.csv"

In [74]:
def main(file_path = "D:/nlp/top_jobs_cs_20_21/part_1/part_1a/p1a.csv",
         sample_size=500, random_state=100):
    """Read a CSV, prepare the text, train the topic model and print the topics.

    Parameters
    ----------
    file_path : str
        CSV file to read; the code uses its ``clean_text`` and
        ``clean_accuracy`` columns.
    sample_size : int
        Maximum number of rows to sample before filtering (default 500).
    random_state : int or None
        Seed for the row sample so a re-run draws the same rows; pass
        ``None`` for an unseeded sample.

    Returns
    -------
    None
        The top 5 words of every topic are pretty-printed.
    """
    df = pd.read_csv(file_path)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the frame's length.
    df = df.sample(min(sample_size, len(df)), random_state=random_state)

    df = df[df["clean_accuracy"] > 0]

    tokens_lemmatized = prep_data(df)

    lda_model = build_topic_models(tokens_lemmatized)

    # print_topics returns the topics (and writes them to the log); inside a
    # function that return value is not displayed, so print it explicitly.
    pprint(lda_model.print_topics(num_words=5))

In [75]:
# Run the whole pipeline on the default CSV path.
main()

In [65]:
# Load the corpus, sample at most 200 rows, then keep only the rows whose
# clean_accuracy is positive.
df = pd.read_csv(file_path)
# Fixed seed so a re-run draws the same rows; min() guards against files with
# fewer than 200 rows, for which sample() would raise ValueError.
df = df.sample(min(200, len(df)), random_state=100)
df = df[df["clean_accuracy"]>0]

In [66]:
#df.head()

In [67]:
# Tokenize, remove stopwords, make bigrams and lemmatize the sampled rows.
tokens_lemmatized = prep_data(df)

In [68]:
# Build the gensim dictionary from the lemmatized documents.
id2word = corpora.Dictionary(tokens_lemmatized)

In [69]:
# Convert every lemmatized document to its bag-of-words representation.
corpus = list(map(id2word.doc2bow, tokens_lemmatized))

In [72]:
# Train a 7-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=7,
    alpha='auto',
    passes=10,
    chunksize=100,
    update_every=10,
    random_state=100,
    per_word_topics=True,
)

In [73]:
# Show the top weighted terms of every topic (rendered as the cell output).
lda_model.print_topics()

[(0,
  '0.019*"experience" + 0.007*"development" + 0.007*"security" + 0.006*"tool" + 0.006*"technology" + 0.006*"work" + 0.005*"skill" + 0.005*"application" + 0.005*"testing" + 0.005*"knowledge"'),
 (1,
  '0.013*"client" + 0.009*"requirement" + 0.009*"business" + 0.008*"management" + 0.008*"knowledge" + 0.008*"solution" + 0.005*"system" + 0.005*"company" + 0.005*"basic_knowledge" + 0.005*"tool_apply"'),
 (2,
  '0.021*"experience" + 0.008*"development" + 0.008*"work" + 0.007*"skill" + 0.006*"solution" + 0.006*"technology" + 0.005*"knowledge" + 0.005*"net" + 0.005*"design" + 0.005*"project"'),
 (3,
  '0.005*"eee" + 0.005*"ssrs" + 0.004*"experience" + 0.004*"brand" + 0.004*"ete" + 0.004*"web" + 0.003*"technology" + 0.003*"colombo" + 0.003*"development" + 0.003*"email"'),
 (4,
  '0.013*"team" + 0.012*"group" + 0.010*"business" + 0.008*"service" + 0.008*"market" + 0.006*"work" + 0.006*"skill" + 0.006*"experience" + 0.006*"support" + 0.006*"system"'),
 (5,
  '0.016*"eee" + 0.010*"experience"