In [72]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import spacy
import pandas as pd
import re

import warnings
warnings.filterwarnings('ignore')

In [73]:
df = pd.read_csv('/Users/georgengyn/Desktop/Big Data Lab/raw data/testData3.csv')
docs = df['description']

docs = [re.sub(r'http\S+', '', doc) for doc in docs if isinstance(doc, str)]

docs[1:10]

["contour is a long-established aviation services company with diverse capabilities including scheduled airline operations, part 91/135 aircraft management, on-demand charter, fixed-base operations services, an far part 145/easa certified aircraft maintenance facility providing airframe maintenance, avionics installations/repairs, component overhaul services and far 141 pilot training. our fleet of approximately 22 aircraft operates 15,000 flight segments annually and the company is now ranked as one of the largest part 135 operators in the country.\nat contour, our core values are at the heart of everything that we do and every decision that we make. constant throughout all lines of business is a commitment to integrity and safety. our team members display an attitude of excellence and the ability to think like a customer. we recognize that there is strength in unity and work together as a team to accomplish our ambitious goals.\ncontour is an equal opportunity employer. all qualified

In [39]:
def removePunctuation(data):
    
    data = [doc.replace('/', ' ') for doc in data if isinstance(doc, str)]
    data = [re.sub(r'[^\w\s]', ' ', doc) for doc in data]
    
    return data
    
docs = removePunctuation(docs)
docs[:10]


['pilot in command  äì challenger 300 350\n\nfull time   scottsdale  az\njet linx is seeking qualified pilot in command candidates for our heavy jet aircraft for part 91 and 135 flight operations  it is expected that the candidate lives  or be willing to relocate  within a 2 hour call out time of their home base \nposition summary\nthe pilot in command  pic  reports directly to the base chief pilot  the pic prepares and ensures the safe and efficient operation of company aircraft in accordance with the federal aviation regulations and company procedures \nminimum requirements\ntotal time   3500 hours\ntotal pic   2000 hours\ntotal time in type   250 hours\ntotal pic time in type   250 hours\ntotal multi engine   1500 hours\ntotal multi engine pic   500 hours\ntotal turbine   500 hours\ntotal instrument   300 hours\nstrong customer service and communication skills\nproactive decision making skills\npreferred qualifications include \ninitial or recurrent 142 training in type within the l

In [74]:
def lemmatization(data, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm")
    texts_out = []
    for document in data:
        doc = nlp(document)
        new_text = []
        
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
                
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)

docs = lemmatization(docs)

docs[:10]

['pilot command äì challenger full time scottsdale jet linx seek qualified pilot command candidate heavy jet aircraft part flight operation expect candidate live willing relocate hour call time home base position summary pilot command pic report directly base chief pilot pic prepare ensure safe efficient operation company aircraft accordance federal aviation regulation company procedure minimum requirement total time hour total pic hour total time type hour total pic time type hour total multi - engine hour total multi - engine pic hour total turbine hour total instrument hour strong customer service communication skill proactive decision make skill preferred qualification include initial recurrent training type last month also prefer other requirement ability obtain driver license proof eligibility work current passport atp ability obtain atp 1st class restrict radio license compensation offer competitive compensation as well range benefit include medical dental vision insurance pto a

In [None]:
def lemmatize(doc):
    lemmatizer = WordNetLemmatizer()
    thing = " ".join(lemmatizer.lemmatize(word) for word in doc.lower().split() if len(word) > 3)
    
    return thing


docs = [lemmatize(doc) for doc in docs]  
docs[:10] 

In [135]:
representation_model = KeyBERTInspired() #improve topic representation
umap_model = UMAP( #reproduce same results
    n_neighbors=15, 
    n_components=5, 
    min_dist=0.0, 
    metric='cosine', 
    random_state=42
)
vectorizer_model = CountVectorizer(stop_words="english")

model = BERTopic(
    representation_model=representation_model, 
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    embedding_model= "all-mpnet-base-v2",
    verbose = True,
    nr_topics=51,
)

In [136]:
topics, probs = model.fit_transform(docs)

model.get_topic_info()

2024-02-22 12:23:51,710 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

2024-02-22 12:26:47,985 - BERTopic - Embedding - Completed ✓
2024-02-22 12:26:47,992 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-22 12:26:58,699 - BERTopic - Dimensionality - Completed ✓
2024-02-22 12:26:58,700 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-22 12:26:58,750 - BERTopic - Cluster - Completed ✓
2024-02-22 12:26:58,750 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-22 12:27:42,598 - BERTopic - Representation - Completed ✓
2024-02-22 12:27:42,646 - BERTopic - Topic reduction - Reducing number of topics
2024-02-22 12:28:21,558 - BERTopic - Topic reduction - Reduced number of topics from 84 to 51


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,403,-1_airline_faa_southwest_aviation,"[airline, faa, southwest, aviation, employer, ...",[department flight operation company promise c...
1,0,205,0_hire_employment_qualification_management,"[hire, employment, qualification, management, ...",[highlight new hire bonus direct entry captain...
2,1,175,1_aviation_crewmember_aircraft_faa,"[aviation, crewmember, aircraft, faa, pilot, m...",[purpose position executive jet management ber...
3,2,132,2_aviation_requirement_pilot_qualification,"[aviation, requirement, pilot, qualification, ...",[position summary cutter aviation have immedia...
4,3,109,3_aviation_flight_pilot_aircraft,"[aviation, flight, pilot, aircraft, faa, fly, ...",[work exciting endeavor serious commitment bri...
5,4,103,4_pilot_aviation_faa_qualification,"[pilot, aviation, faa, qualification, requirem...",[purpose role serve emb-120 second command ber...
6,5,79,5_employment_aviation_staffing_vacancy,"[employment, aviation, staffing, vacancy, recr...",[want part team aviation innovator help change...
7,6,61,6_hire_pilot_aircraft_job,"[hire, pilot, aircraft, job, plane, flight, cr...",[job description now hire king air pic pilot p...
8,7,55,7_faa_pilot_flight_requirement,"[faa, pilot, flight, requirement, employer, em...",[description immediate opening base sic part o...
9,8,51,8_medflight_aviation_pilot_aircraft,"[medflight, aviation, pilot, aircraft, flight,...",[welcome mission traverse sky manner earn trus...


In [137]:
documents = pd.DataFrame(
    {"Document": docs,
    "ID": range(len(docs)),
    "Topic": topics}
)
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()

tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = Dictionary(tokens)
topic_words = [[words for words, _ in model.get_topic(topic)] for topic in range(len(set(topics))-1)]


In [138]:
score = CoherenceModel(
    topics = topic_words,  
    texts=tokens, 
    dictionary=dictionary, 
    coherence='u_mass',
    topn = 10,
)

print('Coherence Score: ', round(score.get_coherence(), 5))

Coherence Score:  -0.46817


In [None]:
model.visualize_barchart(
    title="Topics",
    top_n_topics= 20,
    n_words = 10
)