In [4]:
# import packages 
from newscollector import *
import pickle
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from transformers.pipelines import pipeline
from bertopic.representation import TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import ZeroShotClassification
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster

In [6]:
# read data 
df=pd.read_csv("data/data_w_topics.csv")
df.head()

Unnamed: 0,title,Topic,article,keywords
0,"Live updates: Israel-Hamas war rages, hostages...",israel conflict,"Bill Burns, director of the CIA, is expected t...","['expected', 'war', 'held', 'deal', 'rages', '..."
1,"January 25, 2024 Israel-Hamas war",israel conflict,"Palestinians flee Khan Younis, moving toward R...","['war', 'hamas', '25', 'sides', 'reported', 't..."
2,January 25 - 2024 campaign updates,election,Former President Donald Trump arrives for a ca...,"['campaign', '25', 'primary', 'republican', 'n..."
3,Today’s new in 10 minutes,others,"CNN —\n\nJanuary 26, 2024\n\nToday on CNN 10, ...","['cnn', 'yearly', 'average', 'minutes', 'today..."
4,U.S. and China are working to make the busines...,economy,"The flags of China, U.S. and the Chinese Commu...","['trade', 'foreign', 'rules', 'beijing', 'busi..."


In [7]:
# encoding labels since we are dealing with some form of supervised learning here. 

## remember to convert to list. 
y = list(df['Topic']) #list of articles
docs=list(df["article"])
label_encoder = LabelEncoder()
label_encoder.fit(y)

# Transform the categories to numerical labels STARTING WITH 1
numerical_labels = label_encoder.transform(y)+ 1

# Adjust the label mapping to start from 1
label_mapping = {class_: label+1 for class_, label in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}

print("Label mapping:", label_mapping)

Label mapping: {'economy': 1, 'election': 2, 'environment': 3, 'israel conflict': 4, 'others': 5}


## Supervised

In [8]:
# defining our parameters
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
representation_model = KeyBERTInspired()
embedding_model="all-MiniLM-L6-v2"
empty_dimensionality_model = BaseDimensionalityReduction() # remember, here is supervised, so no dimensionality reduction
clf = LogisticRegression() # we identify our classifier (note it is supervised) as logistic regression. 
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model
)
topics, probs = topic_model.fit_transform(docs,y=numerical_labels)

In [9]:
# Map input `y` to topics
mappings = topic_model.topic_mapper_.get_mappings()
mapping_update={key: mappings[value] for key, value in label_mapping.items()}
mapping_update={value: key for key, value in mapping_update.items()}
data = topic_model.get_topic_info()
data['mapped_topic'] = data['Topic'].replace(mapping_update)

In [10]:
mappings

{1: 0, 2: 4, 3: 2, 4: 3, 5: 1}

In [11]:
data

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,mapped_topic
0,0,30,0_markets_investment_financial_securities,"[markets, investment, financial, securities, i...","[It’s a tale of two worlds for Intel INTC, +0....",economy
1,1,17,1_nairobi_kenya_awards_nivi,"[nairobi, kenya, awards, nivi, author, taiwan,...",[Economy Penalties for illegal State actions b...,others
2,2,6,2_epa_regulation_regulations_emissions,"[epa, regulation, regulations, emissions, poli...",[Columnists Be careful with clean energy hype\...,environment
3,3,2,3_israel hamas_officials israel_hamas_journal ...,"[israel hamas, officials israel, hamas, journa...","[Bill Burns, director of the CIA, is expected ...",israel conflict
4,4,1,4_nonincumbent republican win_republican win n...,"[nonincumbent republican win, republican win n...",[Former President Donald Trump arrives for a c...,election


In [12]:
topic_model.get_topic(0)

[('markets', 0.45210153),
 ('investment', 0.43668854),
 ('financial', 0.4357568),
 ('securities', 0.428281),
 ('investors', 0.42305923),
 ('market', 0.41843542),
 ('intel', 0.411424),
 ('economy', 0.39748937),
 ('intels', 0.3917294),
 ('earnings', 0.3893836)]

In [13]:
topic_model.get_topic(1)

[('nairobi', 0.3226037),
 ('kenya', 0.31622916),
 ('awards', 0.2732029),
 ('nivi', 0.26742905),
 ('author', 0.26639372),
 ('taiwan', 0.2637592),
 ('billion', 0.26189375),
 ('yony', 0.2552439),
 ('jaboya', 0.23051254),
 ('taiwans', 0.22368357)]

In [14]:
topic_model.get_topic(2)

[('epa', 0.38069355),
 ('regulation', 0.3741216),
 ('regulations', 0.36402416),
 ('emissions', 0.32819748),
 ('policy', 0.3106893),
 ('enforcement', 0.30061418),
 ('eu countries', 0.29115203),
 ('procurement', 0.27804562),
 ('eu', 0.27637425),
 ('kenya', 0.27382788)]

In [16]:
topic_model.get_topic(3)

[('israel hamas', 0.57358),
 ('officials israel', 0.5479965),
 ('hamas', 0.5462495),
 ('journal said hamas', 0.5279148),
 ('operation southern gaza', 0.5278547),
 ('gaza', 0.52602965),
 ('palestinian prisoners', 0.51421136),
 ('palestinian prisoners held', 0.5125932),
 ('officials israel egypt', 0.51139164),
 ('know palestinians flee', 0.5068967)]

In [17]:
topic_model.get_topic(4)

[('nonincumbent republican win', 0.59576535),
 ('republican win new', 0.59168136),
 ('second nonincumbent republican', 0.58159184),
 ('winning new hampshire', 0.5335259),
 ('win new hampshire', 0.5164),
 ('nonincumbent republican', 0.51142776),
 ('nonincumbent new hampshire', 0.4993483),
 ('gop primary', 0.46652058),
 ('new hampshire', 0.45285442),
 ('iowa new hampshire', 0.44846272)]

## Manual topic Modeling

In [18]:

# Prepare our empty sub-models and reduce frequent words while we are at it.
empty_embedding_model = BaseEmbedder() # remember to specify to base so we can perform the task of manual modeling
empty_cluster_model = BaseCluster() # remember to specify this to base (ie none) so we can perform the task of manual modeling
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model
)
topics, probs = topic_model.fit_transform(docs,y=numerical_labels)

In [19]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,30,0_intel_year_said_market,"[intel, year, said, market, billion, company, ...","[It’s a tale of two worlds for Intel INTC, +0...."
1,1,17,1_taiwan_women_million_billion,"[taiwan, women, million, billion, yony, people...",[Economy Penalties for illegal State actions b...
2,2,6,2_coffee_gas_tfl_parking,"[coffee, gas, tfl, parking, euro, drivers, eur...",[Columnists Be careful with clean energy hype\...
3,3,2,3_hamas_israel_hostages_gaza,"[hamas, israel, hostages, gaza, held, israeli,...","[Bill Burns, director of the CIA, is expected ..."
4,4,1,4_new hampshire_hampshire_trump_iowa,"[new hampshire, hampshire, trump, iowa, republ...",[Former President Donald Trump arrives for a c...


In [20]:
# Map input `y` to topics
mappings = topic_model.topic_mapper_.get_mappings()
mapping_update={key: mappings[value] for key, value in label_mapping.items()}
mapping_update={value: key for key, value in mapping_update.items()}
data = topic_model.get_topic_info()
data['mapped_topic'] = data['Topic'].replace(mapping_update)

In [21]:
data

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,mapped_topic
0,0,30,0_intel_year_said_market,"[intel, year, said, market, billion, company, ...","[It’s a tale of two worlds for Intel INTC, +0....",economy
1,1,17,1_taiwan_women_million_billion,"[taiwan, women, million, billion, yony, people...",[Economy Penalties for illegal State actions b...,others
2,2,6,2_coffee_gas_tfl_parking,"[coffee, gas, tfl, parking, euro, drivers, eur...",[Columnists Be careful with clean energy hype\...,environment
3,3,2,3_hamas_israel_hostages_gaza,"[hamas, israel, hostages, gaza, held, israeli,...","[Bill Burns, director of the CIA, is expected ...",israel conflict
4,4,1,4_new hampshire_hampshire_trump_iowa,"[new hampshire, hampshire, trump, iowa, republ...",[Former President Donald Trump arrives for a c...,election


## Semi-supervised 

In [22]:
# reading the unlabelled proportion of our data. 
df2=(pd.read_csv("data/data.csv")[len(df):]).reset_index(drop=True)
df2.head

<bound method NDFrame.head of                                                 title  \
0   Superdry loses fourth finance boss in five yea...   
1   UK homeowners with mental health problems ‘spe...   
2   Car insurance: drivers paying monthly ‘face in...   
3   Indonesian fruit picker landed in debt bondage...   
4   The joy of CeX: how to spend £10 in the second...   
5   Domestic wood burners having a deadly impact i...   
6   ‘The world is changing too fast for us’: organ...   
7   Michelle Mone and the PPE Medpro investigation...   
8   Energy-efficient homes for sale in England – i...   
9   Private rents in Great Britain hit record high...   
10  Bernie Ecclestone's £650m fraud payout 'made h...   
11  Rents hit record high but signs of a slowdown ...   
12  Kaman Aerospace: Why The Take Over Is The Best...   
13  Canadian National Railway: A Great Buy On Weak...   
14  KPN Stock: The 5.4% Yield Will Increase By A 7...   
15  PEO Fund: A Buy The Dip Opportunity In This 6%...   
1

In [23]:
# labeling -1 for our unlabelled data
df2["Topic"]=[None for i in range(len(df2))]
numerical_labels=list(numerical_labels)+[-1 for i in range(len(df2))]
df=pd.concat([df,df2]).reset_index(drop=True)
docs=list(df["article"])

In [24]:
# again defining the parameters
embedding_model="all-MiniLM-L6-v2"
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model
)
topics, probs = topic_model.fit_transform(docs,y=numerical_labels)

In [25]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,48,-1_investment_stocks_investors_stock,"[investment, stocks, investors, stock, markets...",[An investor waiting for AT&T to declare a div...
1,0,30,0_markets_investment_financial_securities,"[markets, investment, financial, securities, i...","[It’s a tale of two worlds for Intel INTC, +0...."
2,1,17,1_nairobi_kenya_awards_nivi,"[nairobi, kenya, awards, nivi, author, taiwan,...",[Economy Penalties for illegal State actions b...
3,2,6,2_epa_regulation_regulations_emissions,"[epa, regulation, regulations, emissions, coff...",[Columnists Be careful with clean energy hype\...
4,3,2,3_israel hamas_officials israel_hamas_journal ...,"[israel hamas, officials israel, hamas, journa...","[Bill Burns, director of the CIA, is expected ..."
5,4,1,4_nonincumbent republican win_republican win n...,"[nonincumbent republican win, republican win n...",[Former President Donald Trump arrives for a c...


In [26]:
mappings = topic_model.topic_mapper_.get_mappings()
mappings

{-1: -1, 1: 0, 2: 4, 3: 2, 4: 3, 5: 1}

In [27]:
set(numerical_labels)

{-1, 1, 2, 3, 4, 5}

## Zero-shot Topic Modelling

In [28]:
# We define a number of topics that we know are in the documents
zeroshot_topic_list = ["Economy", "election", "environment","israel conflict","others"]

# We fit our model using the zero-shot topics
# and we define a minimum similarity. For each document,
# if the similarity does not exceed that value, it will be used
# for clustering instead.
topic_model = BERTopic(
    embedding_model="allenai/longformer-base-4096", 
    min_topic_size=5,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.95,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model
)
topics, _ = topic_model.fit_transform(docs)

No sentence-transformers model found with name C:\Users\charl/.cache\torch\sentence_transformers\allenai_longformer-base-4096. Creating a new one with MEAN pooling.


In [29]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5,-1_climate change energy_related supply disrup...,"[climate change energy, related supply disrupt...","[I'm bullish on lithium. For starters, I have ..."
1,0,27,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...
2,1,20,1_tax incentives exemptions_union workforce am...,"[tax incentives exemptions, union workforce am...",[After a portion of an Alaska Airlines Boeing ...
3,2,19,2_spokesperson_pollution_euro parking_london,"[spokesperson, pollution, euro parking, london...",[Pierre Bretagne woke at 4am to feed the cows ...
4,3,17,3_free cash flow_long term_share price_cash flow,"[free cash flow, long term, share price, cash ...",[ilbusca\n\nAll financial numbers in this arti...
5,4,16,4_nairobi_deposits_natural gas_infrastructure,"[nairobi, deposits, natural gas, infrastructur...",[Economy Nairobi collects more revenue than 30...


In [30]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Bill Burns, director of the CIA, is expected t...",1,1_tax incentives exemptions_union workforce am...,"[tax incentives exemptions, union workforce am...",[After a portion of an Alaska Airlines Boeing ...,tax incentives exemptions - union workforce am...,0.617899,False
1,"Palestinians flee Khan Younis, moving toward R...",0,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...,artificial intelligence - 15 billion - fourth ...,1.000000,False
2,Former President Donald Trump arrives for a ca...,0,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...,artificial intelligence - 15 billion - fourth ...,0.902057,False
3,"CNN —\n\nJanuary 26, 2024\n\nToday on CNN 10, ...",-1,-1_climate change energy_related supply disrup...,"[climate change energy, related supply disrupt...","[I'm bullish on lithium. For starters, I have ...",climate change energy - related supply disrupt...,0.000000,True
4,"The flags of China, U.S. and the Chinese Commu...",0,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...,artificial intelligence - 15 billion - fourth ...,1.000000,False
...,...,...,...,...,...,...,...,...
99,Intel (INTC) stock fell more than 10% in prema...,0,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...,artificial intelligence - 15 billion - fourth ...,1.000000,False
100,(Bloomberg) -- Intel Corp. tumbled in premarke...,0,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...,artificial intelligence - 15 billion - fourth ...,1.000000,True
101,"Over the last year, a big contributor to the m...",3,3_free cash flow_long term_share price_cash flow,"[free cash flow, long term, share price, cash ...",[ilbusca\n\nAll financial numbers in this arti...,free cash flow - long term - share price - cas...,0.563779,False
102,The Dow will surge 24% after the Fed's first r...,0,0_artificial intelligence_15 billion_fourth qu...,"[artificial intelligence, 15 billion, fourth q...",[(Bloomberg) -- Intel Corp. tumbled in premark...,artificial intelligence - 15 billion - fourth ...,1.000000,False


In [31]:
topic_model.get_topic(0)

[('artificial intelligence', 0.89040095),
 ('15 billion', 0.89020073),
 ('fourth quarter', 0.8841031),
 ('data center', 0.88351345),
 ('leadership', 0.88178396),
 ('economy', 0.88142896),
 ('new hampshire', 0.88131976),
 ('2024', 0.88041127),
 ('foundry', 0.87888867),
 ('marketwatch', 0.8784417)]

In [32]:
topic_model.get_topic(1)

[('tax incentives exemptions', 0.9035964),
 ('union workforce amid', 0.8962159),
 ('emerging markets', 0.8934764),
 ('incentives exemptions handed', 0.89329815),
 ('incentives exemptions', 0.8889323),
 ('social media', 0.88751465),
 ('alaska airlines', 0.8864667),
 ('include management administrative', 0.8864326),
 ('strong support', 0.8847857),
 ('union workforce', 0.88338476)]

In [33]:
topic_model.get_topic(2)

[('spokesperson', 0.8965019),
 ('pollution', 0.89487827),
 ('euro parking', 0.89219743),
 ('london', 0.88982636),
 ('parking', 0.8889665),
 ('costs', 0.8879312),
 ('2024', 0.8871969),
 ('privacy', 0.88706964),
 ('finance', 0.88648313),
 ('cookies', 0.88515997)]

In [34]:
topic_model.get_topic(3)

[('free cash flow', 0.9062358),
 ('long term', 0.8974307),
 ('share price', 0.8969448),
 ('cash flow', 0.8959423),
 ('corporation', 0.89566505),
 ('diversification', 0.8943981),
 ('free cash', 0.8935885),
 ('ntt corporation', 0.8889551),
 ('investment', 0.8880136),
 ('revenues', 0.88736236)]

In [35]:
topic_model.get_topic(4)

[('nairobi', 0.8903827),
 ('deposits', 0.88598293),
 ('natural gas', 0.88347024),
 ('infrastructure', 0.8816166),
 ('2024', 0.88074094),
 ('costs', 0.88072884),
 ('eurodollar', 0.8804434),
 ('africa', 0.8799159),
 ('investments', 0.8797959),
 ('patients', 0.87968594)]