In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import random
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

In [5]:
# Read the article corpus and split it 70/30 with a fixed seed.
# train_test_split returns (train, test): the 70% part is kept as `df_partie1`,
# while the 30% part becomes `data`, the frame used by the rest of the notebook.
df_partie1, data = train_test_split(
    pd.read_csv('Data_set_contient_12K_article.csv'),
    test_size=0.3,
    random_state=42,
)
# Persist the 30% split (without the index) so it can be reloaded later.
data.to_csv('dataset_.csv', index=False)

In [8]:
# Build a bag-of-words document-term matrix from the 'Article' column.
#   max_df=0.95 -> drop terms present in more than 95% of the documents
#   min_df=2    -> drop terms present in fewer than 2 documents
#   stop_words  -> drop scikit-learn's built-in English stop words
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
document_term_matrix = vectorizer.fit_transform(data['Article'])
# Each matrix column is one vocabulary term, so the column count is the
# number of unique features kept by the vectorizer.
document_term_matrix.shape[1]


32330

In [9]:
# Print 10 randomly selected words from the CountVectorizer vocabulary.
# The index bound is derived from the vocabulary itself instead of a
# hard-coded number, so every term can be drawn and no index can run past
# the end of the array if the vocabulary size changes.
feature_names = vectorizer.get_feature_names_out()  # built once, not per iteration
for i in range(10):
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

287
lópez
lukewarm
raise
convinced
lawful
alarmists
places
obligated
mercedes


In [10]:
# Train a 7-topic Latent Dirichlet Allocation model on the document-term
# matrix; the fixed random_state keeps the fitted topics reproducible.
LDA = LatentDirichletAllocation(random_state=42, n_components=7)
LDA.fit(X=document_term_matrix)

In [11]:
# Inspect the fitted topic-word weight matrix: one row per topic, one column
# per vocabulary term. Only a cell's last expression is displayed, so this
# cell contains just the matrix itself.
LDA.components_

array([[2.32410600e+00, 1.06272939e+02, 1.42857178e-01, ...,
        1.01426076e+01, 1.30561702e+01, 1.44036872e-01],
       [2.27941802e+00, 3.77625074e+02, 1.43177748e-01, ...,
        1.43275929e-01, 1.43566476e-01, 1.42857152e-01],
       [4.40573324e+00, 8.13159692e+01, 1.45199280e-01, ...,
        1.43026829e-01, 1.43220521e-01, 3.14288327e+00],
       ...,
       [5.69524848e+00, 4.05371548e+02, 2.10627284e+00, ...,
        1.42857147e-01, 3.44652036e+00, 1.42857154e-01],
       [3.00833700e+00, 1.11430785e+02, 1.44110655e-01, ...,
        1.42947302e-01, 1.43020287e-01, 1.13655668e+00],
       [1.43209988e-01, 4.62518361e+02, 1.75525087e-01, ...,
        6.14241289e+00, 1.69234358e+01, 1.47951711e-01]])

In [12]:
# (number of topics, number of vocabulary terms)
LDA.components_.shape

(7, 32330)

In [13]:
# Take the weight row of the first topic and list the vocabulary indices in
# ascending order of weight: least important word first, most important last.
single_topic = LDA.components_[0, :]
single_topic.argsort()

array([ 8100, 25524, 19052, ..., 22389, 25206, 29888], dtype=int64)

In [15]:
# Print the 10 highest-weighted words of the first topic.
# argsort() is ascending, so the last 10 indices are the top 10 and the
# strongest word is printed last.
feature_names = vectorizer.get_feature_names_out()  # built once, reused for every lookup
for index in single_topic.argsort()[-10:]:
    print(feature_names[index])

russia
administration
campaign
says
new
obama
house
president
said
trump


In [16]:
# Print the 15 highest-weighted words for each topic found by the LDA model.
# Each list is in ascending order of weight (strongest word last).
# The vocabulary array is fetched once up front instead of being rebuilt for
# every single word lookup inside the loop.
feature_names = vectorizer.get_feature_names_out()
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['country', 'committee', 'security', 'white', 'government', 'russia', 'administration', 'campaign', 'says', 'new', 'obama', 'house', 'president', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #1
['year', 'insurance', 'new', 'states', 'students', 'school', 'care', 'federal', 'court', 'state', 'health', 'law', 'said', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #2
['sanders', 'don', 'know', 'voters', 'party', 'going', 'campaign', 'women', 'just', 'like', 'think', 'clinton', 'people', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #3
['officer', 'shooting', 'city', 'video', 'time', 'team', 'news', 'officers', 'gun', 'case', 'people', 'told', 'says', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['npr', 'killed', 'told', '000', 'country', 'attack', 'war', 'according', 'military', 'city', 'government', 'reports', 'says', 'people', 'said']


THE TOP 15 WORDS FOR TOPIC #5
['make', 'think', 'world', 'music', 'really', 'years', 'life', 'know', 'way', 'people'

In [17]:
# Project every document onto the fitted topics: each row of the result is
# one document, each column the weight of one topic for that document.
topic_results = LDA.transform(X=document_term_matrix)
# (number of documents, number of topics)
topic_results.shape

(3598, 7)

In [20]:
# Topic weights of the first document, rounded to one decimal for readability.
topic_results[0].round(decimals=1)

array([0. , 0.8, 0. , 0. , 0. , 0.1, 0.1])

In [21]:
# For every document (row), pick the column index with the largest weight,
# i.e. the id of its most likely topic.
topic_results.argmax(axis=-1)

array([1, 1, 1, ..., 4, 4, 3], dtype=int64)

In [22]:
# Label each article with the id of its highest-weighted topic, then preview
# the first 10 labelled rows.
dominant_topic = topic_results.argmax(axis=1)
data['Topic'] = dominant_topic
data.head(n=10)

Unnamed: 0,Article,Topic
10413,The unemployment rate for transgender people i...,1
1369,Lawyers for a transgender student and the G...,1
1718,The man at the heart of the legal resistance t...,1
7517,In the 23 years that Starflyer 59 has been a b...,5
360,It was the first day of school for Dan Lear’s ...,5
10372,Americans can watch a documentary about China ...,5
2232,"Earlier this winter, photographer Michael Furt...",6
10698,Alice Callaghan has spent decades working with...,1
10953,Editor’s Note: NPR’s Kara Frame made this shor...,5
10010,We don’t really know what Donald Trump paid in...,0
