In [11]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from joblib import dump

In [12]:
# carregando o arquivo com as diversas noticias
noticia = "texts_reuters.txt"

In [13]:
# Lendo o arquivo
with open(noticia, 'r', encoding='utf-8') as file:
    content = file.readlines()
    
# Removendo espaços em branco extras e novas linhas
textos = [texto.strip() for texto in textos if texto.strip()]

In [14]:
# Criar um DataFrame a partir do conteúdo
data = pd.DataFrame(content, columns=['text'])

In [15]:
# Converter textos para uma matriz TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['text'])

# Inicializando e executando o K-Means
kmeans = KMeans(n_clusters=10, max_iter=10, random_state=42)
kmeans.fit(tfidf_matrix)

# Salvar o modelo para uso futuro
dump(kmeans, 'modelo_kmeans.joblib')

# Obter os centróides e etiquetas dos clusters
centroids = kmeans.cluster_centers_
labels = kmeans.labels_

# Vincular as etiquetas de cluster de volta aos dados originais para análise
data['cluster'] = labels
cluster_summary = data.groupby('cluster').apply(lambda df: df.sample(1))['text'].tolist()

cluster_summary

  super()._check_params_vs_input(X, default_n_init=10)


['Alexander Vinnik, a 38 year old Russian man suspected of running a money laundering operation using bitcoin, is escorted by police officers to a court in Athens, Greece December 13, 2017. REUTERS/Costas Baltas/File Photo Purchase Licensing Rights\n',
 'May 3, 20243:35 PM GMT-3Updated 18 hours ago\n',
 'The lawyer, Arkady Bukh, said that as a result of the plea bargain he now expected Vinnik to get a prison term of less than 10 years.\n',
 'By David Randall\n',
 "Argentina's President Javier Milei speaks during a business event in Buenos Aires, Argentina, March 26, 2024. REUTERS/Agustin Marcarian/File Photo Purchase Licensing Rights\n",
 "But stubbornly strong inflation has eroded prospects of rate cuts this year, and small cap stocks have suffered as a result. The Russell 2000 is up just 0.4% year-to-date, far less than the S&P 500's 7.5% gain. Earnings are also expected to be shaky, giving investors little reason to shift allocations from larger companies and other, less risky parts

In [4]:
# Vetorizando os textos com stop words em português
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(textos)

In [5]:
# Aplicando o K-means
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [6]:
# Mostrando os clusters
cluster_labels = kmeans.labels_
for i, label in enumerate(cluster_labels):
    print(f"Documento {i+1}, Cluster {label}")

Documento 1, Cluster 0
Documento 2, Cluster 1
Documento 3, Cluster 1
Documento 4, Cluster 0
Documento 5, Cluster 1
Documento 6, Cluster 1
Documento 7, Cluster 1
Documento 8, Cluster 1
Documento 9, Cluster 0
Documento 10, Cluster 0
Documento 11, Cluster 1
Documento 12, Cluster 0
Documento 13, Cluster 1
Documento 14, Cluster 0
Documento 15, Cluster 0
Documento 16, Cluster 0
Documento 17, Cluster 0
Documento 18, Cluster 0
Documento 19, Cluster 0
Documento 20, Cluster 0
Documento 21, Cluster 0
Documento 22, Cluster 0
Documento 23, Cluster 0
Documento 24, Cluster 0
Documento 25, Cluster 0
Documento 26, Cluster 0
Documento 27, Cluster 0
Documento 28, Cluster 0
Documento 29, Cluster 0
Documento 30, Cluster 0
Documento 31, Cluster 0
Documento 32, Cluster 0
Documento 33, Cluster 0
Documento 34, Cluster 0
Documento 35, Cluster 0
Documento 36, Cluster 0
Documento 37, Cluster 0
Documento 38, Cluster 0
Documento 39, Cluster 0
Documento 40, Cluster 0
Documento 41, Cluster 0
Documento 42, Cluster 0
D