In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Download the NLTK stopword corpus used by the text-cleaning step
nltk.download('stopwords')

from google.colab import drive
from sklearn.utils import shuffle


# Load the data: one CSV per news category, stored on Google Drive
drive.mount('/content/drive')

DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/BIG DATA/data/NewsArticles'
DATA_FILES = [
    'technology_data.csv',
    'sports_data.csv',
    'entertainment_data.csv',
    'education_data.csv',
    'business_data.csv',
]
# Seed for the row shuffle. Without it the row order changes on every run,
# so the fixed random_state of the later train/test split would not make the
# split reproducible.
SEED = 42

data_paths = [f'{DATA_DIR}/{file_name}' for file_name in DATA_FILES]
dfs = [pd.read_csv(path) for path in data_paths]

# Keep the per-file names available for any cell that still refers to them
data_path1, data_path2, data_path3, data_path4, data_path5 = data_paths
df1, df2, df3, df4, df5 = dfs

# Combine the DataFrames and shuffle the rows.
# ignore_index=True gives every row a unique label instead of repeating
# 0..n-1 once per source file.
df = pd.concat(dfs, ignore_index=True)
df = shuffle(df, random_state=SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Mounted at /content/drive


In [None]:
# Remove the column that is not needed for classification
df = df.drop(columns=["url"])

# Map category names to numeric class ids.
# 'business', 'entertainment', 'sport', 'tech' and 'politics' are the original
# keys and keep their original ids. 'sports', 'technology' and 'education' are
# added because the notebook loads sports_data.csv, technology_data.csv and
# education_data.csv.
# NOTE(review): presumably those files label their rows with these longer
# names -- confirm against df['category'].unique().
CATEGORY_IDS = {
    'business': 0,
    'entertainment': 1,
    'sport': 2,
    'sports': 2,
    'tech': 3,
    'technology': 3,
    'politics': 4,
    'education': 5,
}


def map_category(category):
    """Return the numeric class id for a category name.

    Args:
        category: Category name; must be a key of CATEGORY_IDS.

    Returns:
        The integer id associated with the category.

    Raises:
        KeyError: If the category is not a known name. The message lists
            the accepted names.
    """
    try:
        return CATEGORY_IDS[category]
    except KeyError:
        raise KeyError(
            f"Unknown category {category!r}; expected one of {sorted(CATEGORY_IDS)}"
        ) from None
df['Category'] = df['category'].apply(map_category)

# Combine the text columns into one document per article.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, and a NaN document makes re.sub() in stemming() raise TypeError.
text_columns = ['headlines', 'description', 'content']
text_parts = df[text_columns].fillna('')
df['Content'] = (
    text_parts['headlines'] + ' ' + text_parts['description'] + ' ' + text_parts['content']
)
df = df.drop(columns=text_columns)

# Stemming and stopword removal
_NON_LETTERS = re.compile('[^a-zA-Z]')
_STEMMING_RESOURCES = {}


def _stemming_resources():
    """Return (stemmer, stopword set), building them on first use only."""
    if not _STEMMING_RESOURCES:
        _STEMMING_RESOURCES['stemmer'] = PorterStemmer()
        _STEMMING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _STEMMING_RESOURCES['stemmer'], _STEMMING_RESOURCES['stop_words']


def stemming(text):
    """Normalise a document for vectorisation.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are removed and
    the remaining words are reduced to their Porter stems.

    Args:
        text: The raw document as a string.

    Returns:
        The stemmed words joined by single spaces.
    """
    # The stopword set is built once and reused; rebuilding it for every word
    # (as a call inside the comprehension would) dominates the running time.
    stemmer, stop_words = _stemming_resources()
    words = _NON_LETTERS.sub(' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)
# Clean and stem every document
df['Content'] = df['Content'].map(stemming)

In [None]:
# Split into training and test sets (20% test, stratified by class)
X, Y = df['Content'], df['Category']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# Vectorize the text with TF-IDF: fit on the training documents only, then
# apply the fitted vectorizer to the test documents
vc = TfidfVectorizer()
X_train, X_test = vc.fit_transform(X_train), vc.transform(X_test)

In [None]:
# Train the SVM model (default SVC settings)
model = SVC().fit(X_train, Y_train)

# Evaluate the model on the test split
Y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
weighted_f1 = f1_score(Y_test, Y_pred, average='weighted')
print("Accuracy:", accuracy)
print("F1-score:", weighted_f1)
print(classification_report(Y_test, Y_pred))

In [None]:
def make_predictions(headlines, description, content):
    """Predict and print the category of a single article.

    The three text fields are joined with spaces, cleaned with stemming(),
    vectorized with the fitted vectorizer `vc` and classified with `model`.

    Args:
        headlines: Article headline as a string.
        description: Article description as a string.
        content: Article body as a string.

    Returns:
        None. The predicted category name is printed.
    """
    text = ' '.join([headlines, description, content])
    features = vc.transform([stemming(text)])
    predicted_id = model.predict(features)[0]

    # The model predicts a numeric class id. map_category() goes from name to
    # id, so calling it with an id raises KeyError. Recover the name from the
    # id/name pairs held in the 'Category' and 'category' columns of df.
    id_to_name = dict(zip(df['Category'], df['category']))
    category = id_to_name[predicted_id]
    print("La categoría predicha es:", category)

In [None]:
# Usage example: classify one hand-written article
headlines, description, content = (
    "Nuevo avance en inteligencia artificial",
    "Investigadores desarrollan un nuevo algoritmo que mejora la precisión de los modelos de lenguaje.",
    "El equipo de investigación ha publicado un artículo que detalla el funcionamiento del algoritmo...",
)
make_predictions(headlines=headlines, description=description, content=content)