In [13]:
import pandas as pd
import ast
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import altair as alt
from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pyvis.network import Network

In [2]:
# Read the source records from the CSV file into `data`.
# Later cells rely on this frame having a 'category' column and a 'title' column.
data = pd.read_csv('elasticsearch_data.csv')

In [4]:
# Parse each row's 'category' cell -- a stringified Python list such as
# "['Filosofía', 'Política']" -- into a real list of labels. Cells that are not
# strings (for example NaN for a missing value) become an empty list.
# ast.literal_eval only accepts Python literals, so, unlike eval, text coming
# from the CSV file cannot execute arbitrary code.
categories_list = [
    ast.literal_eval(raw_categories) if isinstance(raw_categories, str) else []
    for raw_categories in data['category']
]

def clean_categories(category_list):
    """Return the labels with list-literal punctuation trimmed from both ends.

    Every label has any leading or trailing '[', ']' or "'" characters
    removed; characters in the middle of a label are left alone.

    Args:
        category_list: iterable of label strings.

    Returns:
        A new list holding one trimmed string per input label, in input order.
    """
    trimmed_labels = []
    for label in category_list:
        trimmed_labels.append(label.strip("[]'"))
    return trimmed_labels

# Run every parsed row through clean_categories
cleaned_categories_list = list(map(clean_categories, categories_list))

# Fit a MultiLabelBinarizer on the cleaned label lists and build the
# row-by-label indicator matrix
mlb = MultiLabelBinarizer()
categories_matrix = mlb.fit_transform(cleaned_categories_list)

# Wrap the indicator matrix in a DataFrame whose columns are the fitted labels
categories_df = pd.DataFrame(data=categories_matrix, columns=mlb.classes_)

# Append the indicator columns to the right of the existing columns.
# NOTE(review): `data` is rebound to the concatenated frame, so running this
# cell a second time appends the indicator columns again.
data = pd.concat(objs=[data, categories_df], axis="columns")

# Display the 'category' column; the steps above add columns but do not
# rewrite this one
data['category']

0       ['Filosofía', 'Política', 'Filosofía y Política']
1       ['Filosofía', 'Política', 'Filosofía y Política']
2       ['Filosofía', 'Política', 'Filosofía y Política']
3       ['Filosofía', 'Política', 'Filosofía y Política']
4       ['Sociología', 'Economía', 'Sociología y Econo...
                              ...                        
1688    ['Otras Actividades Académicas', 'Actos públic...
1689    ['Otras Actividades Académicas', 'Actos públic...
1690    ['Otras Actividades Académicas', 'Actos públic...
1691    ['Otras Actividades Académicas', 'Noticiario a...
1692    ['Otras Actividades Académicas', 'Noticiario a...
Name: category, Length: 1693, dtype: object

In [5]:

# Explode the 'category' column, keep a copy of it as 'category_text', and
# renumber the rows so the index has no repeated labels.
# NOTE(review): the displayed 'category' values look like strings such as
# "['Filosofía', 'Política']" rather than lists; if so, explode() leaves every
# row unchanged -- confirm whether the parsed lists were meant to be used here.
data_expanded = (
    data.explode('category')
    .assign(category_text=lambda frame: frame['category'])
    .reset_index(drop=True)
)

# Fit a fresh MultiLabelBinarizer on the 'category' values.
# NOTE(review): this rebinds `mlb` and `categories_matrix`, and nothing later
# in this cell reads them. MultiLabelBinarizer iterates each entry as a
# collection of labels, so string entries would presumably be split into
# single characters -- confirm intent.
mlb = MultiLabelBinarizer()
categories_matrix = mlb.fit_transform(data_expanded['category'])

# TF-IDF features computed from the category text of each row
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data_expanded['category_text'])

# Project the TF-IDF features down to two dimensions with t-SNE
# (random initialisation, fixed seed)
tsne = TSNE(init="random", random_state=0, n_components=2)
reduced_features = tsne.fit_transform(tfidf_matrix)

# Pair the 2-D coordinates with the category text of the same row
tsne_df = pd.DataFrame(reduced_features, columns=['t-SNE1', 't-SNE2']).assign(
    category=data_expanded['category_text']
)

# Scatter plot of the embedding, coloured by category, with hover tooltips
scatter = (
    alt.Chart(tsne_df)
    .mark_circle()
    .encode(
        x='t-SNE1',
        y='t-SNE2',
        color=alt.Color('category:N', scale=alt.Scale(scheme='category20')),
        tooltip=['category', 't-SNE1', 't-SNE2'],
    )
    .properties(title='Category Relationships Network Graph (Altair)')
)

# Render the chart with pan/zoom enabled
scatter.interactive()

In [12]:
# Build an edge list with one (article title -> category) row per category.
#
# The 'category' cells are stringified Python lists, so they are parsed with
# ast.literal_eval instead of stripping brackets and splitting on ', '. That
# keeps labels containing commas or apostrophes intact and produces no edge
# (rather than an empty-string target) for an empty list. Cells that are not
# strings (for example NaN) contribute no edges.
#
# The records are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
edge_records = []
for title, raw_categories in zip(data['title'], data['category']):
    if not isinstance(raw_categories, str):
        continue
    for category in ast.literal_eval(raw_categories):
        edge_records.append({'source': title, 'target': category})

# Passing `columns` keeps the 'source'/'target' header even when there are no edges
new_data = pd.DataFrame(edge_records, columns=['source', 'target'])

# Save the new data to a CSV file
new_data.to_csv('categories_target.csv', index=False)

In [8]:
# Display the 'category' column of `data` for inspection
data['category']

0       ['Filosofía', 'Política', 'Filosofía y Política']
1       ['Filosofía', 'Política', 'Filosofía y Política']
2       ['Filosofía', 'Política', 'Filosofía y Política']
3       ['Filosofía', 'Política', 'Filosofía y Política']
4       ['Sociología', 'Economía', 'Sociología y Econo...
                              ...                        
1688    ['Otras Actividades Académicas', 'Actos públic...
1689    ['Otras Actividades Académicas', 'Actos públic...
1690    ['Otras Actividades Académicas', 'Actos públic...
1691    ['Otras Actividades Académicas', 'Noticiario a...
1692    ['Otras Actividades Académicas', 'Noticiario a...
Name: category, Length: 1693, dtype: object

In [15]:
# Load the article -> category edge list from 'categories_target.csv'.
# dtype=str keeps every identifier a Python string (PyVis only accepts str or
# int node ids, so NumPy integers or NaN floats would be rejected), and rows
# with a missing endpoint -- which pandas reads as NaN -- are dropped because
# they cannot be used as node ids.
relations_df = (
    pd.read_csv('categories_target.csv', dtype=str)
    .dropna(subset=['source', 'target'])
)

# Create a PyVis network object sized for notebook display
net = Network(notebook=True, width=1000, height=600)

# Add both endpoints of every relation, then the edge joining them.
# The article node carries its own title as hover text.
for source, target in zip(relations_df['source'], relations_df['target']):
    net.add_node(source, title=source)
    net.add_node(target)
    net.add_edge(source, target)

# Write the graph to an HTML file and show it
net.show('article_category_boe_relations.html')

article_category_boe_relations.html
