In [2]:
# Importamos las librerías que se usarán

import pandas as pd
import warnings
import re # Regular expression operations. Provides regular expression matching.

warnings.filterwarnings("ignore")

In [None]:
# anchos_de_columna = [9, 16, 16, 350]

# column_names = [
#     "cluster",
#     "cantidad_de_palabras_clave",
#     "porcentaje_de_palabras_clave",
#     "principales_palabras_clave",
# ]

# archivo_clusters = "clusters_report.txt"

In [7]:
# Load the fixed-width cluster report.
# header=None makes pandas treat every line of the file as data, so the
# report's own heading lines show up as the first rows of the frame.
column_widths = [9, 16, 16, 350]
column_names = [
    "cluster",
    "cantidad_de_palabras_clave",
    "porcentaje_de_palabras_clave",
    "principales_palabras_clave",
]

cluster_df = pd.read_fwf(
    "clusters_report.txt",
    widths=column_widths,
    header=None,
    names=column_names,
)

cluster_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,Cluster,Cantidad de,Porcentaje de,Principales palabras clave
1,,palabras clave,palabras clave,
2,---------,----------------,----------------,----------------------------------------------...
3,1,105,"15,9 %","maximum power point tracking, fuzzy-logic base..."
4,,,,"photo-voltaic system, differential evoluti..."
5,,,,"algorithm, double-fed induction generator (dfi..."
6,,,,"photo voltaic array, firefly algorithm, partia..."
7,2,102,"15,4 %","support vector machine, long short-term mem..."
8,,,,"network, convolution neural network, speed..."
9,,,,"consumption, wind power forecasting, e..."


In [8]:
# Discard the first three rows (the report's heading lines and the dashed
# separator) and renumber the remaining rows starting from zero.
heading_rows = [0, 1, 2]
cluster_df = cluster_df.drop(heading_rows).reset_index(drop=True)

cluster_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1.0,105.0,"15,9 %","maximum power point tracking, fuzzy-logic base..."
1,,,,"photo-voltaic system, differential evoluti..."
2,,,,"algorithm, double-fed induction generator (dfi..."
3,,,,"photo voltaic array, firefly algorithm, partia..."
4,2.0,102.0,"15,4 %","support vector machine, long short-term mem..."
5,,,,"network, convolution neural network, speed..."
6,,,,"consumption, wind power forecasting, e..."
7,,,,"recurrent-neural-network (rnn), radial basis f..."
8,,,,farm.
9,3.0,89.0,"13,4 %","smart grid, wind power, reinforcement learning..."


In [9]:
# Make sure the keyword column holds plain Python strings before any
# string manipulation is applied to it.
keywords_column = "principales_palabras_clave"
cluster_df[keywords_column] = cluster_df[keywords_column].astype(str)

cluster_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1.0,105.0,"15,9 %","maximum power point tracking, fuzzy-logic base..."
1,,,,"photo-voltaic system, differential evoluti..."
2,,,,"algorithm, double-fed induction generator (dfi..."
3,,,,"photo voltaic array, firefly algorithm, partia..."
4,2.0,102.0,"15,4 %","support vector machine, long short-term mem..."
5,,,,"network, convolution neural network, speed..."
6,,,,"consumption, wind power forecasting, e..."
7,,,,"recurrent-neural-network (rnn), radial basis f..."
8,,,,farm.
9,3.0,89.0,"13,4 %","smart grid, wind power, reinforcement learning..."


In [10]:
# Turn the percentage text (e.g. "15,9 %") into a float: remove the percent
# sign, switch the decimal comma to a decimal point, then cast.
percentage_text = cluster_df["porcentaje_de_palabras_clave"]
percentage_text = percentage_text.str.replace("%", "")
percentage_text = percentage_text.str.replace(",", ".")
cluster_df["porcentaje_de_palabras_clave"] = percentage_text.astype(float)

cluster_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1.0,105.0,15.9,"maximum power point tracking, fuzzy-logic base..."
1,,,,"photo-voltaic system, differential evoluti..."
2,,,,"algorithm, double-fed induction generator (dfi..."
3,,,,"photo voltaic array, firefly algorithm, partia..."
4,2.0,102.0,15.4,"support vector machine, long short-term mem..."
5,,,,"network, convolution neural network, speed..."
6,,,,"consumption, wind power forecasting, e..."
7,,,,"recurrent-neural-network (rnn), radial basis f..."
8,,,,farm.
9,3.0,89.0,13.4,"smart grid, wind power, reinforcement learning..."


In [11]:
# Fill the gaps in 'cluster', 'cantidad_de_palabras_clave' and
# 'porcentaje_de_palabras_clave'.
# Only the first row of each cluster carries these values; continuation rows
# are NaN. Forward fill replaces every NaN with the last valid value that
# appears above it in the same column.
# `.ffill()` is used instead of `fillna(method="pad")`, which is deprecated in
# recent pandas releases (the warning is hidden here because warnings are
# globally silenced at the top of the notebook).
columns_to_fill = [
    "cluster",
    "cantidad_de_palabras_clave",
    "porcentaje_de_palabras_clave",
]
for column_name in columns_to_fill:
    cluster_df[column_name] = cluster_df[column_name].ffill()

cluster_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1,105,15.9,"maximum power point tracking, fuzzy-logic base..."
1,1,105,15.9,"photo-voltaic system, differential evoluti..."
2,1,105,15.9,"algorithm, double-fed induction generator (dfi..."
3,1,105,15.9,"photo voltaic array, firefly algorithm, partia..."
4,2,102,15.4,"support vector machine, long short-term mem..."
5,2,102,15.4,"network, convolution neural network, speed..."
6,2,102,15.4,"consumption, wind power forecasting, e..."
7,2,102,15.4,"recurrent-neural-network (rnn), radial basis f..."
8,2,102,15.4,farm.
9,3,89,13.4,"smart grid, wind power, reinforcement learning..."


In [12]:
def join_words(words):
    """Join text fragments into one space-separated string.

    Entries that pandas considers missing (None, NaN) are skipped, and the
    surrounding whitespace of every remaining fragment is stripped before
    the fragments are joined.

    Args:
        words: Iterable of string fragments, possibly containing missing values.

    Returns:
        A single string with the cleaned fragments separated by one space.
    """
    cleaned_fragments = []
    for fragment in words:
        if pd.isna(fragment):
            continue
        cleaned_fragments.append(fragment.strip())
    return " ".join(cleaned_fragments)

In [13]:
# Collapse the rows of each cluster into a single row by concatenating its
# keyword fragments. dropna=False keeps groups whose key contains NaN.
group_keys = [
    "cluster",
    "cantidad_de_palabras_clave",
    "porcentaje_de_palabras_clave",
]
keywords_by_cluster = cluster_df.groupby(group_keys, dropna=False)[
    "principales_palabras_clave"
]
clusters_report_df = keywords_by_cluster.agg(join_words).reset_index()

clusters_report_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1,105,15.9,"maximum power point tracking, fuzzy-logic base..."
1,10,27,4.1,"micro grid, multi-agent systems, distribute..."
2,11,22,3.3,"hydrogen, biochar, biomass, biogas, microb..."
3,12,22,3.3,"state of charge (soc) estimation, radial bas..."
4,13,17,2.6,"pem fuel cell, solid-oxide fuel cell, ..."
5,2,102,15.4,"support vector machine, long short-term mem..."
6,3,89,13.4,"smart grid, wind power, reinforcement learning..."
7,4,60,9.1,"wind turbine, fault diagnosis, biodi..."
8,5,52,7.9,"electric vehicle, lithium-ion batteries, state..."
9,6,51,7.7,"particle swarm optimization, distribute ge..."


In [14]:
# Cast the cluster label to int so that sorting is numeric (1, 2, ..., 10)
# rather than the textual order produced by the grouping step (1, 10, 11, ...).
clusters_report_df["cluster"] = clusters_report_df["cluster"].astype(int)
clusters_report_df = clusters_report_df.sort_values(by="cluster")

# Ordered literal substitutions that tidy up the joined keyword text.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and a pattern such as "." must never be read as a regular
# expression (it would match every character).
keyword_cleanups = [
    (",, ", ", "),
    ("   ", " "),
    ("  ", " "),
    ("hydrogen  production,  numeric", "hydrogen production, numeric"),
    (",  ", ", "),
    (".", ""),
]

cleaned_keywords = clusters_report_df["principales_palabras_clave"].astype(str)
for old_text, new_text in keyword_cleanups:
    cleaned_keywords = cleaned_keywords.str.replace(old_text, new_text, regex=False)

clusters_report_df["principales_palabras_clave"] = cleaned_keywords.str.lower()

clusters_report_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1,105,15.9,"maximum power point tracking, fuzzy-logic base..."
5,2,102,15.4,"support vector machine, long short-term memory..."
6,3,89,13.4,"smart grid, wind power, reinforcement learning..."
7,4,60,9.1,"wind turbine, fault diagnosis, biodiesel, fail..."
8,5,52,7.9,"electric vehicle, lithium-ion batteries, state..."
9,6,51,7.7,"particle swarm optimization, distribute genera..."
10,7,42,6.3,"multi-objective optimization, energy storage, ..."
11,8,38,5.7,"genetic algorithm, demand-side management, ene..."
12,9,35,5.3,"anfis, global solar irradiance, solar irradian..."
1,10,27,4.1,"micro grid, multi-agent systems, distributed e..."


In [15]:
# Insert the missing space after any comma squeezed directly between two
# letters ("a,b" -> "a, b"). Lookarounds are used so the neighbouring letters
# are not consumed by the match; with capturing groups, back-to-back cases
# such as "a,b,c" would only have their first comma fixed.
clusters_report_df["principales_palabras_clave"] = clusters_report_df[
    "principales_palabras_clave"
].str.replace(r"(?<=[a-zA-Z]),(?=[a-zA-Z])", ", ", regex=True)

# Store the keyword count as an integer.
clusters_report_df["cantidad_de_palabras_clave"] = clusters_report_df[
    "cantidad_de_palabras_clave"
].astype(int)

clusters_report_df

Unnamed: 0,cluster,cantidad_de_palabras_clave,porcentaje_de_palabras_clave,principales_palabras_clave
0,1,105,15.9,"maximum power point tracking, fuzzy-logic base..."
5,2,102,15.4,"support vector machine, long short-term memory..."
6,3,89,13.4,"smart grid, wind power, reinforcement learning..."
7,4,60,9.1,"wind turbine, fault diagnosis, biodiesel, fail..."
8,5,52,7.9,"electric vehicle, lithium-ion batteries, state..."
9,6,51,7.7,"particle swarm optimization, distribute genera..."
10,7,42,6.3,"multi-objective optimization, energy storage, ..."
11,8,38,5.7,"genetic algorithm, demand-side management, ene..."
12,9,35,5.3,"anfis, global solar irradiance, solar irradian..."
1,10,27,4.1,"micro grid, multi-agent systems, distributed e..."
