# Encontrar los artículos más relevantes para el proyecto de investigación

In [1]:
import bibtexparser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath('../'))
import scripts.eda_bib as eda_bib

In [2]:
# Definir las consultas para cada grupo de búsqueda
query_keywords = {
    'dropout_prediction': [
        "student dropout", "school dropout", "early warning systems", "dropout risk factors", 
        "dropout prevention", "student retention", "educational dropout prediction", "dropout analysis", "school"
    ],
    'moodle': [
        "moodle", "learning management systems", "virtual learning environments", "student engagement", 
        "Moodle analytics", "online learning platforms", "LMS engagement", "learning platform interaction", "school"
    ],
    'risk_detection': [
        "students at risk", "at-risk students", "early detection of academic risk", "early identification of student risk", 
        "academic risk factors", "predictive modeling for student risk", "risk prediction in education", 
        "early intervention in education", "school", "Predictive models", "Learning analytics"
    ],
    'student_performance_prediction': [
        "student performance", "academic success", "predicting academic performance", "student achievement prediction", 
        "performance analytics", "learning outcomes prediction", "student success models", "academic performance factors", "school"
    ],
    'processes': [
        "academic analytics", "decision support systems", "educational data mining", "data-driven decision making in education", 
        "process optimization in education", "data science in education", "student data processing", "learning analytics processes", "school"
    ]
}

In [3]:
file_paths = [
    '../resultados_consolidados/student_performance_prediction_consolidado.bib',
    '../resultados_consolidados/risk_detection_consolidado.bib',
    '../resultados_consolidados/dropout_prediction_consolidado.bib',
    '../resultados_consolidados/moodle_consolidado.bib',
    '../resultados_consolidados/processes_consolidado.bib'
]

In [5]:
# Preprocesar el texto (títulos y resúmenes)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /Users/diana/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/diana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/diana/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocesar_texto(texto):
    tokens = word_tokenize(texto.lower())
    tokens_limpios = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(tokens_limpios)

In [8]:
def calcular_relevancia_por_keywords(df, keywords):
    """
    Calcula la relevancia de artículos basados en una lista de keywords y devuelve los resultados más relevantes.
    
    Parámetros:
    - df: DataFrame con los artículos (incluyendo 'Abstract', 'Title', 'Keywords').
    - keywords: Lista de palabras clave para calcular la relevancia.
    
    Retorna:
    - resultados_relevancia: Lista de artículos con su relevancia ordenada.
    """

    # Preprocesar los textos de los artículos, incluyendo 'Abstract', 'Title', y 'Keywords' si están presentes
    articulos_procesados = df.apply(lambda row: preprocesar_texto(
        (row['Abstract'] if pd.notna(row['Abstract']) else '') + " " +
        (row['Title'] if pd.notna(row['Title']) else '') + " " +
        (row['Keywords'] if pd.notna(row['Keywords']) else '')
    ), axis=1)

    # Validar el preprocesamiento de los textos (ver los primeros resultados)
    print("Primeros 5 artículos procesados:\n", articulos_procesados.head())

    # Ajustes del vectorizador TF-IDF con parámetros más amplios
    vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, ngram_range=(1, 3), stop_words='english')  # Rango ngramas (1, 3) para capturar combinaciones
    tfidf_matrix = vectorizer.fit_transform(articulos_procesados)

    # Preprocesar las keywords
    keywords_procesadas = [preprocesar_texto(palabra) for palabra in keywords]
    consulta_procesada = " ".join(keywords_procesadas)
    
    # Validar el preprocesamiento de las keywords
    print("Palabras clave procesadas:", consulta_procesada)

    # Obtener la representación TF-IDF de la consulta
    consulta_tfidf = vectorizer.transform([consulta_procesada])
    
    # Calcular la similitud coseno entre la consulta y los artículos
    similaridades = cosine_similarity(tfidf_matrix, consulta_tfidf).flatten()

    # Ordenar los artículos por relevancia (similaridad más alta)
    articulos_ordenados = np.argsort(-similaridades)
    relevancia_articulos = similaridades[articulos_ordenados]

    # Filtrar los resultados más relevantes por un umbral (ajustar si es necesario)
    umbral = 0.05  # Bajamos el umbral para captar más artículos
    articulos_relevantes = [(df.iloc[idx], relevancia_articulos[i]) for i, idx in enumerate(articulos_ordenados) if relevancia_articulos[i] >= umbral][:10]

    # Si no hay suficientes artículos relevantes, imprime una advertencia
    if not articulos_relevantes:
        print("Advertencia: No se encontraron artículos con una relevancia mayor o igual al umbral.")

    return articulos_relevantes

In [11]:

def procesar_articulos_bibtex(keywords):
    """
    Procesa un archivo .bib, calcula la relevancia basada en keywords, y devuelve un DataFrame con Título, DOI y Relevancia.
    
    Parámetros:
    - file_paths: Lista de rutas de archivos .bib.
    - keywords: Lista de palabras clave para calcular la relevancia.
    
    Retorna:
    - df_articulos: DataFrame con las columnas 'Título', 'DOI' y 'Relevancia'.
    """
    
    # Cargar los artículos desde el archivo .bib
    articulos_df = eda_bib.convert_bib_to_df(file_paths)
    
    # Calcular relevancia basada en las palabras clave (keywords)
    resultados = calcular_relevancia_por_keywords(articulos_df, keywords)
    
    # Crear una lista vacía para almacenar los datos de los artículos
    articulos_data = []
    
    # Recorrer los resultados y extraer los artículos y relevancias
    for articulo, relevancia in resultados:
        articulos_data.append({
            'Título': articulo['Title'],  # Acceder a la columna del DataFrame
            'DOI': articulo.get('DOI', 'DOI no disponible'),  # Asegurarse de que exista el DOI
            'Relevancia': relevancia,
        })
    
    # Crear un DataFrame con los datos de los artículos
    df_articulos = pd.DataFrame(articulos_data)
    
    # Eliminar duplicados basados en el título y el DOI
    df_articulos = df_articulos.drop_duplicates(subset=['Título', 'DOI'])
    
    return df_articulos

In [12]:
# Inicializar un DataFrame global vacío para consolidar todos los resultados
df_articulos_global = pd.DataFrame(columns=['Título', 'DOI', 'Relevancia'])

## Deserción Escolar (Dropout Prediction)

In [13]:
# Cargar los artículos desde el archivo .bib
df_dropout = procesar_articulos_bibtex(query_keywords['dropout_prediction'])
df_dropout

Primeros 5 artículos procesados:
 0                                                index
1                                              content
2    traditional educational system certain nation ...
3    efficiency flexibility resilience energy syste...
4    learning analytics aim discover class student ...
dtype: object


Unnamed: 0,Título,DOI,Relevancia
0,Advancing school dropout early warning systems...,DOI no disponible,0.308672
1,"A Methodology to Design, Develop, and Evaluate...",DOI no disponible,0.200697
2,Early prediction models and crucial factor ext...,DOI no disponible,0.192006
3,Reducing Dropout Rate through a Deep Learning ...,DOI no disponible,0.191642
4,Prediction of Students' Early Dropout Based on...,DOI no disponible,0.182575
5,Redefining profit metrics for boosting student...,DOI no disponible,0.179363
6,Predicting dropout at master level using educa...,DOI no disponible,0.178539
7,An Early Warning System to Identify and Interv...,DOI no disponible,0.169841
8,A Real-Time Predictive Model for Identifying C...,DOI no disponible,0.16421
9,Predicting dropout from higher education: Evid...,DOI no disponible,0.161738


## Predicción del Rendimiento Académico (Student Performance Prediction)

In [14]:
df_student_performance_prediction = procesar_articulos_bibtex(query_keywords['student_performance_prediction'])
df_student_performance_prediction

Primeros 5 artículos procesados:
 0                                                index
1                                              content
2    traditional educational system certain nation ...
3    efficiency flexibility resilience energy syste...
4    learning analytics aim discover class student ...
dtype: object
Palabras clave procesadas: student performance academic success predicting academic performance student achievement prediction performance analytics learning outcome prediction student success model academic performance factor school


Unnamed: 0,Título,DOI,Relevancia
0,Predicting Student Performance Using Data Mini...,DOI no disponible,0.148586
1,Educational Big Data Mining: Comparison of Mul...,DOI no disponible,0.143239
2,A machine learning prediction of academic perf...,DOI no disponible,0.113145
3,A Fuzzy Model for Reasoning and Predicting Stu...,DOI no disponible,0.105057
4,Targeted projection pursuit similarity based a...,DOI no disponible,0.10455
5,HELA: A Novel Hybrid Ensemble Learning Algorit...,DOI no disponible,0.102067
6,Educational data mining: Predictive analysis o...,DOI no disponible,0.100561
7,Predicting Academic Performance Using an Effic...,DOI no disponible,0.099032
8,Student's performance prediction model and aff...,DOI no disponible,0.094924
9,Student Performance Prediction Model for Predi...,DOI no disponible,0.094827


## Detección de Riesgo y Sistemas de Alerta Temprana

In [15]:
# Cargar los artículos desde el archivo .bib
df_risk_detection = procesar_articulos_bibtex(query_keywords['risk_detection'])
df_risk_detection

Primeros 5 artículos procesados:
 0                                                index
1                                              content
2    traditional educational system certain nation ...
3    efficiency flexibility resilience energy syste...
4    learning analytics aim discover class student ...
dtype: object
Palabras clave procesadas: student risk student early detection academic risk early identification student risk academic risk factor predictive modeling student risk risk prediction education early intervention education school predictive model learning analytics


Unnamed: 0,Título,DOI,Relevancia
0,Predictive models of academic risk in computin...,DOI no disponible,0.229741
1,Identifying False Positives When Targeting Stu...,DOI no disponible,0.128177
2,Predicting Students at Risk of Early Dropping ...,DOI no disponible,0.115356
3,Potential Risks of Artificial Intelligence Int...,DOI no disponible,0.099318
4,Early Detecting Students at Risk Using Machine...,DOI no disponible,0.093662
5,Dropout early warning systems for high school ...,DOI no disponible,0.090862
6,Predictive Model Using a Machine Learning Appr...,DOI no disponible,0.090437
7,Framework for Automatically Suggesting Remedia...,DOI no disponible,0.088568
8,Interpretable machine learning predicts postpa...,DOI no disponible,0.087957
9,A Systematic Literature Review of Student' Per...,DOI no disponible,0.086503


## Uso de Datos de Moodle y Plataformas de E-Learning

In [16]:
# Cargar los artículos desde el archivo .bib
df_moodle = procesar_articulos_bibtex(query_keywords['moodle'])
df_moodle

Primeros 5 artículos procesados:
 0                                                index
1                                              content
2    traditional educational system certain nation ...
3    efficiency flexibility resilience energy syste...
4    learning analytics aim discover class student ...
dtype: object
Palabras clave procesadas: moodle learning management system virtual learning environment student engagement moodle analytics online learning platform lm engagement learning platform interaction school


Unnamed: 0,Título,DOI,Relevancia
0,Learning Analytics Intervention Improves Stude...,DOI no disponible,0.143849
1,Moodle quizzes and their usability for formati...,DOI no disponible,0.120046
2,Toward Precision Education: Educational Data M...,DOI no disponible,0.114571
3,Beyond Performance Analytics: Using Learning A...,DOI no disponible,0.111128
4,Towards a better understanding of the role of ...,DOI no disponible,0.110019
5,Student's Interest and Opinion Towards Online ...,DOI no disponible,0.107956
6,Predicting academic performance of students fr...,DOI no disponible,0.107071
7,Formative Assessment Tasks as Indicators of St...,DOI no disponible,0.105817
8,Developing Engagement in the Learning Manageme...,DOI no disponible,0.105006
9,Using LMS Log Data to Explore Student Engageme...,DOI no disponible,0.1048


## Procesos Académicos y Analítica Predictiva

In [17]:
df_processes= procesar_articulos_bibtex(query_keywords['processes'])
df_processes


Primeros 5 artículos procesados:
 0                                                index
1                                              content
2    traditional educational system certain nation ...
3    efficiency flexibility resilience energy syste...
4    learning analytics aim discover class student ...
dtype: object
Palabras clave procesadas: academic analytics decision support system educational data mining decision making education process optimization education data science education student data processing learning analytics process school


Unnamed: 0,Título,DOI,Relevancia
0,Early detection of student degree-level academ...,DOI no disponible,0.100291
1,Data mining-based decision support system for ...,DOI no disponible,0.094791
2,Enhancing personalized learning with explainab...,DOI no disponible,0.094643
3,Reviewing the differences between learning ana...,DOI no disponible,0.080191
4,Comparison of learning analytics and education...,DOI no disponible,0.079775
5,Mining Big Data in Education: Affordances and ...,DOI no disponible,0.076115
6,Educational data mining to predict students' a...,DOI no disponible,0.075666
7,Data-driven Decision Making in Higher Educatio...,DOI no disponible,0.073089
8,Educational Data Mining to Predict Students' A...,DOI no disponible,0.07015
9,Investigation of factors affecting student per...,DOI no disponible,0.069259


#### DF Con todos los resultados

In [18]:
df_articulos_global = pd.concat([df_articulos_global, df_moodle, df_processes, df_risk_detection, df_student_performance_prediction], ignore_index=True)
df_articulos_global

  df_articulos_global = pd.concat([df_articulos_global, df_moodle, df_processes, df_risk_detection, df_student_performance_prediction], ignore_index=True)


Unnamed: 0,Título,DOI,Relevancia
0,Learning Analytics Intervention Improves Stude...,DOI no disponible,0.143849
1,Moodle quizzes and their usability for formati...,DOI no disponible,0.120046
2,Toward Precision Education: Educational Data M...,DOI no disponible,0.114571
3,Beyond Performance Analytics: Using Learning A...,DOI no disponible,0.111128
4,Towards a better understanding of the role of ...,DOI no disponible,0.110019
5,Student's Interest and Opinion Towards Online ...,DOI no disponible,0.107956
6,Predicting academic performance of students fr...,DOI no disponible,0.107071
7,Formative Assessment Tasks as Indicators of St...,DOI no disponible,0.105817
8,Developing Engagement in the Learning Manageme...,DOI no disponible,0.105006
9,Using LMS Log Data to Explore Student Engageme...,DOI no disponible,0.1048


In [19]:
# Eliminar duplicados basados en el título
df_articulos_global = df_articulos_global.drop_duplicates(subset=['Título'])

In [20]:
for i, row in df_articulos_global.iterrows():
    print(f"{row['Título']}")
    print(f"    DOI: {row['DOI']}")
    print(f"    Relevancia: {row['Relevancia']}")

Learning Analytics Intervention Improves Students' Engagement in Online Learning
    DOI: DOI no disponible
    Relevancia: 0.14384876231306826
Moodle quizzes and their usability for formative assessment of academic writing
    DOI: DOI no disponible
    Relevancia: 0.12004618311881643
Toward Precision Education: Educational Data Mining and Learning Analytics for Identifying Students' Learning Patterns with Ebook Systems
    DOI: DOI no disponible
    Relevancia: 0.11457084752959716
Beyond Performance Analytics: Using Learning Analytics to Understand Learning Processes That Lead to Improved Learning Outcomes
    DOI: DOI no disponible
    Relevancia: 0.11112830831949554
Towards a better understanding of the role of visualization in online learning: A review
    DOI: DOI no disponible
    Relevancia: 0.11001923470143536
Student's Interest and Opinion Towards Online Education
    DOI: DOI no disponible
    Relevancia: 0.10795647947614344
Predicting academic performance of students from V