## Import necessary libraries

In [58]:
import json
import pandas as pd

## Read the keywords from the JSON file

In [59]:
def read_keywords(file = 'keywords.json'):
    """
    Load a keywords dictionary from a JSON file.

    Parameters:
    ----------
    file : str
        Path of the JSON file to read. Defaults to 'keywords.json'.

    Returns:
    -------
    object
        The deserialized JSON content, exactly as produced by ``json.load``.

    Raises:
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default encoding (e.g. cp1252
    # on Windows) would garble or reject accented characters in the keywords.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

In [80]:
# load the keyword dictionary stored in 'keywords_quimica.json'
keywords_quimica = read_keywords(file='keywords_quimica.json')

## Classify the exercises using the keywords

In [75]:
def classify_exercise_type(df, subject, keywords_dict):
    """
    Classify the exercises of one subject by looking for keywords in their statements.

    The DataFrame is filtered by subject and then processed topic by topic (in order of
    first appearance). Topics that have no keywords in ``keywords_dict`` are skipped
    entirely: their exercises appear in neither returned list. For every remaining
    exercise, each keyword of its topic is searched in the statement as a case-sensitive
    substring, and the classifications of all matching keywords are collected.

    Parameters:
    ----------
    df : pandas.DataFrame
        The DataFrame containing the exercises. It must have the columns 'subject', 'topic', and 'statement'.
    subject : str
        The subject to filter the DataFrame by (e.g., 'Chemistry').
    keywords_dict : dict
        A dictionary where keys are topics and values are dictionaries of keyword-classification pairs.
        For example:
        {
            'Topic1': {'keyword1': 'classification1', 'keyword2': 'classification2'},
            'Topic2': {'keyword3': 'classification3'}
        }

    Returns:
    -------
    tuple
        A tuple containing:
        - indexes (list): The DataFrame index labels of the classified exercises, grouped by topic.
        - exercise_types (list): One entry per label in ``indexes``, in the same order. Each entry is:
            - A single classification (str) if all matching keywords map to one classification.
            - A list of distinct classifications (list of str), in keyword order, if the
              matching keywords map to several classifications.
            - '<topic> varios' (str) if no keyword matches or the statement is not a string
              (e.g. a missing value).

    Example:
    -------
    >>> data = {
    ...     'subject': ['Chemistry', 'Chemistry', 'Math'],
    ...     'topic': ['Equilibrium', 'Equilibrium', 'Algebra'],
    ...     'statement': ['Increase pressure', 'True or false?', 'Solve for x']
    ... }
    >>> df = pd.DataFrame(data)
    >>> keywords_dict = {
    ...     'Equilibrium': {'Increase': 'LeChatelier', 'True': 'True/False'}
    ... }
    >>> classify_exercise_type(df, 'Chemistry', keywords_dict)
    ([0, 1], ['LeChatelier', 'True/False'])
    """
    # keep only the exercises of the requested subject
    subset = df[df.subject == subject]

    # index labels of the classified exercises and, aligned with them,
    # the classification assigned to each one
    indexes = []
    exercise_types = []

    # classify exercises topic by topic
    # Ex.: 'Equilibrio Químico'
    for topic in subset.topic.unique():
        keywords_for_topic = keywords_dict.get(topic)

        # topics without established keywords are left unclassified
        if not keywords_for_topic:
            continue

        # all the exercises belonging to the current topic
        subset_topic = subset[subset.topic == topic]
        indexes.extend(subset_topic.index)

        for statement in subset_topic.statement:
            # a missing statement (NaN) is not a string and cannot be searched;
            # treat it as matching no keyword instead of raising a TypeError
            if isinstance(statement, str):
                matched = [
                    value
                    for key, value in keywords_for_topic.items()
                    if key in statement
                ]
            else:
                matched = []

            # remove duplicates while keeping keyword order, so the result
            # is reproducible between runs (a set has no stable order)
            types = list(dict.fromkeys(matched))

            if not types:
                # exercise didn't match any keyword
                exercise_types.append(f'{topic} varios')
            elif len(types) == 1:
                # every matching keyword maps to the same classification
                exercise_types.append(types[0])
            else:
                # exercise matched several distinct classifications
                exercise_types.append(types)

    return indexes, exercise_types

## Read the exercises dataframe

In [76]:
# load every exercise from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='./csv/TODOS.csv')

In [72]:
# preview the first five rows of the exercises
df.head(n=5)

Unnamed: 0,subject,year,topic,exam,exercise_number,statement
0,Física,2016,Física cuántica y nuclear,Junio,"Ejercicio 2, Opción B",a) Teoría de Einstein del efecto fotoeléctrico...
1,Física,2016,Física cuántica y nuclear,Junio,"Ejercicio 4, Opción A",El * Pb emite dos partículas beta y se transfo...
2,Física,2016,Física cuántica y nuclear,Reserva 2,"Ejercicio 4, Opción A",a) Calcule la energía liberada en el proceso p...
3,Física,2016,Física cuántica y nuclear,Reserva 3,"Ejercicio 2, Opción A",a) ¿Qué se entiende por dualidad onda-corpúscu...
4,Física,2016,Física cuántica y nuclear,Reserva 4,"Ejercicio 2, Opción A",a) Escriba la ley de desintegración radiactiva...


## Classify the exercises and add their types to the DataFrame

In [81]:
# run the keyword classifier over the 'Química' exercises
indexes, types = classify_exercise_type(
    df=df,
    subject='Química',
    keywords_dict=keywords_quimica,
)
# add the new column, initially missing for every row
df['exercise_type'] = pd.NA
# pair each classification with the index label of its exercise
type_series = pd.Series(data=types, index=indexes)
# write the classifications into the rows that were classified;
# all other rows keep the missing value
df.loc[type_series.index, 'exercise_type'] = type_series

## Check dataframe after classification process

In [82]:
# show ten randomly chosen 'Química' exercises to inspect the result
df.loc[df['subject'] == 'Química'].sample(n=10)

Unnamed: 0,subject,year,topic,exam,exercise_number,statement,exercise_type
1533,Química,2015,Equilibrio Químico,Junio,"Ejercicio 4, Opción A",b) La energía de activación no varía\ne) La ve...,Equilibrio Químico varios
3257,Química,2018,Enlace Químico,Reserva 1,"Ejercicio 2, Opción B","a) Dibuje la molécula de eteno (CH, =CH),), in...","[Estructuras de Lewis, Hibridación del átomo c..."
2347,Química,2001,Reacciones Redox,Reserva 3,"Ejercicio 6, Opción B",El principal método de obtención del aluminio ...,Reacciones Redox varios
1969,Química,2002,Equilibrio Químico,Reserva 1,"Ejercicio 5, Opción B","En un recipiente de 10 L se hacen reaccionar, ...",Equilibrio Químico varios
2336,Química,2000,Reacciones Redox,Reserva 1,"Ejercicio 5, Opción B",Dada la reacción redox en disolución acuosa:\n...,Ajuste reacción redox
1987,Química,2004,Equilibrio Químico,Junio,"Ejercicio 6, Opción B","En un recipiente de 10 litros a 800* K, se int...",Equilibrio Químico varios
1973,Química,2002,Equilibrio Químico,Reserva 3,"Ejercicio 6, Opción B","Una muestra de 653 g de NH,HS se introduce en ...",Equilibrio Químico varios
1656,Química,2009,Ácido Base,Septiembre,"Ejercicio 4, Opción B","En medio acuoso, según la teoría de Brónsted-L...",Teoría de Brönsted-Lowry
2188,Química,2003,Configuración Electrónica,Reserva 2,"Ejercicio 2, Opción B",Dadas las siguientes configuraciones electróni...,Ion más estable
2645,Química,2018,Reactividad Orgánica,Reserva 1,"Ejercicio 4, Opción A",CH ¿CH(OH)CHO\na) Identifique y nombre los gru...,Isomería Óptica


## Export classified dataframe

In [85]:
# save the classified exercises, leaving out the DataFrame index
df.to_csv(path_or_buf='./csv/TODOS_classified.csv', index=False)