In [1]:
import pandas as pd

# read tsv file
test = pd.read_csv('test_df1.tsv', sep='\t')
train = pd.read_csv('train_df.tsv', sep='\t')

#join the two dataframes
df = pd.concat([test, train])

#remove a list of columns:
#df = df.drop(['idioma', 'editor', 'programa', 'direito_acesso', 'departamento', 'area_cnpq', 'keywords', 'tipo', 'patrocinio', 'abstract'], axis=1)

In [4]:
df.columns

Index(['data', 'uri', 'idioma', 'editor', 'programa', 'direito_acesso',
       'departamento', 'area_cnpq', 'titulo', 'tipo', 'patrocinio', 'keywords',
       'abstract', 'palavras_chave', 'resumo'],
      dtype='object')

In [2]:
# import a json file and load as as a dict

import json
with open('ods_regex_patterns.json') as f:
    data = json.load(f)

In [3]:
data

{'ods1': '\\b((?:ajuda financeira).*?(?:pobreza))|((?:ajuda financeira).*?(?:pobre|pobres))|((?:ajuda financeira).*?(?:divisão norte-sul))|((?:desenvolvimento financeiro).*?(?:pobreza))|((?:efeito sobre a).*?(?:distribuição|efeito distributivo|efeitos distributivos))|((?:proteção social).*?(?:acesso))|((?:rede de segurança).*?(?:pobres|pobre|vulnerável|vulneráveis))|((?:recurso econômico).*?(?:acesso))|((?:recursos econômicos).*?(?:acesso))|(extrema pobreza|alívio da pobreza|erradicação da pobreza|redução da pobreza|linha internacional de pobreza|empoderamento financeiro|trabalho infantil|ajuda ao desenvolvimento|proteção social|sistema de proteção social|microfinanc|resiliência dos pobres|banco de alimentos|bancos de alimentos)\\b',
 'ods2': '\\b((?:pequeno produtor).*?(?:fazenda|silvicultura|pastoril|agricultura|pesca|produtor de alimentos|produtores de alimentos))|((?:agricultura).*?(?:potássio))|((?:transgênicos).*?(?:alimentos))|((?:segurança alimentar).*?(?:diversidade genética))

In [10]:
import pandas as pd
import numpy as np
import re

def match_and_append(df, keyword_regex_dict):
    # Compile the regex patterns in the dictionary
    compiled_patterns = {k: re.compile(v) for k, v in keyword_regex_dict.items()}

    # Initialize an empty list for the ODS column
    ods_column = []

    # Loop through each row in the dataframe
    for index, row in df.iterrows():
        # Get the text and keyword for the current row
        text = row['resumo']
        keyword = row['palavras_chave']

        # Initialize an empty list for the ODS values for the current row
        ods_values = []

        # Loop through each regex pattern in the dictionary
        for key, pattern in compiled_patterns.items():
            # Find all matches of the regex pattern in the text
            matches = pattern.findall(text)

            # If matches are found, append the corresponding keyword to the ODS values list
            if matches:
                ods_values.append(key)

        # If no matches were found, append NaN
        if not ods_values:
            ods_values = np.nan

        # Append the ODS values for the current row to the ODS column
        ods_column.append(ods_values)

    # Assign the ODS column to the dataframe
    df['ODS'] = ods_column

    # Return the updated dataframe
    return df


# Apply the function to the dataframe

df_ods = match_and_append(df, data)
df_ods['ODS']

0                  NaN
1                  NaN
2       [ods10, ods16]
3                  NaN
4                  NaN
             ...      
6064               NaN
6065               NaN
6066               NaN
6067               NaN
6068               NaN
Name: ODS, Length: 8092, dtype: object

In [18]:
import os

def get_words_in_braces_from_files(folder_path):
    files_with_words = {}
    
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) and file_path.endswith('.txt'):
            with open(file_path, 'r') as file:
                words_in_braces = []
                for line in file:
                    words_in_braces.extend([
                        word.strip("{}")
                        for word in line.split()
                        if word.startswith("{") and word.endswith("}")
                    ])
                if words_in_braces:
                    files_with_words[filename] = words_in_braces
    return files_with_words

# Usage example:
folder_path = 'strings-de-busca-ods'
result = get_words_in_braces_from_files(folder_path)
print(result)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 1017: character maps to <undefined>

In [5]:
import pandas as pd

def append_key_from_dict(df, words_ods):
    """
    Appends only to that line the key from a dict that is words ods
    :param df: DataFrame with columns 'palavras chave' and 'key'
    :param words_ods: dict with ods as keys and list of words as values
    :return: DataFrame with appended 'key' column
    """

    #create a ods column with specifiyng type object without any values
    df['ODS'] = None

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        for ods, words in words_ods.items():
            for word in words:
                # Check if the word is in the 'palavras chave' column
                if word in row['palavras_chave'] or word in row['resumo']:
                    # Append the key to the 'key' column
                    df.at[index, 'ODS'] = ods
                    # Break out of the inner loop since the key is found
                    break
            else:
                # If the word is not found in the row, continue to the next row
                continue
            # If the key is found, break out of the outer loop as well
            break

    return df

df_ods = append_key_from_dict(df, ods_words)

In [17]:
nan_count = df_ods['ODS'].isna().sum()

# Count the non-null values
value_counts = df_ods['ODS'].value_counts()

sum = df_ods['ODS'].value_counts().sum()

print(f'NaN count: {nan_count}')
print('Value counts:')
print(value_counts)
print('Sum:', sum)

NaN count: 5105
Value counts:
ODS
[ods10]                       1039
[ods16]                        690
[ods10, ods16]                 202
[ods3]                         162
[ods4]                         123
                              ... 
[ods8, ods11, ods12]             1
[ods4, ods8, ods10, ods16]       1
[ods2, ods14]                    1
[ods5, ods11, ods16]             1
[ods8, ods9, ods10]              1
Name: count, Length: 104, dtype: int64
Sum: 2987
