# Data preprocessing

Author: CHEN Ee Heng  
Date: 29.08.2023  

In [1]:
# CONSTANTS + VARIABLES

DATA_PATH = "../data/public_maps.csv"

LABEL_MAPPING_PATH = "../data/label_id.json"

TRANSLATED_TEXT_PATH = "../data/translated_text_googletrans.csv"

SAVE_PROCESSED_DATA = True
PROCESSED_DATA_PATH = "../data/public_maps_processed.csv"

In [2]:
import os
import json
import pandas as pd

In [3]:
# Raw data
df = pd.read_csv(DATA_PATH, delimiter=",")


# Reindexing
def remap_dict(column: str, start_idx=0) -> dict:
    return {k: v for v, k in enumerate(set(df[column].tolist()),
                                       start_idx)}


id_remap = remap_dict('idea_id', 2)
map_category_name_remap = remap_dict('map_category_name', 0)
df['map_category_id'] = df['map_category_name']
df = df.replace({
    'map_id': id_remap,
    'map_category_id': map_category_name_remap,
    'idea_id': id_remap,
    'idea_parent_id': id_remap,
})

# Replace NAN with 1
df['idea_parent_id'] = df['idea_parent_id'].fillna(1)

# Duplicate check and removal
df_dups = df[
    df.duplicated(
        subset=['map_id', 'map_title', 'map_rating',
                'map_category_name', 'idea_parent_id', 'idea_title'],
        keep=False
    )
]
non_parent_dups = df_dups[~df_dups['idea_id'].isin(df['idea_parent_id'])]
df = df.drop(non_parent_dups.index)

# Feature Enginnering 1 : Add idea parent depth
df['idea_parent_depth'] = 0
df_tmp = df.copy()
while True:
    df_tmp = df_tmp[df_tmp['idea_parent_id'].isin(df_tmp['idea_id'])]
    if df_tmp.index.empty:
        break
    df.loc[df_tmp.index, 'idea_parent_depth'] += 1

# Feature Enginnering 2 : Add map count, number of idea per map.
df['idea_per_map_count'] = 1
df['idea_per_map_count'] = df.groupby(
    'map_id')['idea_per_map_count'].transform('count')

# Feature Enginnering 3 : Add map title text count
df['map_title_count'] = 0
df['map_title_count'] = df['map_title'].str.len()

# Feature Enginnering 4 : Add idea title text count
df['idea_title_count'] = 0
df['idea_title_count'] = df['idea_title'].str.len()

df

Unnamed: 0,map_id,map_title,map_rating,map_category_name,idea_id,idea_parent_id,idea_title,map_category_id,idea_parent_depth,idea_per_map_count,map_title_count,idea_title_count
0,6782,PFK/Risikomanagement,50,Business,6782,1.0,My first mindmap,4,0,13,20,16
1,6782,PFK/Risikomanagement,50,Business,6783,6782.0,Ideas for my novel ...,4,1,13,20,22
2,6782,PFK/Risikomanagement,50,Business,6784,6782.0,Welcome again!,4,1,13,20,14
3,6782,PFK/Risikomanagement,50,Business,6785,6784.0,We hope you\'ll have fun\nwith MindMeister ...,4,2,13,20,46
4,6782,PFK/Risikomanagement,50,Business,6786,6784.0,... and some great ideas too!,4,2,13,20,29
...,...,...,...,...,...,...,...,...,...,...,...,...
13555,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1271,1268.0,Medical/Wellness,3,2,13,31,16
13556,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1274,1268.0,Lifestyle/Non-Medical,3,2,13,31,21
13557,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2054,1136.0,Training Program,3,1,13,31,16
13558,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2055,2054.0,Contracted Leaders,3,2,13,31,18


## Google translate

In [4]:
# Translate to english (~27min for 9k texts)

if not os.path.exists(TRANSLATED_TEXT_PATH):

    from googletrans import Translator

    translator = Translator()

    with open(TRANSLATED_TEXT_PATH, "w", encoding="utf-8") as f:

        f.write(f"src,dst\n")

        def write_out(_text) -> None:
            trans = translator.translate(_text, dest='en', src='auto')
            f.write(f"{trans.origin.replace(',', 'COMMAHERE')},"
                    f"{trans.text.replace(',', 'COMMAHERE')}"
                    f"\n")

        df_tmp = pd.concat([df['map_title'], df['idea_title']])
        df_tmp = df_tmp.drop_duplicates()
        df_tmp = df_tmp.apply(write_out)

translation_df = pd.read_csv(TRANSLATED_TEXT_PATH, delimiter=",")
translation_df = translation_df.replace('COMMAHERE', ',', regex=True)
remap_dict = pd.Series(translation_df.dst.values,
                       index=translation_df.src.values).to_dict()
df['map_title_en'] = df['map_title']
df['idea_title_en'] = df['idea_title']
df = df.replace({
    'map_title_en': remap_dict,
    'idea_title_en': remap_dict,
})
df['map_title_en'] = df['map_title_en'].fillna('NA')
df['idea_title_en'] = df['idea_title_en'].fillna('NA')
df

Unnamed: 0,map_id,map_title,map_rating,map_category_name,idea_id,idea_parent_id,idea_title,map_category_id,idea_parent_depth,idea_per_map_count,map_title_count,idea_title_count,map_title_en,idea_title_en
0,6782,PFK/Risikomanagement,50,Business,6782,1.0,My first mindmap,4,0,13,20,16,PFK/risk management,My first mindmap
1,6782,PFK/Risikomanagement,50,Business,6783,6782.0,Ideas for my novel ...,4,1,13,20,22,PFK/risk management,Ideas for my novel ...
2,6782,PFK/Risikomanagement,50,Business,6784,6782.0,Welcome again!,4,1,13,20,14,PFK/risk management,Welcome again!
3,6782,PFK/Risikomanagement,50,Business,6785,6784.0,We hope you\'ll have fun\nwith MindMeister ...,4,2,13,20,46,PFK/risk management,We hope you'll have fun\with MindMeister ...
4,6782,PFK/Risikomanagement,50,Business,6786,6784.0,... and some great ideas too!,4,2,13,20,29,PFK/risk management,... and some great ideas too!
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13555,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1271,1268.0,Medical/Wellness,3,2,13,31,16,TOWARDS A DIALOGUE\rPEDAGOGY,Medical/Wellness
13556,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1274,1268.0,Lifestyle/Non-Medical,3,2,13,31,21,TOWARDS A DIALOGUE\rPEDAGOGY,Lifestyle/Non-Medical
13557,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2054,1136.0,Training Program,3,1,13,31,16,TOWARDS A DIALOGUE\rPEDAGOGY,Training Program
13558,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2055,2054.0,Contracted Leaders,3,2,13,31,18,TOWARDS A DIALOGUE\rPEDAGOGY,Contracted Leaders


## English BERT - 110M Params

In [5]:
# # Testing BERT
# from transformers import BertTokenizer
# from transformers import BertModel

# text = "This is a sample sentence to sanity check the model."

# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# model = BertModel.from_pretrained("bert-base-cased")

# encoded_input = tokenizer(text, return_tensors='pt')

# decoded_text = tokenizer.decode(encoded_input.data['input_ids'].tolist()[0],
#                                 skip_special_tokens=True,
#                                 clean_up_tokenization_spaces=True,
#                                 spaces_between_special_tokens=True)

# output = model(**encoded_input)

# print("Original text     :", text)
# print("Decoded tokens    :", decoded_text)
# print("last_hidden_state :", output.last_hidden_state.shape)
# print("pooler_output     :", output.pooler_output.shape)
# # print(output.attentions.shape)
# # print(output.cross_attentions.shape)

In [6]:
# Encode text with BERT (~20min for 13500 row texts)

from transformers import BertTokenizer
from transformers import BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained("bert-base-cased")


def tokenize(x):
    return tokenizer.encode(x)


def encode(x):
    encoded_input = tokenizer(x, return_tensors='pt')
    output = model(**encoded_input)
    return output.pooler_output[0].tolist()


df['map_title_en_tok_bert'] = df['map_title_en'].apply(tokenize)
df['map_title_en_emb_bert'] = df['map_title_en'].apply(encode)

df['idea_title_en_tok_bert'] = df['idea_title_en'].apply(tokenize)
df['idea_title_en_emb_bert'] = df['idea_title_en'].apply(encode)

df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,map_id,map_title,map_rating,map_category_name,idea_id,idea_parent_id,idea_title,map_category_id,idea_parent_depth,idea_per_map_count,map_title_count,idea_title_count,map_title_en,idea_title_en,map_title_en_tok_bert,map_title_en_emb_bert,idea_title_en_tok_bert,idea_title_en_emb_bert
0,6782,PFK/Risikomanagement,50,Business,6782,1.0,My first mindmap,4,0,13,20,16,PFK/risk management,My first mindmap,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 1422, 1148, 1713, 1918, 1643, 102]","[-0.6327304244041443, 0.407027930021286, 0.999..."
1,6782,PFK/Risikomanagement,50,Business,6783,6782.0,Ideas for my novel ...,4,1,13,20,22,PFK/risk management,Ideas for my novel ...,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 146, 18747, 1116, 1111, 1139, 2281, 119,...","[-0.7442804574966431, 0.5349408388137817, 0.99..."
2,6782,PFK/Risikomanagement,50,Business,6784,6782.0,Welcome again!,4,1,13,20,14,PFK/risk management,Welcome again!,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 12050, 1254, 106, 102]","[-0.836165726184845, 0.5389016270637512, 0.999..."
3,6782,PFK/Risikomanagement,50,Business,6785,6784.0,We hope you\'ll have fun\nwith MindMeister ...,4,2,13,20,46,PFK/risk management,We hope you'll have fun\with MindMeister ...,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 1284, 2810, 1128, 112, 1325, 1138, 4106,...","[-0.7351182103157043, 0.49911975860595703, 0.9..."
4,6782,PFK/Risikomanagement,50,Business,6786,6784.0,... and some great ideas too!,4,2,13,20,29,PFK/risk management,... and some great ideas too!,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 119, 119, 119, 1105, 1199, 1632, 4133, 1...","[-0.7741515040397644, 0.5312414765357971, 0.99..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13555,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1271,1268.0,Medical/Wellness,3,2,13,31,16,TOWARDS A DIALOGUE\rPEDAGOGY,Medical/Wellness,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 3875, 120, 2119, 1757, 102]","[-0.8046272397041321, 0.5268841981887817, 0.99..."
13556,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1274,1268.0,Lifestyle/Non-Medical,3,2,13,31,21,TOWARDS A DIALOGUE\rPEDAGOGY,Lifestyle/Non-Medical,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 2583, 19994, 120, 7922, 118, 3875, 102]","[-0.7953608632087708, 0.5196728110313416, 0.99..."
13557,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2054,1136.0,Training Program,3,1,13,31,16,TOWARDS A DIALOGUE\rPEDAGOGY,Training Program,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 5513, 4659, 102]","[-0.771316409111023, 0.4543556272983551, 0.999..."
13558,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2055,2054.0,Contracted Leaders,3,2,13,31,18,TOWARDS A DIALOGUE\rPEDAGOGY,Contracted Leaders,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 16752, 16550, 20880, 102]","[-0.7636327147483826, 0.55877685546875, 0.9999..."


## Multilingual BERT - 110M Params

In [7]:
# Encode multilingual text with BERT (~20min for 13500 row texts)

from transformers import BertTokenizer
from transformers import BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased")


def tokenize(x):
    return tokenizer.encode(x)


def encode(x):
    encoded_input = tokenizer(x, return_tensors='pt')
    output = model(**encoded_input)
    return output.pooler_output[0].tolist()


df['map_title_tok_bert'] = df['map_title'].apply(tokenize)
df['map_title_emb_bert'] = df['map_title'].apply(encode)

df['idea_title_tok_bert'] = df['idea_title'].apply(tokenize)
df['idea_title_emb_bert'] = df['idea_title'].apply(encode)

df

Unnamed: 0,map_id,map_title,map_rating,map_category_name,idea_id,idea_parent_id,idea_title,map_category_id,idea_parent_depth,idea_per_map_count,...,map_title_en,idea_title_en,map_title_en_tok_bert,map_title_en_emb_bert,idea_title_en_tok_bert,idea_title_en_emb_bert,map_title_tok_bert,map_title_emb_bert,idea_title_tok_bert,idea_title_emb_bert
0,6782,PFK/Risikomanagement,50,Business,6782,1.0,My first mindmap,4,0,13,...,PFK/risk management,My first mindmap,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 1422, 1148, 1713, 1918, 1643, 102]","[-0.6327304244041443, 0.407027930021286, 0.999...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 11590, 10422, 21133, 10369, 10410, 102]","[0.235234335064888, -0.12797075510025024, 0.36..."
1,6782,PFK/Risikomanagement,50,Business,6783,6782.0,Ideas for my novel ...,4,1,13,...,PFK/risk management,Ideas for my novel ...,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 146, 18747, 1116, 1111, 1139, 2281, 119,...","[-0.7442804574966431, 0.5349408388137817, 0.99...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 100426, 10142, 15127, 14671, 119, 119, 1...","[0.2957175374031067, 0.0010778368450701237, 0...."
2,6782,PFK/Risikomanagement,50,Business,6784,6782.0,Welcome again!,4,1,13,...,PFK/risk management,Welcome again!,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 12050, 1254, 106, 102]","[-0.836165726184845, 0.5389016270637512, 0.999...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 39728, 13123, 106, 102]","[0.2114391028881073, 0.013439947739243507, 0.2..."
3,6782,PFK/Risikomanagement,50,Business,6785,6784.0,We hope you\'ll have fun\nwith MindMeister ...,4,2,13,...,PFK/risk management,We hope you'll have fun\with MindMeister ...,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 1284, 2810, 1128, 112, 1325, 1138, 4106,...","[-0.7351182103157043, 0.49911975860595703, 0.9...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 12865, 50725, 13028, 165, 112, 22469, 10...","[0.28184330463409424, -0.046994708478450775, 0..."
4,6782,PFK/Risikomanagement,50,Business,6786,6784.0,... and some great ideas too!,4,2,13,...,PFK/risk management,... and some great ideas too!,"[101, 153, 2271, 2428, 120, 3187, 2635, 102]","[-0.6748764514923096, 0.46971118450164795, 0.9...","[101, 119, 119, 119, 1105, 1199, 1632, 4133, 1...","[-0.7741515040397644, 0.5312414765357971, 0.99...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 119, 119, 119, 10111, 11152, 14772, 2380...","[0.39841699600219727, -0.05927441269159317, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13555,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1271,1268.0,Medical/Wellness,3,2,13,...,TOWARDS A DIALOGUE\rPEDAGOGY,Medical/Wellness,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 3875, 120, 2119, 1757, 102]","[-0.8046272397041321, 0.5268841981887817, 0.99...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 18363, 120, 37025, 14010, 102]","[0.2968033254146576, -0.031075352802872658, 0...."
13556,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1274,1268.0,Lifestyle/Non-Medical,3,2,13,...,TOWARDS A DIALOGUE\rPEDAGOGY,Lifestyle/Non-Medical,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 2583, 19994, 120, 7922, 118, 3875, 102]","[-0.7953608632087708, 0.5196728110313416, 0.99...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 10337, 67625, 120, 14890, 118, 18363, 102]","[0.3152565658092499, -0.025143641978502274, 0...."
13557,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2054,1136.0,Training Program,3,1,13,...,TOWARDS A DIALOGUE\rPEDAGOGY,Training Program,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 5513, 4659, 102]","[-0.771316409111023, 0.4543556272983551, 0.999...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 26442, 13715, 102]","[0.04530977085232735, -0.1205960065126419, 0.1..."
13558,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2055,2054.0,Contracted Leaders,3,2,13,...,TOWARDS A DIALOGUE\rPEDAGOGY,Contracted Leaders,"[101, 16972, 11840, 23354, 1708, 138, 141, 998...","[-0.5438150763511658, 0.37422019243240356, 0.9...","[101, 16752, 16550, 20880, 102]","[-0.7636327147483826, 0.55877685546875, 0.9999...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 62046, 27756, 37122, 10107, 102]","[0.16184566915035248, -0.07991621643304825, 0...."


### Multilingual M2M100 - 418M Params

In [8]:
# Encode multilingual text with M2M100 (~25min for 13500 row texts)

from transformers import M2M100Tokenizer
from transformers import M2M100ForConditionalGeneration

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
model = model.get_encoder()

def tokenize(x):
    return tokenizer.encode(x)


def encode(x):
    encoded_input = tokenizer(x, return_tensors='pt')
    output = model(**encoded_input)
    return output.last_hidden_state.mean(axis=1)[0].tolist()

df['map_title_tok_m2m'] = df['map_title'].apply(tokenize)
df['map_title_emb_m2m'] = df['map_title'].apply(encode)

df['idea_title_tok_m2m'] = df['idea_title'].apply(tokenize)
df['idea_title_emb_m2m'] = df['idea_title'].apply(encode)

df

Unnamed: 0,map_id,map_title,map_rating,map_category_name,idea_id,idea_parent_id,idea_title,map_category_id,idea_parent_depth,idea_per_map_count,...,idea_title_en_tok_bert,idea_title_en_emb_bert,map_title_tok_bert,map_title_emb_bert,idea_title_tok_bert,idea_title_emb_bert,map_title_tok_m2m,map_title_emb_m2m,idea_title_tok_m2m,idea_title_emb_m2m
0,6782,PFK/Risikomanagement,50,Business,6782,1.0,My first mindmap,4,0,13,...,"[101, 1422, 1148, 1713, 1918, 1643, 102]","[-0.6327304244041443, 0.407027930021286, 0.999...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 11590, 10422, 21133, 10369, 10410, 102]","[0.235234335064888, -0.12797075510025024, 0.36...","[128022, 132, 506, 358, 57, 468, 51136, 12812,...","[0.12304291874170303, -0.2791493535041809, 0.0...","[128022, 6384, 42449, 9963, 72749, 2]","[-0.0812792256474495, 0.1837313175201416, 0.01..."
1,6782,PFK/Risikomanagement,50,Business,6783,6782.0,Ideas for my novel ...,4,1,13,...,"[101, 146, 18747, 1116, 1111, 1139, 2281, 119,...","[-0.7442804574966431, 0.5349408388137817, 0.99...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 100426, 10142, 15127, 14671, 119, 119, 1...","[0.2957175374031067, 0.0010778368450701237, 0....","[128022, 132, 506, 358, 57, 468, 51136, 12812,...","[0.12304291874170303, -0.2791493535041809, 0.0...","[128022, 18622, 46, 193, 1949, 58012, 10, 2]","[-0.05067373439669609, -0.2776373624801636, 0...."
2,6782,PFK/Risikomanagement,50,Business,6784,6782.0,Welcome again!,4,1,13,...,"[101, 12050, 1254, 106, 102]","[-0.836165726184845, 0.5389016270637512, 0.999...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 39728, 13123, 106, 102]","[0.2114391028881073, 0.013439947739243507, 0.2...","[128022, 132, 506, 358, 57, 468, 51136, 12812,...","[0.12304291874170303, -0.2791493535041809, 0.0...","[128022, 108938, 119164, 30, 2]","[-0.3350827693939209, 0.3044120669364929, 0.44..."
3,6782,PFK/Risikomanagement,50,Business,6785,6784.0,We hope you\'ll have fun\nwith MindMeister ...,4,2,13,...,"[101, 1284, 2810, 1128, 112, 1325, 1138, 4106,...","[-0.7351182103157043, 0.49911975860595703, 0.9...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 12865, 50725, 13028, 165, 112, 22469, 10...","[0.28184330463409424, -0.046994708478450775, 0...","[128022, 132, 506, 358, 57, 468, 51136, 12812,...","[0.12304291874170303, -0.2791493535041809, 0.0...","[128022, 3986, 117965, 8251, 10034, 12, 2279, ...","[-0.3275381922721863, 0.2549442946910858, -0.2..."
4,6782,PFK/Risikomanagement,50,Business,6786,6784.0,... and some great ideas too!,4,2,13,...,"[101, 119, 119, 119, 1105, 1199, 1632, 4133, 1...","[-0.7741515040397644, 0.5312414765357971, 0.99...","[101, 153, 88263, 120, 155, 14553, 12910, 1163...","[0.28258460760116577, -0.02321775071322918, 0....","[101, 119, 119, 119, 10111, 11152, 14772, 2380...","[0.39841699600219727, -0.05927441269159317, 0....","[128022, 132, 506, 358, 57, 468, 51136, 12812,...","[0.12304291874170303, -0.2791493535041809, 0.0...","[128022, 10, 1019, 93085, 104187, 63210, 38414...","[0.0018320721574127674, -0.008347602561116219,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13555,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1271,1268.0,Medical/Wellness,3,2,13,...,"[101, 3875, 120, 2119, 1757, 102]","[-0.8046272397041321, 0.5268841981887817, 0.99...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 18363, 120, 37025, 14010, 102]","[0.2968033254146576, -0.031075352802872658, 0....","[128022, 176, 5010, 2613, 87014, 132, 3390, 45...","[-0.37561386823654175, -0.23259896039962769, 0...","[128022, 96355, 57, 590, 748, 18689, 2]","[-0.38128915429115295, 0.18158744275569916, -0..."
13556,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,1274,1268.0,Lifestyle/Non-Medical,3,2,13,...,"[101, 2583, 19994, 120, 7922, 118, 3875, 102]","[-0.7953608632087708, 0.5196728110313416, 0.99...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 10337, 67625, 120, 14890, 118, 18363, 102]","[0.3152565658092499, -0.025143641978502274, 0....","[128022, 176, 5010, 2613, 87014, 132, 3390, 45...","[-0.37561386823654175, -0.23259896039962769, 0...","[128022, 87549, 92780, 57, 73073, 7, 61712, 11...","[-0.3160032629966736, -0.24600836634635925, -0..."
13557,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2054,1136.0,Training Program,3,1,13,...,"[101, 5513, 4659, 102]","[-0.771316409111023, 0.4543556272983551, 0.999...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 26442, 13715, 102]","[0.04530977085232735, -0.1205960065126419, 0.1...","[128022, 176, 5010, 2613, 87014, 132, 3390, 45...","[-0.37561386823654175, -0.23259896039962769, 0...","[128022, 66344, 8700, 2]","[0.1791716367006302, -0.3687453269958496, -0.7..."
13558,1136,HACIA UNA PEDAGOGIA\rDIALOGANTE,50,Other,2055,2054.0,Contracted Leaders,3,2,13,...,"[101, 16752, 16550, 20880, 102]","[-0.7636327147483826, 0.55877685546875, 0.9999...","[101, 145, 30340, 35976, 26578, 10738, 80468, ...","[0.40213742852211, -0.058702077716588974, 0.01...","[101, 62046, 27756, 37122, 10107, 102]","[0.16184566915035248, -0.07991621643304825, 0....","[128022, 176, 5010, 2613, 87014, 132, 3390, 45...","[-0.37561386823654175, -0.23259896039962769, 0...","[128022, 112465, 115368, 117573, 397, 2]","[-0.501156747341156, -0.465686172246933, 0.777..."


In [9]:
if SAVE_PROCESSED_DATA:
    with open(LABEL_MAPPING_PATH, 'w') as f:
        json.dump(map_category_name_remap, f)
    df.to_csv(PROCESSED_DATA_PATH)