In [25]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
import ast
import numpy as np
from tqdm import tqdm

In [26]:
# load the HuggingFace dataset "gips-mai/enc_descr" (encoded image descriptions);
# the result is indexed by split name further down (ds_enc_desc[split])
ds_enc_desc = load_dataset("gips-mai/enc_descr")

In [27]:
# load the HuggingFace dataset "gips-mai/enc_img" (encoded images);
# the result is indexed by split name further down (ds_enc_img[split])
ds_enc_img = load_dataset("gips-mai/enc_img")

In [28]:
# merges two df with the image description and image encodings on img_id
def merge_encoding(desc_df, img_df):
    """Left-join description encodings with image encodings on ``img_id``.

    Both inputs carry a column named ``encoding``. Each one is renamed
    (``desc_encoding`` / ``img_encoding``) so the two columns stay
    distinguishable after the merge. The renames are done on copies, so the
    DataFrames passed in by the caller are not modified.

    Parameters
    ----------
    desc_df : pandas.DataFrame
        Frame with an ``img_id`` column and the description ``encoding``.
    img_df : pandas.DataFrame
        Frame with an ``img_id`` column and the image ``encoding``.

    Returns
    -------
    pandas.DataFrame
        Every row of ``desc_df`` (left join) with the matching
        ``img_encoding`` attached; rows without a match get a missing value.
    """
    # rename() without inplace returns a new frame and leaves the argument as-is
    desc_renamed = desc_df.rename(columns={"encoding": "desc_encoding"})
    img_renamed = img_df.rename(columns={"encoding": "img_encoding"})

    return desc_renamed.merge(img_renamed, how="left", on="img_id")

       
# identifiers of the splits that are read from both encoded datasets
splits = ["00", "01", "02", "03", "04"]

# one merged DataFrame per split, stored in the same order as `splits`
dfs_list = []
for split in tqdm(splits):
    # materialise the current split of each dataset as a pandas DataFrame
    desc_df = pd.DataFrame(ds_enc_desc[split])
    img_df = pd.DataFrame(ds_enc_img[split])
    # join description and image encodings on img_id and keep the result
    dfs_list.append(merge_encoding(desc_df, img_df))
       

100%|██████████| 5/5 [01:31<00:00, 18.37s/it]


In [30]:
# inspect the merged frame of the first split ("00")
dfs_list[0]

Unnamed: 0,img_id,resp,desc_encoding,img_encoding
0,832038657676498.jpg,The image shows a scenic mountain road with a ...,"[-0.032639890909194946, 0.058135829865932465, ...","[0.59504234790802, -0.30037447810173035, -0.45..."
1,764901171179197.jpg,This image depicts a street scene with various...,"[-0.029522016644477844, 0.01743047498166561, 0...","[0.851909339427948, 0.5743611454963684, 1.1950..."
2,1586299104896708.jpg,The image shows a two-lane road with a car dri...,"[-0.024870363995432854, 0.03592754155397415, -...","[0.7949824929237366, 0.3025827705860138, 0.685..."
3,776770333209553.jpg,The image shows a two-lane road with a clear s...,"[-0.0036864662542939186, 0.04019076004624367, ...","[0.2773040533065796, -0.1700619012117386, 0.20..."
4,306256421202453.jpg,The image shows a multi-lane road with several...,"[0.004503166303038597, 0.04145127907395363, -0...","[0.435570627450943, 0.8325627446174622, 0.8094..."
...,...,...,...,...
1003,2838956239753806.jpg,The image shows a highway with a clear blue sk...,"[-0.04738913103938103, 0.046458445489406586, -...","[0.8999408483505249, -0.03346928581595421, -0...."
1004,165938738786151.jpg,The image shows a snowy forest with bare trees...,"[-0.016818419098854065, -0.003069913247600198,...","[0.29846423864364624, 1.655393362045288, -0.73..."
1005,2104121973099956.jpg,This image shows a serene residential street s...,"[-0.05949201434850693, 0.055988047271966934, -...","[-0.08558104932308197, 0.045043960213661194, -..."
1006,381282286444714.jpg,The image shows a clear day with a blue sky an...,"[-0.022491633892059326, -0.011690421029925346,...","[-0.8761512041091919, 0.2191329002380371, 1.75..."


In [31]:
# load training data
# NOTE(review): the echoed `df_train = pd.read_csv("train.csv")` line in this cell's output
# looks like a pandas DtypeWarning (mixed-type columns) — confirm, and consider an explicit `dtype=`.
# NOTE(review): by default read_csv parses the literal string "NA" as NaN; if the `country`
# column can contain Namibia's ISO2 code "NA", it is lost here — verify against the data.
df_train = pd.read_csv("train.csv")

  df_train = pd.read_csv("train.csv")


In [32]:
# load one-hot encoding of ISO2 country codes
# NOTE(review): by default read_csv parses the literal string "NA" as NaN; if the ISO2 column
# contains Namibia's code "NA", its key becomes NaN here — verify against encodings.csv.
df_one_hot_enc = pd.read_csv("encodings.csv")

In [33]:
# display the loaded training DataFrame
df_train

Unnamed: 0,id,latitude,longitude,thumb_original_url,country,sequence,captured_at,lon_bin,lat_bin,cell,...,quadtree_10_50000,quadtree_10_12500,quadtree_10_500,quadtree_10_2500,unique_region,unique_sub-region,unique_city,unique_country,creator_username,creator_id
0,3859149887465501,-43.804769,-176.614093,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,1qOCfogkQ5uE4ZvEf_4n2A,1547559856000,0,8,"(0, 8)",...,0,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14
1,574181207305439,-43.796611,-176.660483,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,5rdtj6tui6qdlg8q812ul2,1567257056000,0,8,"(0, 8)",...,0,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14
2,333574322129026,-43.818092,-176.578383,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,nJcst1M2bFUxSOA54CZiy9,1531231715132,0,8,"(0, 8)",...,0,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14
3,636305258168031,-44.052910,-176.633065,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,BGSwcf0pLCE5bMUWdKTeqk,1662645171414,0,8,"(0, 8)",...,0,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14
4,166741299029322,-43.748077,-176.329626,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,05pt8HKnLCf47UTGPrxuzB,1531143464444,0,8,"(0, 8)",...,0,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4894679,489159948862242,62.024837,129.735016,https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...,RU,ObgY4h3IqYuUf05VJrGc2g,1497516126073,85,86,"(85, 86)",...,267,1064,18200,5303,Sakha_RU,,Yakutsk_NaN_Sakha_RU,RU,trolleway,1.107632e+14
4894680,1397563823943668,62.018933,129.708063,https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...,RU,PeXx-YY90BaV1kVEOuxFqg,1518138740819,85,86,"(85, 86)",...,267,1064,18200,5303,Sakha_RU,,Yakutsk_NaN_Sakha_RU,RU,vilena_ykt,1.040235e+14
4894681,821208361940118,62.024479,129.724387,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,RU,l-zUaM4G-7ybu94WRkoqvw,1497517593320,85,86,"(85, 86)",...,267,1064,18200,5303,Sakha_RU,,Yakutsk_NaN_Sakha_RU,RU,trolleway,1.107632e+14
4894682,333090098234507,62.219881,133.268821,https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...,RU,uhGSMU2PCmtpdNoeYParhw,1435317097000,86,86,"(86, 86)",...,267,1064,18200,5303,Sakha_RU,,Ytyk-Kyuyel'_NaN_Sakha_RU,RU,iovan,1.016051e+14


In [34]:
# the CSV stores each encoding as text; parse every entry back into a Python list
parsed_encodings = df_one_hot_enc['encoding'].map(ast.literal_eval)
df_one_hot_enc['encoding'] = parsed_encodings

# lookup table: ISO2 code -> one-hot encoding
one_hot_enc_dict = dict(zip(df_one_hot_enc['ISO2'], df_one_hot_enc['encoding']))

# maps ISO2 codes to their respective encodings
def map_iso2_to_encoding(iso2):
    """Look up the one-hot encoding for ``iso2`` and wrap it in a list.

    The lookup uses the module-level ``one_hot_enc_dict``. A code that is not
    a key of that dict yields ``[None]`` instead of raising.
    """
    encoding = one_hot_enc_dict.get(iso2)
    return [encoding]



In [35]:
# attach the one-hot encoding of each row's country to df_train as 'country_one_hot_enc'
country_encodings = df_train['country'].map(map_iso2_to_encoding)
df_train['country_one_hot_enc'] = country_encodings

In [36]:
# display df_train again to check the new 'country_one_hot_enc' column
df_train

Unnamed: 0,id,latitude,longitude,thumb_original_url,country,sequence,captured_at,lon_bin,lat_bin,cell,...,quadtree_10_12500,quadtree_10_500,quadtree_10_2500,unique_region,unique_sub-region,unique_city,unique_country,creator_username,creator_id,country_one_hot_enc
0,3859149887465501,-43.804769,-176.614093,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,1qOCfogkQ5uE4ZvEf_4n2A,1547559856000,0,8,"(0, 8)",...,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,574181207305439,-43.796611,-176.660483,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,5rdtj6tui6qdlg8q812ul2,1567257056000,0,8,"(0, 8)",...,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,333574322129026,-43.818092,-176.578383,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,nJcst1M2bFUxSOA54CZiy9,1531231715132,0,8,"(0, 8)",...,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,636305258168031,-44.052910,-176.633065,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,BGSwcf0pLCE5bMUWdKTeqk,1662645171414,0,8,"(0, 8)",...,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,166741299029322,-43.748077,-176.329626,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,NZ,05pt8HKnLCf47UTGPrxuzB,1531143464444,0,8,"(0, 8)",...,0,0,0,Chatham Islands_NZ,,Waitangi_NaN_Chatham Islands_NZ,NZ,roadroid,1.113362e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4894679,489159948862242,62.024837,129.735016,https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...,RU,ObgY4h3IqYuUf05VJrGc2g,1497516126073,85,86,"(85, 86)",...,1064,18200,5303,Sakha_RU,,Yakutsk_NaN_Sakha_RU,RU,trolleway,1.107632e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4894680,1397563823943668,62.018933,129.708063,https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...,RU,PeXx-YY90BaV1kVEOuxFqg,1518138740819,85,86,"(85, 86)",...,1064,18200,5303,Sakha_RU,,Yakutsk_NaN_Sakha_RU,RU,vilena_ykt,1.040235e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4894681,821208361940118,62.024479,129.724387,https://scontent-cdg4-1.xx.fbcdn.net/m1/v/t6/A...,RU,l-zUaM4G-7ybu94WRkoqvw,1497517593320,85,86,"(85, 86)",...,1064,18200,5303,Sakha_RU,,Yakutsk_NaN_Sakha_RU,RU,trolleway,1.107632e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4894682,333090098234507,62.219881,133.268821,https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...,RU,uhGSMU2PCmtpdNoeYParhw,1435317097000,86,86,"(86, 86)",...,1064,18200,5303,Sakha_RU,,Ytyk-Kyuyel'_NaN_Sakha_RU,RU,iovan,1.016051e+14,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [37]:
# extracts image id from given image file name by removing the file extension.
def get_img_id(img_name):
    """Return the numeric image id encoded in an image file name.

    Only the trailing extension (everything after the last ".") is removed,
    so the result does not depend on the extension's spelling or case
    (".jpg", ".JPG", ".jpeg", ...). A name without any "." is converted as-is.

    Parameters
    ----------
    img_name : str
        File name such as ``"832038657676498.jpg"``.

    Returns
    -------
    numpy.int64
        The part before the extension, converted to an integer.

    Raises
    ------
    ValueError
        If the part before the extension is not an integer literal.
    """
    stem, dot, _extension = img_name.rpartition(".")
    # rpartition returns ("", "", img_name) when the name contains no "."
    return np.int64(stem if dot else img_name)


In [38]:
# Create a DatasetDict with one HuggingFace Dataset per split: each merged
# description/image-encoding df is joined with the training df on 'id'.

full_dataset = DatasetDict()
# iterate over each split identifier and corresponding DataFrame;
# `total` is passed explicitly because zip() has no length, so tqdm could not draw a bar otherwise.
# The loop variable is named `split_name` (not `id`) so the builtin id() is not shadowed in the notebook namespace.
for split_name, split in tqdm(zip(splits, dfs_list), total=len(splits)):
    
    # extract image id by removing file extension and converting to integer
    # (this adds an 'id' column to the frame stored in dfs_list)
    split["id"] = split["img_id"].apply(get_img_id)
    # left join: every encoded image is kept, training columns are missing where no 'id' matches
    mergesplit = split.merge(df_train, how="left", on="id")
    # convert merged df to a HuggingFace Dataset and add it to DatasetDict under its split identifier
    full_dataset[split_name] = Dataset.from_pandas(mergesplit)

5it [00:43,  8.79s/it]


In [None]:
# display the assembled DatasetDict
full_dataset

In [40]:
# read the HuggingFace access token from the environment (after load_dotenv() has run)
import os

from dotenv import load_dotenv

load_dotenv()
HF_AUTH_TOKEN = os.environ.get("HF_AUTH_TOKEN")

In [None]:
# push created HuggingFace Dataset to the HuggingFace Hub repository "gips-mai/osv5m_ann"
# (HF_AUTH_TOKEN is None when the environment variable is not set)
full_dataset.push_to_hub("gips-mai/osv5m_ann", token=HF_AUTH_TOKEN)