In [1]:
import os
import json
import matplotlib
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold

In [2]:
# Directory holding the raw input CSVs. The trailing slash is required because
# file names are appended to this value by plain string concatenation.
DATA_PATH = "../raw_data/"

In [10]:
# Read the three raw tables, in this order, from DATA_PATH.
topics, content, correlations = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("topics.csv", "content.csv", "correlations.csv")
)

# Only topics flagged has_content == True are kept for the split.
topics = topics.loc[topics["has_content"].eq(True)]

topics.head()

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
5,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True


## Split

In [11]:
# Topics with category == "source" do not occur in the hidden test set, so all
# of them are assigned to the training fold.
#
# .loc selects the rows and the single "id" column in one step, and .assign
# returns a new frame with the extra column. This avoids adding a column to the
# result of chained indexing (topics[mask][["id"]]), which is the pattern pandas
# reports with SettingWithCopyWarning. The original row index is preserved.
topics_train = (
    topics.loc[topics["category"] == "source", ["id"]]
    .assign(fold="train")
)

topics_train.head()

Unnamed: 0,id,fold
0,t_00004da3a1b2,train
2,t_00068291e9a4,train
3,t_00069b63a70a,train
4,t_0006d41a73a8,train
9,t_000feba42136,train


In [13]:
# Candidate pool for the holdout: every topic whose category is not "source",
# re-indexed from 0 so positional and label indices coincide.
topics_val = topics.loc[topics["category"] != "source"].reset_index(drop=True)
print('topics_val', topics_val.head())

# Seeded, shuffled 4-fold stratified group splitter.
sgkf = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=1773)

# Stratify on language and group on id, then keep only the first
# (train_positions, test_positions) pair produced by the splitter.
# NOTE(review): if every row has a distinct id, grouping by id adds no extra
# constraint beyond the stratification - confirm this is intended.
split_idxs = next(
    sgkf.split(
        topics_val["id"],
        topics_val["language"],
        groups=topics_val["id"],
    )
)

split_idxs

topics_val                id                           title  \
0  t_0008768bdee6               100 સુધીનો સરવાળો   
1  t_0008a1bd84ba       12. 20: Bird Reproduction   
2  t_000d1fb3f2f5              2.1.2 - Logarithms   
3  t_00102869fbcb          Triangles and polygons   
4  t_0012a45fa09c  Quiz: materials and techniques   

                                         description channel      category  \
0       37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.  5223e0  supplemental   
1                                                NaN  ebc86c  supplemental   
2                                                NaN  e77b55       aligned   
3  Learning outcomes: students must be able to so...  a91e32       aligned   
4                                                NaN  2ee29d       aligned   

   level language          parent  has_content  
0      4       gu  t_0da7a331d666         True  
1      5       en  t_c44ac9711007         True  
2      5       en  t_b897d168db90         True  
3    

(array([    0,     2,     3, ..., 25000, 25001, 25002]),
 array([    1,     5,     9, ..., 24990, 24995, 24998]))

In [14]:
# Build the final train / holdout id tables from the first split.
# split_idxs[0] holds the positional indices (into topics_val) kept for
# training, split_idxs[1] the positional indices held out.

# Non-source topics that the split assigns to training. .assign returns a new
# frame, so no column is written into a slice of topics_val.
topics_add_train = (
    topics_val.iloc[split_idxs[0]][["id"]]
    .reset_index(drop=True)
    .assign(fold="train")
)

# Append them to the existing training table. topics_train is both the input
# and the output of this statement, so without de-duplication a second run of
# this cell would append the same ids again. Dropping duplicate ids makes the
# cell safe to re-run and leaves a first run unchanged.
# NOTE(review): assumes topic ids are unique in the raw topics table - confirm.
topics_train = (
    pd.concat([topics_train, topics_add_train], ignore_index=True)
    .drop_duplicates(subset="id", ignore_index=True)
)

print('topics_train \n', topics_train)

# Held-out topics, labelled with fold "test".
topics_holdout = (
    topics_val.iloc[split_idxs[1]][["id"]]
    .reset_index(drop=True)
    .assign(fold="test")
)

print('topics_holdout \n', topics_holdout)

topics_train 
                    id   fold
0      t_00004da3a1b2  train
1      t_00068291e9a4  train
2      t_00069b63a70a  train
3      t_0006d41a73a8  train
4      t_000feba42136  train
...               ...    ...
55261  t_fff05585df72  train
55262  t_fff51448598c  train
55263  t_fff7782561f4  train
55264  t_fff9e5407d13  train
55265  t_fffe14f1be1e  train

[55266 rows x 2 columns]
topics_holdout 
                   id  fold
0     t_0008a1bd84ba  test
1     t_0016d30772f3  test
2     t_001edc523bd1  test
3     t_002eec45174c  test
4     t_003c1782b8c2  test
...              ...   ...
6246  t_ffc6ba0459d6  test
6247  t_ffcabc094a95  test
6248  t_ffcf958baa88  test
6249  t_ffe8df837f62  test
6250  t_fff1f01cfeb0  test

[6251 rows x 2 columns]


In [7]:
# Per-language topic counts among the ids listed in topics_train
topics.loc[topics["id"].isin(topics_train["id"]), "language"].value_counts()

en     24045
es     10124
pt      3363
ar      3173
fr      2938
bg      2420
sw      2063
gu      1699
bn      1604
hi      1264
it       722
zh       672
mr       239
fil      163
as       112
my       110
km       104
kn        88
te        66
ur        54
or        51
ta        44
pnb       40
swa       33
pl        28
tr        26
ru        21
Name: language, dtype: int64

In [8]:
# Per-language topic counts among the ids listed in topics_holdout
topics.loc[topics["id"].isin(topics_holdout["id"]), "language"].value_counts()

en     4008
es     1645
bn      127
gu      110
hi      109
fr       96
pt       62
fil      61
sw       19
as       14
Name: language, dtype: int64

In [15]:
# Stack the train and holdout assignments (train rows first) into one table
# with a fresh 0..n-1 index, then write it to disk without the index column.
split_df = pd.concat((topics_train, topics_holdout), ignore_index=True)
split_df.to_csv('train_test_splits.csv', index=False)

split_df.head()

Unnamed: 0,id,fold
0,t_00004da3a1b2,train
1,t_00068291e9a4,train
2,t_00069b63a70a,train
3,t_0006d41a73a8,train
4,t_000feba42136,train


Done! The fold assignments are saved to `train_test_splits.csv`.