In [7]:
# !pip install -q iterative-stratification

In [8]:
import pandas as pd
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [9]:
# Load the training table from the feather file.
df = pd.read_feather("../input/feedback-prize-2021/train.fea")

# Discard fold/cluster columns left over from an earlier run so the columns
# merged in further down do not collide with stale ones.
for stale_col in ("kfold", "cluster"):
    if stale_col in df.columns:
        df = df.drop(columns=stale_col)

# Collapse to one row per `id`: after one-hot encoding `discourse_type`,
# summing within each id turns the dummy columns into per-id counts.
discourse_dummies = pd.get_dummies(df, columns=["discourse_type"])
dfx = discourse_dummies.groupby(["id"], as_index=False).sum()

In [10]:
# Attach the columns of cluster.csv to the aggregated frame. The left join
# keeps every row of dfx even when cluster.csv has no entry for its id.
cluster = pd.read_csv("../input/feedback-prize-2021/cluster.csv")
dfx = pd.merge(dfx, cluster, how="left", on=["id"])

In [11]:
# Inspect which columns dfx carries after the merge; the bare expression is
# this cell's displayed output.
dfx.columns

Index(['id', 'index', 'discourse_id', 'discourse_start', 'discourse_end',
       'essay_len', 'essay_words', 'start', 'end', 'discourse_words',
       'gap_before', 'gap_length', 'gap_end_length', 'discourse_type_id',
       'Unnamed: 0', 'discourse_type_Claim',
       'discourse_type_Concluding Statement', 'discourse_type_Counterclaim',
       'discourse_type_Evidence', 'discourse_type_Lead',
       'discourse_type_Position', 'discourse_type_Rebuttal', 'text',
       'cluster'],
      dtype='object')

In [12]:
# Add one 0/1 indicator column per cluster id (cluster_0 .. cluster_14).
# The comparison is vectorized over the whole column instead of calling a
# Python lambda per row; rows whose cluster is missing compare unequal and
# therefore get 0 in every indicator, same as before.
n_clusters = 15
for cluster_id in range(n_clusters):
    dfx[f"cluster_{cluster_id}"] = (dfx["cluster"] == cluster_id).astype("int64")

In [13]:
# Keep the id plus every cluster* and discourse_type_* column.
# The prefix/id test is parenthesized so that the "discourse_type_num"
# exclusion applies to all three arms; without the parentheses `and` binds
# tighter than `or` and the exclusion only guarded the `c == "id"` arm,
# i.e. it could never exclude anything.
# NOTE(review): "discourse_type_id" (a per-id sum after the groupby) also
# matches the "discourse_type_" prefix and is therefore kept — confirm whether
# it was meant to be excluded as well.
cols = [
    c
    for c in dfx.columns
    if (c == "id" or c.startswith("cluster") or c.startswith("discourse_type_"))
    and c != "discourse_type_num"
]
cols

['id',
 'discourse_type_id',
 'discourse_type_Claim',
 'discourse_type_Concluding Statement',
 'discourse_type_Counterclaim',
 'discourse_type_Evidence',
 'discourse_type_Lead',
 'discourse_type_Position',
 'discourse_type_Rebuttal',
 'cluster',
 'cluster_0',
 'cluster_1',
 'cluster_2',
 'cluster_3',
 'cluster_4',
 'cluster_5',
 'cluster_6',
 'cluster_7',
 'cluster_8',
 'cluster_9',
 'cluster_10',
 'cluster_11',
 'cluster_12',
 'cluster_13',
 'cluster_14']

In [14]:
# Restrict to the id + stratification columns. reset_index(drop=True) returns
# an independent frame with a 0..n-1 index, which matters twice below:
#   * adding the "kfold" column does not write into a slice of the old frame;
#   * split() yields positional indices, and with a fresh RangeIndex those
#     positions are also the row labels that .loc expects.
dfx = dfx[cols].reset_index(drop=True)
seed = 20201021
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Everything except the id and the raw cluster number is used as a
# stratification label (computed before "kfold" exists, so it is not included).
labels = [c for c in dfx.columns if c not in ("id", "cluster")]
dfx_labels = dfx[labels]
dfx["kfold"] = -1  # sentinel: not yet assigned

# Each row is tagged with the fold in which it is part of the validation split.
for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "kfold"] = fold

assert (dfx["kfold"] >= 0).all(), "some rows were not assigned to a fold"

# Broadcast the per-id fold and cluster back onto the full table. Any kfold /
# cluster columns already on df (e.g. when this cell is re-run) are dropped
# first so the merge does not produce kfold_x / kfold_y suffixes.
df = df.drop(columns=["kfold", "cluster"], errors="ignore").merge(
    dfx[["id", "kfold", "cluster"]], on="id", how="left"
)
print(df.kfold.value_counts())
df.to_csv("../input/feedback-prize-2021/train_folds.csv", index=False)

12485 3109
12471 3123
12479 3115
12475 3119
12466 3128
3    29027
1    28946
2    28876
0    28773
4    28671
Name: kfold, dtype: int64


In [15]:
# Reduce the full table to its distinct (id, kfold, cluster) combinations.
fold_key_columns = ['id', 'kfold', 'cluster']
x = df.loc[:, fold_key_columns].drop_duplicates()

In [17]:
# Count rows per (fold, discourse type), then lay the counts out with one row
# per fold and one column per discourse type to compare folds at a glance.
type_counts = df.groupby(["kfold", 'discourse_type'], as_index=False).count()
type_by_fold = type_counts.pivot(index='discourse_type', columns='kfold', values='id')
display(type_by_fold.T)

discourse_type,Claim,Concluding Statement,Counterclaim,Evidence,Lead,Position,Rebuttal
kfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,9988,2705,1168,9114,1861,3080,857
1,10061,2696,1187,9172,1861,3080,889
2,10023,2700,1168,9168,1862,3078,877
3,10202,2705,1149,9166,1861,3087,857
4,9934,2699,1145,9082,1860,3094,857


In [18]:
# Count the deduplicated rows per (fold, cluster), then lay the counts out
# with one row per fold and one column per cluster.
cluster_counts = x.groupby(["kfold", 'cluster'], as_index=False).count()
cluster_by_fold = cluster_counts.pivot(index='cluster', columns='kfold', values='id')
display(cluster_by_fold.T)

cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
kfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,168,300,185,220,363,277,138,164,140,297,166,195,175,169,152
1,168,313,186,220,363,276,137,164,141,298,167,195,175,168,152
2,168,304,185,220,363,277,137,164,141,298,167,195,176,168,152
3,168,309,186,220,363,276,137,163,141,298,167,195,175,169,152
4,168,318,186,220,362,277,137,164,140,298,167,195,175,169,152


In [19]:
# Persist the deduplicated id/kfold/cluster table.
# NOTE(review): the row index is written as an unnamed first column here,
# whereas train_folds.csv is written with index=False — confirm that readers
# of folds.csv expect the extra column.
x.to_csv(path_or_buf="../input/feedback-prize-2021/folds.csv", index=True)