In [10]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

In [5]:
# Load the translated training set and reshape it into the
# guid / text_a / text_b / theme1 / theme2 layout used by the later cells.
german = pd.read_csv("TrainEnglish2German.csv")
print(german.columns)

# text_a carries the German translation; text_b is a constant "a" placeholder.
german = (
    german
    .assign(text_b="a")
    .rename(columns={"translation": "text_a"})
    [["guid", "text_a", "text_b", "theme1", "theme2"]]
)

german.head()

Index(['guid', 'text', 'translation', 'alpha', 'theme1', 'theme2'], dtype='object')


Unnamed: 0,guid,text_a,text_b,theme1,theme2
0,36,Einäscherte Überreste des Massenjägers von Las...,a,7,99
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,4,6
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,6,99
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,5,99
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,4,6


In [6]:
# Wherever theme2 is missing (NaN) or equals 99, copy theme1 into it so that
# every row ends up with a usable theme2 value.
# NOTE(review): 99 is presumably a "no second theme" sentinel - confirm against the codebook.
# Columns are addressed by name (not by position) so that reordering the
# frame's columns cannot silently redirect the assignment.
no_second_theme = german['theme2'].isna() | german['theme2'].eq(99)
german.loc[no_second_theme, 'theme2'] = german.loc[no_second_theme, 'theme1']

display(german.head())

Unnamed: 0,guid,text_a,text_b,theme1,theme2
0,36,Einäscherte Überreste des Massenjägers von Las...,a,7,7
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,4,6
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,6,6
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,5,5
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,4,6


In [7]:
# Build a multi-hot label matrix: one row per article, one column per theme
# code (column j corresponds to theme j + 1). A row gets a 1 in the columns
# of both its theme1 and its theme2.
n_themes = 9
size = len(german)

# Cast both theme columns to int so each is a valid NumPy integer index.
theme_codes = german[['theme1', 'theme2']].astype(int)

# Fail loudly on out-of-range codes. Without this check a code of 0 (or any
# small negative value) becomes a negative index, which NumPy accepts and
# which would silently mark the wrong theme column.
invalid_rows = ~theme_codes.isin(range(1, n_themes + 1)).all(axis=1)
if invalid_rows.any():
    raise ValueError(
        "theme1/theme2 must lie in 1..%d; offending rows: %s"
        % (n_themes, list(german.index[invalid_rows][:10]))
    )

row_positions = np.arange(size)
labels = np.zeros((size, n_themes))
labels[row_positions, theme_codes['theme1'].to_numpy() - 1] = 1
labels[row_positions, theme_codes['theme2'].to_numpy() - 1] = 1

display(labels[:5,:])

array([[0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0.]])

In [8]:
# Wrap the label matrix in a DataFrame whose columns are named 1..9.
labels_df = pd.DataFrame(labels, columns=np.arange(1, 10))
labels_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# Wide layout: the id/text columns of `german` followed, side by side,
# by the label columns of `labels_df` (joined on the row index).
german_wide = pd.concat(
    [german[['guid', 'text_a', 'text_b']], labels_df],
    axis=1,
)
display(german_wide)

Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
0,36,Einäscherte Überreste des Massenjägers von Las...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,129,Anwälte bezeichnen US-Anklage wegen Waffengewa...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,160,Der Parkland School Shooting-Verdächtige Nikol...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,162,Ein Schütze aus Florida zeigte zu Hause Gewalt...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,205,"Gunman kaufte beiläufig Wasser, bevor das Mass...",a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,228,Der Verdächtige der Schießerei in der Schule v...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Create stratified k-folds (k = 5) and write each fold's training split to
# "<fold index>/train.tsv" (tab-separated, no header row, no index column).
k = 5
skf = StratifiedKFold(n_splits=k, random_state=50, shuffle=True)
X = german['text_a']
y = german['theme1']  # folds are stratified on theme1 only

for idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    print("Fold: ", idx, "TRAIN:", len(train_index), "TEST:", len(test_index))
    # split() yields positional row indices, so select with .iloc; label-based
    # .loc only happens to agree while the frame has a default 0..n-1 index.
    train, test = german_wide.iloc[train_index], german_wide.iloc[test_index]
    display(train.head())
    display(test.head())

    # Make sure the per-fold directory exists; to_csv does not create it.
    fold_dir = Path(str(idx))
    fold_dir.mkdir(parents=True, exist_ok=True)
    train.to_csv(fold_dir / 'train.tsv', sep='\t', index=False, header=False)
    # NOTE: the held-out `test` split is only displayed here, not written to disk.

Fold:  0 TRAIN: 1037 TEST: 263


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
0,36,Einäscherte Überreste des Massenjägers von Las...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
8,205,"Gunman kaufte beiläufig Wasser, bevor das Mass...",a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
28,579,New Mexico Compound: Die Gebühren für mutmaßli...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29,586,Jenelle Evans lügt über das Ziehen einer Waffe...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
34,699,"Der kalifornische Lehrer, der wegen Straßenwut...",a,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
43,855,"Weißer Mann nimmt Pistole heraus, um schwarze ...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Fold:  1 TRAIN: 1038 TEST: 262


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
0,36,Einäscherte Überreste des Massenjägers von Las...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,129,Anwälte bezeichnen US-Anklage wegen Waffengewa...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,160,Der Parkland School Shooting-Verdächtige Nikol...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13,310,"Familie, die in Florida erschossene Verdächtig...",a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
14,317,"Dies ist eine sehr gefährliche Person “, sagt ...",a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
16,372,Was haben die meisten Massenschützen gemeinsam...,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Fold:  2 TRAIN: 1040 TEST: 260


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
0,36,Einäscherte Überreste des Massenjägers von Las...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,129,Anwälte bezeichnen US-Anklage wegen Waffengewa...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
9,228,Der Verdächtige der Schießerei in der Schule v...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12,264,"Mann beschuldigt, auf einen schwarzen Teenager...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15,362,Offiziere der Central Michigan University hatt...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
17,384,Dies ist ein unausgesprochener Schuldiger von ...,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Fold:  3 TRAIN: 1042 TEST: 258


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
0,36,Einäscherte Überreste des Massenjägers von Las...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
6,160,Der Parkland School Shooting-Verdächtige Nikol...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,129,Anwälte bezeichnen US-Anklage wegen Waffengewa...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,162,Ein Schütze aus Florida zeigte zu Hause Gewalt...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11,239,Überprüfung von Tatsachen und Unwahrheiten in ...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19,404,"Der Bruder von Nikolas Cruz bedauert, dass er ...",a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Fold:  4 TRAIN: 1043 TEST: 257


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
1,47,Florida-Schütze ein gestörter Einzelgänger mit...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,68,"Der Teenager aus Vernon Hills, der beschuldigt...",a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,70,"Griffith-Student beschuldigt, versehentlich ge...",a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,98,Exklusiv: Gruppenchat-Nachrichten zeigen Schüt...,a,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,129,Anwälte bezeichnen US-Anklage wegen Waffengewa...,a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,guid,text_a,text_b,1,2,3,4,5,6,7,8,9
0,36,Einäscherte Überreste des Massenjägers von Las...,a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,160,Der Parkland School Shooting-Verdächtige Nikol...,a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10,231,"New Vermont Gesetz verwendet, um die Schule Sc...",a,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,392,"Einige wollten einen Schießverdächtigen, der 2...",a,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
20,438,Was das FBI über den beschuldigten Florida Sch...,a,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
