In [4]:

import csv
import collections

from argparse import ArgumentParser
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


In [2]:
!jupyter nbconvert --to script "BERT Kaggle Notebook.ipynb"

[NbConvertApp] Converting notebook BERT Kaggle Notebook.ipynb to script
[NbConvertApp] Writing 5506 bytes to BERT Kaggle Notebook.py


In [2]:
SPLIT = .15

In [3]:

labels = []
texts = []

# read full dataset file
with open("/kaggle/input/10kgnad/articles.csv",  "r", encoding='utf-8',) as csvfile:
    reader = csv.reader(csvfile, delimiter=';', quotechar='\'')
    for row in reader:
        labels.append(row[0])
        texts.append(row[1])

tr_Df=pd.concat([pd.Series(texts, name="Text"), pd.Series(labels,name='Label')], axis=1)
# split dataset
X_train, X_test, y_train, y_test = train_test_split( tr_Df['Text'], tr_Df['Label'], test_size=SPLIT, random_state=42,stratify=tr_Df['Label'])

# write train and test datasets
train=pd.DataFrame(pd.concat([X_train,y_train],axis=1))
test=pd.DataFrame(pd.concat([X_test,y_test],axis=1))

# ## BERT

# In[5]:


from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch


# In[6]:


sms = train.Text.values
[print(len(sm.split(" "))) for sm in sms]


# In[7]:


def gen_id_mask(data):
    sms = data.Text.values
    sms = ["[CLS] " + sm + " [SEP]" for sm in sms]
    
    tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

    tokenized_texts = [tokenizer.tokenize(sm) for sm in sms]

    MAX_LEN = 512
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    print('success')
    return (input_ids,attention_masks)


# In[8]:


train_input_ids,train_att_masks=gen_id_mask(train)
test_input_ids, test_att_masks=gen_id_mask(test)




In [4]:


model = BertModel.from_pretrained('dbmdz/bert-base-german-cased', output_hidden_states=True)


# In[ ]:


# In[ ]:

train_sentences_embeddings=[]
test_sentences_embeddings=[]
model = model.eval()

for i in range(test_input_ids.shape[0]):
    test_inputs = torch.tensor(test_input_ids[i:i+1])
    #train_labels = torch.tensor(train_labels)
    test_masks = torch.tensor(test_att_masks[i:i+1])
    with torch.no_grad():
        encoded_layers = model(input_ids=test_inputs, attention_mask=test_masks)

    hidden_states = encoded_layers[2]

    token_vecs = hidden_states[11][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    print(sentence_embedding.size())
    test_sentences_embeddings.append(sentence_embedding.numpy())


print("Test Done")

for i in range(train_input_ids.shape[0]):
    train_inputs = torch.tensor(train_input_ids[i:i+1])
    #train_labels = torch.tensor(train_labels)
    train_masks = torch.tensor(train_att_masks[i:i+1])
    with torch.no_grad():
        encoded_layers = model(input_ids=train_inputs, attention_mask=train_masks)

    hidden_states = encoded_layers[2]
    token_vecs = hidden_states[11][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    #print(sentence_embedding.size())
    train_sentences_embeddings.append(sentence_embedding.numpy())

Unnamed: 0,Text,Label
9321,Die Energiewirtschaft hat ihre Strategie bis 2...,Wirtschaft
5703,"Östereich siegte in Podgorica zuerst gegen 12,...",Sport
8630,Sagis Vertreter Phillip Burns und Barry Gilber...,Wirtschaft
44,Keine offizielle Bestätigung über Verhandlungs...,Etat
537,"Roland Düringer in Autorevue TV, vom Leiden de...",Etat
...,...,...
2184,Auch Einreise- und Vermögenssperren gegen Luka...,International
5236,In der chinesischen Hauptstadt fahren Österrei...,Panorama
220,"Strache will weltoffen sein, atmete man in dem...",Etat
5186,25-Jähriger wollte TV-Sender mit Sprengstoffgü...,Panorama


## BERT

In [5]:
train_sentences_embeddings = np.stack(train_sentences_embeddings, axis=0)
train_sentences_embeddings.shape
test_sentences_embeddings = np.stack(test_sentences_embeddings, axis=0)
test_sentences_embeddings.shape

In [9]:
#np.savetxt("BERT_last_hidden_train_sentences_embeddings.out", train_sentences_embeddings, delimiter=",")
#y_train.to_csv("y_train_labels.out")
#np.savetxt("BERT_last_hidden_test_sentences_embeddings.out", test_sentences_embeddings, delimiter=",")
#y_test.to_csv("y_test_labels.out")


In [5]:
y_train=pd.read_csv("y_train_labels.out", index_col=0, header=None)
y_train=y_train.iloc[:,0]
y_test=pd.read_csv("y_test_labels.out", index_col=0, header=None) 
y_test=y_test.iloc[:,0]

In [6]:
train_sentences_embeddings = np.genfromtxt('BERT_last_hidden_train_sentences_embeddings.out', delimiter=',')
test_sentences_embeddings = np.genfromtxt('BERT_last_hidden_test_sentences_embeddings.out', delimiter=',')

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
clf = RandomForestClassifier(max_depth=15)
clf.fit(train_sentences_embeddings,y_train)
f1_score(clf.predict(train_sentences_embeddings),y_train, average='weighted')

0.9989695154391335

In [7]:
y_test.value_counts()

Panorama         252
Web              251
International    227
Wirtschaft       212
Sport            180
Inland           152
Etat             100
Wissenschaft      86
Kultur            81
Name: 1, dtype: int64

In [None]:
print(clf.score(test_sentences_embeddings,["Panorama"]*y_test.shape[0]))
f1_score(clf.predict(test_sentences_embeddings),["Panorama"]*y_test.shape[0], average='weighted')

In [9]:
print(clf.score(test_sentences_embeddings,y_test))
f1_score(clf.predict(test_sentences_embeddings),y_test, average='weighted')

0.8170019467878001


0.8174209868081426

In [10]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(C=1000)
clf.fit(train_sentences_embeddings,y_train)
f1_score(clf.predict(train_sentences_embeddings),y_train, average='weighted')

NameError: name 'embds' is not defined

In [None]:
f1_score(clf.predict(test_sentences_embeddings),y_test, average='weighted')