In [2]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_path = 'path'

train_data = pd.read_csv(train_path, sep = "\t")

In [4]:
train_data.columns = ['Text', 'Label']
train_data.head()

Unnamed: 0,Text,Label
0,heimer de vo de weltsche truppe womer de hei g...,BE
1,ghüroote händ ond khinder gha händ,BS
2,töchterli us franggfurt gsi wo aber e schüeler...,BS
3,eer hend soo öich goots vöu besser,LU
4,art goldau,LU


In [5]:
train_data.shape

(14645, 2)

In [6]:
train_data['Label'].unique()

array(['BE', 'BS', 'LU', 'ZH'], dtype=object)

In [7]:
dev_path = 'path'

dev_data = pd.read_csv(dev_path, sep = "\t")

dev_data.columns = ['Text', 'Label']
dev_data.head()

Unnamed: 0,Text,Label
0,ich bii jo,BS
1,poschte welä wonner ned überchoo hät oder wais...,LU
2,das chönt ech also ned säägen ech,LU
3,ond den hend sii,LU
4,d leerer ond d gaischlichkeit,LU


In [8]:
dev_data.shape

(4657, 2)

As a control, let's quickly run a multinomial naive bayes classifier using the train and dev splits:

In [9]:
def tokenize_text_roberta(df):
    # Load RoBERTa tokenizer
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

    # Tokenize the text column
    df['Tokenized_Text'] = df["Text"].apply(lambda x: tokenizer.tokenize(x))

    print(df.head())

    return df

tokenize_text_roberta(train_data)
tokenize_text_roberta(dev_data)

tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<00:00, 1.67kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config.json: 100%|██████████| 615/615 [00:00<00:00, 34.2kB/s]
sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 7.95MB/s]
tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 10.9MB/s]


                                                Text Label  \
0  heimer de vo de weltsche truppe womer de hei g...    BE   
1                 ghüroote händ ond khinder gha händ    BS   
2  töchterli us franggfurt gsi wo aber e schüeler...    BS   
3                 eer hend soo öich goots vöu besser    LU   
4                                         art goldau    LU   

                                      Tokenized_Text  
0  [▁heim, er, ▁de, ▁vo, ▁de, ▁wel, t, sche, ▁tru...  
1  [▁g, hü, root, e, ▁hän, d, ▁ond, ▁khi, nder, ▁...  
2  [▁tö, chter, li, ▁us, ▁fran, gg, fur, t, ▁g, s...  
3  [▁e, er, ▁, hend, ▁soo, ▁ö, ich, ▁go, ots, ▁, ...  
4                                  [▁art, ▁gold, au]  
                                                Text Label  \
0                                         ich bii jo    BS   
1  poschte welä wonner ned überchoo hät oder wais...    LU   
2                  das chönt ech also ned säägen ech    LU   
3                                   ond den hend 

Unnamed: 0,Text,Label,Tokenized_Text
0,ich bii jo,BS,"[▁ich, ▁bi, i, ▁jo]"
1,poschte welä wonner ned überchoo hät oder wais...,LU,"[▁pos, chte, ▁wel, ä, ▁won, ner, ▁ned, ▁über, ..."
2,das chönt ech also ned säägen ech,LU,"[▁das, ▁ch, ön, t, ▁e, ch, ▁also, ▁ned, ▁sää, ..."
3,ond den hend sii,LU,"[▁ond, ▁den, ▁, hend, ▁sii]"
4,d leerer ond d gaischlichkeit,LU,"[▁d, ▁leer, er, ▁ond, ▁d, ▁ga, isch, lichkeit]"
...,...,...,...
4652,das isch mi het ebe oo gseit mi het mi het nüü...,BE,"[▁das, ▁, isch, ▁mi, ▁het, ▁e, be, ▁oo, ▁g, se..."
4653,dert het me glächlet,BE,"[▁der, t, ▁het, ▁me, ▁g, lä, ch, let]"
4654,auf höheren befääl entlaa woorde,BE,"[▁auf, ▁höhere, n, ▁be, f, ää, l, ▁en, tla, a,..."
4655,nochethäär de ganz die ganz abe do die ganz,LU,"[▁noch, eth, äär, ▁de, ▁ganz, ▁die, ▁ganz, ▁a,..."


In [12]:
train_tok[:10]

0    ▁heim er ▁de ▁vo ▁de ▁wel t sche ▁trup pe ▁wo ...
1       ▁g hü root e ▁hän d ▁ond ▁khi nder ▁gha ▁hän d
2    ▁tö chter li ▁us ▁fran gg fur t ▁g si ▁wo ▁abe...
3      ▁e er ▁ hend ▁soo ▁ö ich ▁go ots ▁ vö u ▁besser
4                                        ▁art ▁gold au
5                          ▁propaganda a ▁uu szt rät e
6    ▁und ▁da ▁het ▁s ▁o ▁nach ▁a a uko hou wu u ch...
7    ▁was ▁me ▁wo t ▁ uf g ää ▁do ▁hä ts ▁e ▁huu fe...
8    ▁es ▁w öör sch tli ▁plo be ▁de ▁ hend s ▁au ▁ ...
9                ▁ isch ▁und ▁d än n ▁hä t ▁do o ▁scho
Name: Tokenized_Text, dtype: object

In [15]:
tf= TfidfVectorizer()

train_tok = train_data['Tokenized_Text'].apply(lambda tokens: ' '.join(tokens)) 
dev_tok = dev_data['Tokenized_Text'].apply(lambda tokens: ' '.join(tokens)) 

train_vec_x = tf.fit_transform(train_tok)
dev_vec_x = tf.transform(dev_tok)
train_y = train_data['Label']
dev_y = dev_data['Label']
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(train_vec_x, train_y)


y_pred = naive_bayes_classifier.predict(dev_vec_x)


accuracy = accuracy_score(dev_y, y_pred)
print("Accuracy:", accuracy)


print(classification_report(dev_y, y_pred))

Accuracy: 0.5881468756710329
              precision    recall  f1-score   support

          BE       0.55      0.58      0.57      1066
          BS       0.75      0.53      0.62      1572
          LU       0.57      0.47      0.52      1079
          ZH       0.51      0.81      0.63       940

    accuracy                           0.59      4657
   macro avg       0.59      0.60      0.58      4657
weighted avg       0.61      0.59      0.59      4657



Accuracy of .59 is about what's to be expected. The highest accuracy for this task in 2018 (using the exact same data) was .68, the lowest was .26. 