In [56]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer
import re
import os
import seaborn as sns

In [57]:
# Fetch the Danish stopword list and create the Danish Snowball stemmer.
nltk.download('stopwords')
# Read the corpus via nltk.corpus instead of the imported `stopwords` name:
# this cell rebinds `stopwords` to a plain list, so `stopwords.words(...)`
# would raise AttributeError the second time the cell is executed.
stopwords = nltk.corpus.stopwords.words('danish')
stemmer = DanishStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jjo4da\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
def text_process(directory, stop_words=None, stem=None):
    """
    Load the JSON files in a folder and return a DataFrame of cleaned texts.

    Every file in ``directory`` whose name ends in ``.json`` is read as UTF-8
    JSON. The label is taken from ``data['classes'][0]['id']`` and the text
    from ``data['text']``. The text is cleaned in four steps:

    1. runs of non-word characters and runs of digits are replaced by a space,
    2. tokens found in the stopword list are dropped (case-insensitively),
    3. the remaining tokens are stemmed,
    4. tokens that contain an underscore are dropped.

    Parameters
    ----------
    directory : str
        Folder containing the ``.json`` files.
    stop_words : iterable of str, optional
        Lowercase stopwords to remove. Defaults to the notebook-level
        ``stopwords`` list.
    stem : callable, optional
        Function mapping a token to its stem. Defaults to the notebook-level
        ``stemmer.stem``.

    Returns
    -------
    pandas.DataFrame
        Columns ``kle_nr`` and ``text``, one row per JSON file, ordered by
        file name.

    Raises
    ------
    KeyError, IndexError
        If a file lacks ``'text'`` or a non-empty ``'classes'`` list with an
        ``'id'`` entry.
    """
    if stop_words is None:
        stop_words = stopwords
    if stem is None:
        stem = stemmer.stem
    # Build the lookup set once; membership tests on a list are O(len(list)).
    stop_set = set(stop_words)

    rows = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and anything seeded from it later) stable between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(directory, filename), encoding='utf-8') as f:
            data = json.load(f)

        kle_nr = data['classes'][0]['id']
        # Raw string: '\W' and '\d' are invalid escapes in a normal literal.
        cleaned = re.sub(r'\W+|\d+', ' ', data['text'])
        # Compare lowercased tokens: the stopword list is lowercase, so a
        # case-sensitive test lets capitalised stopwords ("Og", "Den") through.
        tokens = [tok for tok in cleaned.split() if tok.lower() not in stop_set]
        # Join and re-split so empty stems vanish, as in the original pipeline.
        stemmed = ' '.join(stem(tok) for tok in tokens).split()
        # '_' counts as a word character, so it survives the regex above.
        kept = [tok for tok in stemmed if '_' not in tok]

        rows.append([kle_nr, ' '.join(kept)])
    return pd.DataFrame(rows, columns=['kle_nr', 'text'])

In [59]:
# Lav en samlet dataframe med data fra kl og syddjurs
df_kl = text_process('data/KL_Blanketter_med_kle/kl_blanketter/')
df_syddjurs = text_process('data/syddjurs_med_kle/web/')
frames = [df_kl, df_syddjurs]
df_samlet = pd.concat(frames)

In [60]:
# Show the five most frequent KLE numbers and how many documents each has.
df_samlet['kle_nr'].value_counts().head()

32.30.04    72
01.03.03    55
02.34.02    45
54.15.28    41
29.09.00    40
Name: kle_nr, dtype: int64

In [99]:
# Imports for ML delen
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [100]:
# Split into features (cleaned text) and labels (KLE numbers).
X = df_samlet['text']
y = df_samlet['kle_nr']

# Encode the KLE numbers as integer class labels.
le = LabelEncoder()
y_le = le.fit_transform(y)

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y_le, test_size=0.2, random_state=RANDOM_STATE
)