# Utlyste stillinger til Nav med sklearn

Det som gjøres her, er inspirert av denne tutorialen: [https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f](https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f)

Datasett er hentet fra [https://data.norge.no/organisasjoner/arbeids-og-velferdsetaten-nav](https://data.norge.no/organisasjoner/arbeids-og-velferdsetaten-nav)

Det anbefales å kjøre prosjektet i Docker på imaget [continuumio/anaconda3](https://hub.docker.com/r/continuumio/anaconda3/), eller å følge instruksjonene i repoet.



Spørsmål tas gjerne imot hos vikfand@gmail.com

In [None]:
# Set up with packages and expected file structure

!pip install wget html
!mkdir data

In [None]:
import os
import sys
import sklearn
import pandas as pd
import numpy as np
import xml
import re
from time import time
import matplotlib.pyplot as plt
from pprint import pprint
import html


# Directory (relative to the working directory) that later cells pass to the
# download and load helpers.
data_dir = os.path.join('.', 'data')

# Show which interpreter the notebook is running on.
print(sys.version)

In [None]:
from lib.data import download_data

# Download the data into data_dir (will not download if it's already downloaded,
# per the original author's note - the logic itself lives in lib.data).
download_data(data_dir)

In [None]:
from lib.load_dataset import load_datasets

# Load datasets into a pandas DataFrame from .csv files
# NOTE(review): the two 2015 arguments look like a first/last year range -
# confirm against lib.load_dataset.
df = load_datasets(data_dir, 2015, 2015)

# Sanity check: print the dimensions and show the first rows.
print(df.shape)
df.head()

In [None]:
from lib.preprocessing import remove_tags, unescape_html, remove_whitespace

# Select subset of columns
col = [
    'stilling_kilde',
    'stillingsnummer',
    'stillingsbeskrivelse',
    'yrke_grovgruppe',
    'arbeidssted_fylkesnummer',
    'arbeidssted_kommunenummer',
    'virksomhet_navn',
    'arbeidssted_fylke',
]
# Take an explicit copy: the column assignments below would otherwise write
# to a slice of the loaded frame and trigger pandas' SettingWithCopyWarning.
df = df[col].copy()


# Preprocess text and add some columns
# Clean the free-text description with the three helpers, applied in order.
df['stillingsbeskrivelse'] = (
    df['stillingsbeskrivelse']
    .map(remove_tags)
    .map(unescape_html)
    .map(remove_whitespace)
)

# Boolean indicator columns derived from the source and occupation group.
df['is_from_nav'] = df['stilling_kilde'] == 'Reg av arb.giver på nav.no'
df['from_media'] = df['stilling_kilde'] == 'Annonsert i media'
df['is_healthcare'] = df['yrke_grovgruppe'] == 'Helse, pleie og omsorg'
df['is_industrial'] = df['yrke_grovgruppe'] == 'Industriarbeid'

# NOTE(review): this is a case-sensitive substring test, so it misses "NAV"
# and also matches words such as "navn" - confirm that this is intended.
df['contains_nav'] = df['stillingsbeskrivelse'].map(lambda x: 'nav' in x)

print(df.shape)
df.head()




In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): data_subset_size is not passed to the
# get_train_and_test_sets(df) call in this cell, so it currently has no
# effect on the split - confirm whether it should be used as sample_size.
data_subset_size = len(df) # Just use a small dataset when experimenting or it will take too much time
target_column_name = 'yrke_grovgruppe' # Which label to predict
# Seed value; same number as the default of get_train_and_test_sets' random_seed.
random_seed = 305

# Make training and test sets of the data.
def get_train_and_test_sets(df, sample_size=None, test_size=0.2, random_seed=305):
    """Sample rows from ``df`` and split the sample into train and test sets.

    Args:
        df: DataFrame containing the column named by the module-level
            ``target_column_name``.
        sample_size: Number of rows to sample before splitting. When falsy
            (``None`` or ``0``), a tenth of ``df`` is used.
        test_size: Forwarded to ``train_test_split`` as its ``test_size``.
        random_seed: Seed used for both the sampling and the split.

    Returns:
        The result of ``train_test_split``: train rows, test rows, train
        targets, test targets. The row frames are the full sampled rows,
        so they still include the target column.
    """
    if not sample_size:
        sample_size = len(df) // 10
    data_sample = df.sample(n=sample_size, random_state=random_seed)
    return train_test_split(
        data_sample,
        data_sample[target_column_name],
        # Honor the caller's value; it used to be hard-coded to .2 here.
        test_size=test_size,
        random_state=random_seed,
    )

# Split using the function defaults: sample_size is not passed, so a tenth of
# df is sampled before the train/test split.
X_train, X_test, y_train, y_test = get_train_and_test_sets(df)
# Show the training rows as the cell output.
X_train

In [None]:
# Make the pipeline that transforms our data and trains the classifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Two-step model: TF-IDF features feeding a linear support vector classifier.
# max_df=.25 makes the vectorizer ignore terms found in more than a quarter
# of the documents.
steps = [
    ('tfidf', TfidfVectorizer(max_df=.25)),
    ('svc', LinearSVC()),
]
pipeline = Pipeline(steps)

# Train on the raw job-description texts against the target labels.
train_texts = X_train['stillingsbeskrivelse'].values
pipeline.fit(train_texts, y_train)

In [None]:
# Run the held-out job descriptions through the trained pipeline.
test_texts = X_test['stillingsbeskrivelse'].values
predicted = pipeline.predict(test_texts)
pprint(predicted)

# Fraction of test rows whose predicted label equals the true label.
np.mean(predicted == y_test)

In [None]:
from sklearn import metrics

# Summarize the results: the classification report first, then the accuracy score.
report = metrics.classification_report(y_test, predicted)
accuracy = metrics.accuracy_score(y_test, predicted)
print(report)
print(accuracy)

In [None]:
# Try the pipeline with some custom data

# predict() is given a list containing a single raw text string; the result
# is pretty-printed below.
prediction = pipeline.predict(['difi leikanger'])
pprint(prediction)

In [None]:
# Analyze which words are most characteristic of each category

from sklearn.feature_selection import chi2

tfidf = TfidfVectorizer()

df_sample = df.sample(n=100) # Will take very much time and memory if n>100000

features = tfidf.fit_transform(df_sample['stillingsbeskrivelse'])
labels = df_sample['yrke_grovgruppe']

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever the installed version provides.
# The vocabulary does not change per label, so it is built once here.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(get_names())

N = 10  # Number of top-scoring terms printed per category
for label in labels.drop_duplicates():
    # One-vs-rest chi-squared score of every term against this category.
    chi2_scores, _ = chi2(features, labels == label)
    # argsort is ascending, so the highest-scoring terms end up last.
    indices = np.argsort(chi2_scores)
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    # Python 3 strings are already Unicode: no unicode()/encode() round trip,
    # which raised NameError and would have printed a bytes repr.
    print("# '{}':".format(label))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
