# Import necessary libraries

In [None]:
import json
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [None]:
# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# sentence/word tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# the pickled 'punkt' models, so fetch both to work across versions.
nltk.download('punkt_tab')

In [None]:
# Language-specific tools for Swedish text: a Snowball stemmer and the set of
# Swedish stop words shipped with NLTK (a set gives fast membership tests).

stemmer = SnowballStemmer('swedish')
stop_words = {*stopwords.words("swedish")}

# Read data

In [None]:
# Function to read data from json-file and store in Pandas Dataframe

def get_data_from_json(path):
    """Load a JSON file and return its content as a pandas DataFrame.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``pandas.DataFrame`` built from the decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default encoding may not be
    # UTF-8, which would corrupt non-ASCII characters such as å/ä/ö.
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)
    # The file is already closed here; only the parsed data is needed.
    return pd.DataFrame(records)

In [None]:
# Load the complete dataset from the JSON file into a DataFrame
path = "full_dataset.json"
data = get_data_from_json(path=path)

# Preprocessing

### Preprocessing functions:

In [None]:
# Stemming of Swedish words

def do_stemming(text, stemmer):
    """Return a list holding the stem of every item in ``text``.

    Args:
        text: Iterable of tokens to stem.
        stemmer: Object exposing a ``stem(word)`` method.

    Returns:
        List with ``stemmer.stem`` applied to each token, in input order.
    """
    return list(map(stemmer.stem, text))

In [None]:
# Tokenize text, lowercase conversion of tokens and stop-word removal

def remove_stop_words(text, stop_list, language='swedish'):
    """Lowercase and tokenize ``text``, then drop every token in ``stop_list``.

    Args:
        text: Raw text to clean.
        stop_list: Container of stop words; tokens are compared after
            lowercasing, so entries are expected to be lowercase.
        language: Punkt model name handed to ``word_tokenize``. Defaults to
            Swedish to match the Swedish stemmer and stop words in this file.

    Returns:
        List of lowercase tokens that are not in ``stop_list``, in order.
    """
    # The text is lowercased once up front, so the tokens need no second
    # lower() call. Passing the language makes sentence splitting (and thus
    # the handling of periods and abbreviations) follow the Swedish model
    # instead of the English default.
    word_tokens = word_tokenize(text.lower(), language=language)
    return [w for w in word_tokens if w not in stop_list]

In [None]:
# Perform stemming and stop-word removal

def preprocess(text, stop_list, stemmer):
    """Clean one document and return it as a single space-separated string.

    The text is first tokenized and stripped of stop words by
    ``remove_stop_words``; the remaining tokens are stemmed by
    ``do_stemming`` and joined with single spaces.

    Args:
        text: Raw document text.
        stop_list: Stop words to remove.
        stemmer: Stemmer object forwarded to ``do_stemming``.

    Returns:
        The stemmed, stop-word-free tokens joined by spaces.
    """
    kept_tokens = remove_stop_words(text, stop_list)
    return ' '.join(do_stemming(kept_tokens, stemmer))

### Apply the preprocessing functions to the text column:

In [None]:
# Replace every raw document with its stop-word-free, stemmed version
data['text'] = [
    preprocess(raw_text, stop_words, stemmer)
    for raw_text in data['text']
]

### Perform feature extraction using TF-IDF

In [None]:
# Turn the preprocessed documents into a TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on the full corpus before any
# train/test split is made, so its vocabulary and IDF weights also reflect
# documents that later end up in the test sets. Consider fitting on the
# training split only (e.g. inside a Pipeline).
vectorizer = TfidfVectorizer()
documents = data['text'].values
X = vectorizer.fit_transform(documents)

# Sub-problem 1: Full-time vs Part-time

### Transform target values (True/False) into numerical values:

In [None]:
# Encode the full-time target as integers: each value is stringified first,
# then LabelEncoder maps the distinct strings to integer codes.

y_job = list(map(str, data['full_time_required']))

le_job = LabelEncoder()
y_job = le_job.fit_transform(y_job)

### Split data into training and testing:

In [None]:
# Hold out 20% of the samples for testing; stratifying on the target keeps
# the class proportions equal in both splits, and the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_job,
    test_size=0.20,
    stratify=y_job,
    random_state=42,
)

### Training:

In [None]:
# Hyperparameter grid explored by GridSearchCV
parameters = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Wrap a default SVM classifier in a grid search over the values above
clf = SVC()
clf_job = GridSearchCV(estimator=clf, param_grid=parameters)

# Run the search on the training split
clf_job.fit(X_train, y_train)

# Show the best hyperparameter combination found by the search
clf_job.best_params_

### Testing:

In [None]:
# Test and predict based on the test set

y_hat = clf_job.predict(X_test)

In [None]:
# Calculate accuracy, precision and recall on test set

accuracy = accuracy_score(y_test, y_hat)
precision = precision_score(y_test, y_hat, pos_label = "True")
recall = recall_score(y_test, y_hat, pos_label = "True")

In [None]:
# Print classification report

print(classification_report(y_test, y_hat))

# Sub-problem 2: Driving license vs No driving license

### Transform target values (True/False) into numerical values:

In [None]:
# Encode the driving-license target as integers: each value is stringified
# first, then LabelEncoder maps the distinct strings to integer codes.

y_drv_lice = list(map(str, data['driving_license_required']))

le_drv_lice = LabelEncoder()
y_drv_lice = le_drv_lice.fit_transform(y_drv_lice)

### Split data into training and testing:

In [None]:
# Hold out 20% of the samples for testing; stratifying on the target keeps
# the class proportions equal in both splits, and the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_drv_lice,
    test_size=0.20,
    stratify=y_drv_lice,
    random_state=42,
)

### Training:

In [None]:
# Hyperparameter grid explored by GridSearchCV
parameters = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Wrap a default SVM classifier in a grid search over the values above
clf = SVC()
clf_drv_lice = GridSearchCV(estimator=clf, param_grid=parameters)

# Run the search on the training split
clf_drv_lice.fit(X_train, y_train)

# Show the best hyperparameter combination found by the search
clf_drv_lice.best_params_

### Testing:

In [None]:
# Test and predict based on the test set

y_hat = clf_drv_lice.predict(X_test)

In [None]:
# Calculate accuracy, precision and recall on test set

accuracy = accuracy_score(y_test, y_hat)
precision = precision_score(y_test, y_hat, pos_label = "True")
recall = recall_score(y_test, y_hat, pos_label = "True")

In [None]:
# Print classification report

print(classification_report(y_test,  y_hat))

# Sub-problem 3: Experience vs No experience

### Transform target values (True/False) into numerical values:

In [None]:
# Encode the experience target as integers: each value is stringified first,
# then LabelEncoder maps the distinct strings to integer codes.

y_exp = list(map(str, data['experience_required']))

le_exp = LabelEncoder()
y_exp = le_exp.fit_transform(y_exp)

### Split data into training and testing:

In [None]:
# Hold out 20% of the samples for testing. Stratify on the target so both
# splits keep the class proportions, consistent with the splits used for the
# full-time and driving-license sub-problems.
X_train, X_test, y_train, y_test = train_test_split(X, y_exp, test_size=0.20, stratify=y_exp, random_state=42)

### Training:

In [None]:
# Hyperparameter grid explored by GridSearchCV

parameters = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Wrap a default SVM classifier in a grid search over the values above
clf = SVC()
clf_exp = GridSearchCV(estimator=clf, param_grid=parameters)

# Run the search on the training split
clf_exp.fit(X_train, y_train)

# Show the best hyperparameter combination found by the search
clf_exp.best_params_

### Testing:

In [None]:
# Test and predict based on the test set

y_hat = clf_exp.predict(X_test)

In [None]:
# Calculate accuracy, precision and recall on test set

accuracy = accuracy_score(y_test, y_hat)
precision = precision_score(y_test, y_hat, pos_label = "True")
recall = recall_score(y_test, y_hat, pos_label = "True")

In [None]:
# Print classification report

print(classification_report(y_test, y_hat))

# Sub-problem 4: Education vs No education

### Transform target values (True/False) into numerical values:

In [None]:
# Encode the education target as integers: each value is stringified first,
# then LabelEncoder maps the distinct strings to integer codes.

y_edu = list(map(str, data['education_required']))

le_edu = LabelEncoder()
y_edu = le_edu.fit_transform(y_edu)

### Split data into training and testing:

In [None]:
# Hold out 20% of the samples for testing. Stratify on the target so both
# splits keep the class proportions, consistent with the splits used for the
# full-time and driving-license sub-problems.
X_train, X_test, y_train, y_test = train_test_split(X, y_edu, test_size=0.20, stratify=y_edu, random_state=42)

### Training:

In [None]:
# Hyperparameter grid explored by GridSearchCV

parameters = dict(
    C=[0.1, 1, 10],
    gamma=[0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Wrap a default SVM classifier in a grid search over the values above
clf = SVC()
clf_edu = GridSearchCV(estimator=clf, param_grid=parameters)

# Run the search on the training split
clf_edu.fit(X_train, y_train)

# Show the best hyperparameter combination found by the search
clf_edu.best_params_

### Testing:

In [None]:
# Test and predict based on the test set

y_hat = clf_edu.predict(X_test)

In [None]:
# Calculate accuracy, precision and recall on test set

accuracy = accuracy_score(y_test, y_hat)
precision = precision_score(y_test, y_hat, pos_label = "True")
recall = recall_score(y_test, y_hat, pos_label = "True")

In [None]:
# Print classification report

print(classification_report(y_test, y_hat))