This notebook extracts LIWC embeddings and trains a simple classifier to predict moral sentiment.  
Parameter estimation/tuning can take some time on slower machines (~30 min).

## Load Packages

In [264]:
import pandas as pd
import numpy as np
import pickle as pkl
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC

import subprocess

# Label sets per corpus, keyed by the name of the labelling scheme.
# The "full" scheme of the "mfrc" corpus is the one read by train() and
# crossVal() to select the label columns.
foundations = {
    "mfrc": {
        "complete": [
            "care",
            "harm",
            "equality",
            "proportionality",
            "loyalty",
            "betrayal",
            "authority",
            "subversion",
            "purity",
            "degradation",
            "thin morality",
            "non-moral",
        ],
        "binding": [
            "individual",
            "binding",
            "proportionality",
            "thin morality",
            "non-moral",
        ],
        "moral": [
            "moral",
            "thin morality",
            "non-moral",
        ],
        "full": [
            "care",
            "proportionality",
            "loyalty",
            "authority",
            "purity",
            "equality",
            "thin morality",
            "non-moral",
        ],
    },
}

## Functions for training

In [267]:
def create_liwc(input_file, output_file):
    """Invoke the ``LIWC-22-cli`` executable in ``wc`` mode.

    Args:
        input_file: Value passed to the ``--input`` option.
        output_file: Value passed to the ``--output`` option.

    The exit status of the command is not inspected and nothing is returned.
    """
    # Option/value pairs in the order they are placed on the command line.
    options = {
        "--mode": "wc",
        "--input": input_file,
        "--column-indices": "1",
        "--output": output_file,
    }

    argv = ["LIWC-22-cli"]
    for flag, value in options.items():
        argv += [flag, value]

    subprocess.call(argv)

def train(mode, input_file, label_path, params, model):
    """Fit ``model`` once per label and pickle each fitted model to disk.

    Args:
        mode: Tag used only in the output file name
            (``../models/liwc_<label>_<mode>.sav``).
        input_file: CSV of LIWC vectors; the first column is used as the
            index and the ``Segment`` column is dropped.
        label_path: CSV containing one column per label listed in
            ``foundations["mfrc"]["full"]``.
        params: Sequence of keyword-argument dicts, one per label and in the
            same order as the label columns, applied via ``set_params``.
        model: Estimator exposing ``set_params`` and ``fit``. The same
            object is re-configured and re-fitted for every label.

    Returns:
        None. One pickle file is written per label.
    """
    X = pd.read_csv(input_file, index_col=0).drop(["Segment"], axis=1)  # load liwc vectors
    Y = pd.read_csv(label_path).loc[:, foundations["mfrc"]["full"]]  # extract labels

    # loop over N classes and fit classifier for each
    for i, c in enumerate(Y.columns):
        y = Y.iloc[:, i]
        print("Start training: " + c)
        model.set_params(**params[i]).fit(X, y)
        print("Saving the model")
        # Use a context manager so the file is flushed and closed before the
        # next label is trained (the original left the handle open).
        with open("../models/liwc_" + c + "_" + mode + ".sav", "wb") as model_file:
            pkl.dump(model, model_file)

def crossVal(mode, input_file, label_path, model):
    """Grid-search hyper-parameters for each label using stratified 5-fold CV.

    For every label column a ``GridSearchCV`` (scoring ``f1_macro``) is run
    over a fixed grid of ``C``, ``gamma`` and ``random_state`` values, and the
    best mean/std CV score is printed as a percentage.

    Args:
        mode: Unused; kept so the signature parallels ``train``.
        input_file: CSV of LIWC vectors; the first column is used as the
            index and the ``Segment`` column is dropped.
        label_path: CSV containing one column per label listed in
            ``foundations["mfrc"]["full"]``.
        model: Estimator handed to ``GridSearchCV``. The grid uses ``C``,
            ``gamma`` and ``random_state``, so the estimator must accept
            those parameters (e.g. ``SVC``).

    Returns:
        list[dict]: ``best_params_`` for each label, in label-column order.
    """
    X = pd.read_csv(input_file, index_col=0).drop(["Segment"], axis=1)  # load liwc vectors
    Y = pd.read_csv(label_path).loc[:, foundations["mfrc"]["full"]]  # extract labels

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    # parameter tuning can be improved with more sophisticated methods
    # (here only find regularization)
    params_SVC = {
        "C": [10**x for x in range(-3, 4)],
        "random_state": [0],
        "gamma": ["scale", "auto"],
    }

    params = []
    macro_score = []
    # loop over N classes and run a grid search for each
    for _label, y in Y.items():
        clf = GridSearchCV(model, params_SVC, cv=cv, scoring="f1_macro")
        clf.fit(X, y)

        best_mean_metric = clf.cv_results_["mean_test_score"][clf.best_index_]
        best_std_metric = clf.cv_results_["std_test_score"][clf.best_index_]
        macro_score.append(best_mean_metric)
        # Scores are fractions in [0, 1]; report both on the same 0-100 scale.
        print("Average CV metric: %.2f" % (best_mean_metric * 100))
        print("Standard deviation of CV metric: %.2f" % (best_std_metric * 100))
        params.append(clf.best_params_)  # save params for each label

    # The format string prints '%' signs, so convert fractions to percentages.
    print("%.2f%% (+/- %.2f%%)" % (np.mean(macro_score) * 100, np.std(macro_score) * 100))
    return params

## General Parameters

In [262]:
# Corpus to run on: MFRC (change to run on other corpora as necessary).
# Label scheme: the full MFT dimensions (see prepare_data for other ways of
# categorizing the moral values).
# training: "eval" to determine train/validation accuracy (e.g., when
# optimizing parameters), "normal" to train the model.

corp = "mfrc"
mode = "full"
training = "normal"

# Locations of the training files (labels and LIWC input features).
label_file = "../data/preprocessed/mfrc_train_full.csv"
liwc_file = "../data/preprocessed/mfrc_train_{}_liwc.csv".format(mode)

## Data Creation

In [263]:
# create_liwc(input_file, output_file) # extract liwc features (or use LIWC client)

## Train/Eval

In [None]:
# Estimator to tune; replace with the model of choice
# (e.g., SVC, logistic regression, neural network).
model = SVC()

# Find the best parameters per label for the given model using CV.
best_params = crossVal(
    mode=mode,
    input_file=liwc_file,
    label_path=label_file,
    model=model,
)

In [None]:
model = SVC() # replace with best performing model (e.g., logistic regression)
# `raw_file` is not defined in this notebook; the labels live in `label_file`,
# the same path that was passed to crossVal above.
train(mode, liwc_file, label_file, best_params, model)  ### train model using best parameters on all data