In [3]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import seaborn as sns
import sys
import numpy as np
from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt

import warnings
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")


## Load Dataset

In [7]:
# Resolve every dataset location relative to the notebook's working directory.
# os.path.realpath('__file__') with a *quoted* name never refers to this file;
# it only ever resolves against the working directory, so os.getcwd() is used
# directly. Each directory keeps a trailing separator (the empty last
# component of os.path.join) so `some_dir + 'file.csv'` concatenation still
# works for any code that builds paths that way.
dataset_dir = os.path.join(os.getcwd(), 'processed_datasets', '')
train_dir = os.path.join(dataset_dir, 'train', '')
test_dir = os.path.join(dataset_dir, 'test', '')
valid_dir = os.path.join(dataset_dir, 'val', '')

# Single source of truth for the non-HRI splits, in train / test / validation order.
dataset_files = [
    train_dir + 'non_hri_data_train.csv',
    test_dir + 'non_hri_data_test.csv',
    valid_dir + 'non_hri_data_val.csv',
]

# TF-IDF vectorizer with default settings; it is fitted in a later cell.
vectorizer = TfidfVectorizer()

# Read the three splits in the same order as `dataset_files`.
train_df, test_df, valid_df = (pd.read_csv(csv_path) for csv_path in dataset_files)

# HRI data set, stored directly under the dataset directory.
hri_df = pd.read_csv(os.path.join(dataset_dir, 'hri_data_cues_revised_label.csv'))

hri_df

Unnamed: 0,text,label,revised_label,valence,arousal,sentiment,speaker_utterance,predictions_ER,predictions_IP,predictions_EX
0,That sounds very interesting. And do you often...,0,0,0.406333,0.083333,positive,this is a dummy text,1,0,2
1,How long have you been together?,0,0,0.415000,-0.255000,neutral,this is a dummy text,0,0,2
2,how can they do that?,0,0,0.332000,-0.092000,negative,this is a dummy text,0,0,1
3,I hope that I can be there with you. Maybe can...,0,2,0.541500,-0.210500,positive,this is a dummy text,1,0,0
4,it's great to hear that who do you go there with?,0,2,0.485333,0.023333,positive,this is a dummy text,1,0,2
...,...,...,...,...,...,...,...,...,...,...
611,What do you listen to?,0,0,0.441000,-0.092000,neutral,this is a dummy text,0,0,2
612,"No, is it very famous?",0,0,0.832000,0.550000,neutral,this is a dummy text,0,0,2
613,My favorite subject at school is English. I li...,0,1,0.500154,-0.051231,positive,this is a dummy text,0,2,0
614,It sounds fun. What is your favorite thing to ...,0,2,0.545600,0.111200,positive,this is a dummy text,0,0,2


In [8]:


# Learn the TF-IDF vocabulary and weights from the training text only.
X_train = vectorizer.fit_transform(train_df['text'])

# Project the remaining text columns with the already-fitted vectorizer
# (test, validation, then HRI), so no split other than train influences the fit.
X_test, X_valid, X_hri = (
    vectorizer.transform(frame['text'])
    for frame in (test_df, valid_df, hri_df)
)

# Display the validation matrix as a quick sanity check of its shape.
X_valid

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 62839 stored elements and shape (4882, 16673)>

## Random Forest

In [9]:
# Random forest baseline on the TF-IDF features.
# random_state pins the forest's internal randomness so the reported metrics
# are reproducible across kernel restarts; without it every run yields
# different scores.
model_rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=3,
    criterion='gini',
    random_state=42,
)
model_rf.fit(X_train, train_df['label'])

In [17]:
def get_metrics(predicted, actual):
    """Print accuracy and macro-averaged precision, recall and F1 as percentages.

    Note the argument order: predictions come first, ground truth second
    (the reverse of the scikit-learn metric functions called inside).

    Args:
        predicted: Predicted labels.
        actual: Ground-truth labels, aligned with ``predicted``.

    Returns:
        None. The scores are only printed, on a single line.
    """
    scores = [accuracy_score(actual, predicted)]
    # Precision, recall and F1 are each averaged over classes with equal weight.
    for macro_scorer in (precision_score, recall_score, f1_score):
        scores.append(macro_scorer(actual, predicted, average='macro'))

    as_percent = [score * 100 for score in scores]
    print("Accuracy: {:.2f}%, Precision: {:.2f}% Recall: {:.2f}%, F1: {:.2f}%".format(*as_percent))


# Score the random forest on the non-HRI test split and on the HRI data.
predictions = model_rf.predict(X_test)
predictions_hri = model_rf.predict(X_hri)

# Non-HRI test split is scored against its 'label' column.
print('metrics non-hri')
get_metrics(predicted=predictions, actual=test_df['label'])

# HRI data is scored against the 'revised_label' column.
print('metrics hri')
get_metrics(predicted=predictions_hri, actual=hri_df['revised_label'])






metrics non-hri
Accuracy: 70.32%, Precision: 64.96% Recall: 63.35%, F1: 63.55%
metrics hri
Accuracy: 46.75%, Precision: 60.02% Recall: 43.76%, F1: 38.43%


## XGBoost

In [None]:
# Gradient-boosted trees on the TF-IDF features.
# The training log reports `mlogloss`, i.e. a multi-class objective is what
# actually gets trained, so declare 'multi:softprob' explicitly instead of
# 'binary:logistic'.
model_xg = xgb.XGBClassifier(
    n_jobs=-1,
    max_depth=10,
    booster='gbtree',
    device='cpu',
    n_estimators=100,
    objective='multi:softprob',
)
# The validation split is passed as eval_set so its loss is logged every boosting round.
model_xg.fit(X_train, train_df['label'], eval_set=[(X_valid, valid_df['label'])])


[0]	validation_0-mlogloss:0.97811
[1]	validation_0-mlogloss:0.90397
[2]	validation_0-mlogloss:0.85419
[3]	validation_0-mlogloss:0.81965
[4]	validation_0-mlogloss:0.79435
[5]	validation_0-mlogloss:0.77405
[6]	validation_0-mlogloss:0.75826
[7]	validation_0-mlogloss:0.74693
[8]	validation_0-mlogloss:0.73663
[9]	validation_0-mlogloss:0.72749
[10]	validation_0-mlogloss:0.72004
[11]	validation_0-mlogloss:0.71475
[12]	validation_0-mlogloss:0.70964
[13]	validation_0-mlogloss:0.70460
[14]	validation_0-mlogloss:0.70035
[15]	validation_0-mlogloss:0.69736
[16]	validation_0-mlogloss:0.69425
[17]	validation_0-mlogloss:0.69154
[18]	validation_0-mlogloss:0.68930
[19]	validation_0-mlogloss:0.68690
[20]	validation_0-mlogloss:0.68427
[21]	validation_0-mlogloss:0.68305
[22]	validation_0-mlogloss:0.68100
[23]	validation_0-mlogloss:0.67973
[24]	validation_0-mlogloss:0.67717
[25]	validation_0-mlogloss:0.67576
[26]	validation_0-mlogloss:0.67395
[27]	validation_0-mlogloss:0.67228
[28]	validation_0-mlogloss:0.6

In [None]:
# Score the XGBoost model on the non-HRI test split and on the HRI data.
# NOTE: this cell requires the XGBoost training cell to have completed first.
predictions = model_xg.predict(X_test)
predictions_hri = model_xg.predict(X_hri)

# Non-HRI test split is scored against its 'label' column.
print('metrics non-hri')
get_metrics(predicted=predictions, actual=test_df['label'])

# HRI data is scored against the 'revised_label' column.
print('metrics hri')
get_metrics(predicted=predictions_hri, actual=hri_df['revised_label'])

NameError: name 'model_xg' is not defined