In [None]:
import torch
from ClaD_misogyny import ClaDModel
import jsonlines
from tqdm import tqdm
import numpy as np
from transformers import XLNetTokenizer

# Dataset / checkpoint identifier shared by every path below.
NAME = 'misogyny'

DEV = f'./simcse-datasets/{NAME}/dev.txt'
TEST = f'./simcse-datasets/{NAME}/test.txt'

# Build the encoder and restore the saved weights on CPU.
model = ClaDModel(pretrained_model='xlnet-base-cased', pooling='cls')
# NOTE(review): torch.load unpickles the file; only load checkpoints that
# come from a trusted source.
state_dict = torch.load(
    f'./saved_model_{NAME}/simcse_sup_xlnet.pt',
    map_location=torch.device('cpu'),
)
# Remove this entry when present so it is not handed to load_state_dict.
state_dict.pop("bert.embeddings.position_ids", None)
model.load_state_dict(state_dict)
model.eval()

# Initialize the tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
sentence = "This is an example sentence."
def convert_to_embedding(sentence):
    """Encode `sentence` with the module-level tokenizer and model.

    The text is tokenized (truncated to at most 512 tokens), pushed through
    `model` with gradient tracking disabled, and the first row of the model's
    first output is returned as a NumPy array.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )

    with torch.no_grad():
        # The model returns a pair; only the first element is used here.
        embedding_batch, _ignored = model(**encoded)

    # Index 0 selects the single sentence from the batch dimension.
    return embedding_batch.numpy()[0]
        
# Smoke-test the encoder on the example sentence.
print(convert_to_embedding(sentence))

# Training split used as the source of sentences to embed.
src_path = f'./simcse-datasets/{NAME}/train.txt'

def get_positive_embeddings(path,name):
    """Embed every distinct sentence found in the `<name>1` / `<name>2` fields.

    Args:
        path: JSON-lines file; each record is read with `.get(name + '1')`
            and `.get(name + '2')`.
        name: Key prefix shared by the two sentence fields.

    Returns:
        np.ndarray with one row per distinct sentence, in first-seen order;
        each row is `convert_to_embedding(sentence)`.
    """
    # A dict keeps insertion order, so duplicates are dropped without the
    # run-to-run reshuffling that `set()` produces for strings.
    unique_sentences = {}
    with jsonlines.open(path, 'r') as reader:
        for line in tqdm(reader):
            for key in (name + '1', name + '2'):
                text = line.get(key)
                # A record lacking one of the fields yields None; skip it
                # instead of handing None to the tokenizer later on.
                if text is not None:
                    unique_sentences[text] = None

    positive_embeddings = np.array(
        [convert_to_embedding(text) for text in unique_sentences]
    )
    print(positive_embeddings.shape)
    return positive_embeddings


positive_embeddings = get_positive_embeddings(src_path, NAME)

# Per-dimension statistics over all embedded sentences.
mean = positive_embeddings.mean(axis=0)
std = positive_embeddings.std(axis=0)

# Z-score each dimension, then take the covariance between dimensions
# (rowvar=False: columns are the variables).
normalized_data = (positive_embeddings - mean) / std
covariance_matrix = np.cov(normalized_data, rowvar=False)




In [None]:
import jsonlines
from tqdm import tqdm


# Evaluation split and the three record fields passed to the loader.
test_path = f'./simcse-datasets/{NAME}/test.txt'
names = ['source', 'content', 'label']
def load_test_data(path,names):
    """Read evaluation records, keeping the first occurrence of each comment.

    Args:
        path: JSON-lines file to read.
        names: Three field names. Only `names[1]` (the comment text) and
            `names[2]` (its label) are used; `names[0]` is accepted for
            interface compatibility but its field is not returned.

    Returns:
        (comment, real): two parallel lists holding the distinct comment
        texts in first-seen order and the label of the record in which each
        text first appeared. Comment values must be hashable (e.g. strings).
    """
    comment = []
    real = []
    # Set membership keeps de-duplication linear; scanning the growing
    # `comment` list for every record was quadratic.
    seen = set()
    with jsonlines.open(path, 'r') as reader:
        for line in tqdm(reader):
            text = line.get(names[1])
            if text in seen:
                continue
            seen.add(text)
            comment.append(text)
            real.append(line.get(names[2]))
    return comment, real
# Load the de-duplicated test comments together with their labels.
comment,real = load_test_data(test_path,names)
# Number of distinct test comments (last expression, so it is the cell output).
len(comment)

In [None]:
import scipy.stats as stats
import numpy as np

def cal_mah(new_data, data_points):
    """Squared Mahalanobis distance of `new_data` from the pooled sample.

    The mean and covariance are estimated from `data_points` stacked together
    with `new_data`, i.e. the observation being scored is part of the sample
    it is compared against.

    Args:
        new_data: One observation with d features, given either as a flat
            vector or as a single-row (1, d) array.
        data_points: Reference sample of shape (n, d).

    Returns:
        Scalar squared distance `(x - mean)^T * pinv(cov) * (x - mean)`.
    """
    # Flatten so a single-row array behaves exactly like a flat vector.
    point = np.asarray(new_data).reshape(-1)

    pooled = np.vstack((point, data_points))
    center = np.mean(pooled, axis=0)
    covariance_matrix = np.cov(pooled, rowvar=False)

    offset = point - center
    # Pseudo-inverse: still defined when the covariance matrix is singular.
    precision = np.linalg.pinv(covariance_matrix)
    return np.dot(np.dot(offset, precision), offset)

def ag1(squared_distance, data_points,alpha):
    """Flag a squared Mahalanobis distance as normal (1) or abnormal (0).

    The distance is scaled to `T = (n + 1) / n**2 * squared_distance` and
    compared with the upper-`alpha` critical value of a Beta distribution
    with shape parameters `d / 2` and `(n - d) / 2`, where `(n, d)` is the
    shape of `data_points`.

    Args:
        squared_distance: Squared distance of the sample being tested.
        data_points: Reference sample of shape (n, d); only its shape is used.
        alpha: Significance level (upper-tail probability).

    Returns:
        0 if `T` exceeds the critical value (abnormal), otherwise 1 (normal).
    """
    import warnings

    n = data_points.shape[0]
    d = data_points.shape[1]
    T = (n+1)/(n**2) * squared_distance

    p = d/2  # Shape parameter alpha of Beta distribution
    q = (n-d)/2  # Shape parameter beta of Beta distribution
    if q <= 0:
        # With n <= d the second shape parameter is not positive, so the
        # critical value is not a usable number and the comparison below
        # labels every sample as normal. Surface that instead of hiding it.
        warnings.warn(
            f"ag1: need more reference rows than features (n={n}, d={d}); "
            "the Beta critical value is undefined and every sample will be "
            "reported as normal."
        )

    # Inverse survival function gives the upper-tail critical value directly.
    # Forming `1 - alpha` first rounds to 1.0 once alpha drops below machine
    # epsilon, which would push the threshold to the top of the support.
    critical_value_upper = stats.beta.isf(alpha, p, q)

    if T > critical_value_upper:
        return 0  # Abnormal (above the critical value)
    return 1  # Normal




In [None]:
# Reference sample that each test comment is measured against.
data = positive_embeddings

# One squared Mahalanobis distance per test comment, in `comment` order.
Squared_distance = [
    cal_mah(convert_to_embedding(text), data)
    for text in comment
]
    

In [None]:
import matplotlib.pyplot as plt
import numpy as np


categories = np.array(real)
lengths = np.array(Squared_distance)

# Label 0 is drawn in blue, every other label in red.
colors = ['red' if label != 0 else 'blue' for label in categories]

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(range(len(lengths)), lengths, c=colors, alpha=0.6)

ax.set_title('')
ax.set_xlabel('')
ax.set_ylabel('Squared Mahalanobis Distance')
ax.grid(True)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Significance levels to evaluate; one [accuracy, precision, recall, f1]
# row is appended to `results` per level.
alpha = [1e-16]
results = []

for a in alpha:
    # Classify every test distance at this significance level.
    pred_using_ag1 = [ag1(s, data, a) for s in Squared_distance]

    y_true, y_pred = real, pred_using_ag1

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    results.append([accuracy, precision, recall, f1])

    
    

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Pin the label order so the matrix is always 2x2; without `labels`, a run in
# which only one class occurs yields a 1x1 matrix and the unpacking fails.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

# False positive rate = FP / (FP + TN); undefined when there are no negatives.
negatives = fp + tn
fpr = fp / negatives if negatives else float('nan')

print(f'False Positive Rate: {fpr}')
