In [1]:
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# import math lib
from math import pi

# import Qiskit
from qiskit import Aer, execute#aer是模拟器
from qiskit import QuantumCircuit, ClassicalRegister, QuantumRegister

# import basic plot tools
from qiskit.tools.visualization import plot_histogram

In [3]:
backend = Aer.get_backend('aer_simulator')

In [4]:
def get_theta(d):
    i = 0
    theta = 0
    count = 0
    for i in d:
        count = i+count
   # print(d)
    theta = 2*math.acos(count/384)
   # print(theta)
    return theta

In [5]:
def get_Distance(x,y):
    theta_1 = get_theta(x)
    theta_2 = get_theta(y)
    
    qr = QuantumRegister(3, name="qr")
    cr = ClassicalRegister(1, name="cr")
    qc = QuantumCircuit(qr, cr, name="k_means")
    
    qc.h(qr[0])
    qc.h(qr[1])
    qc.h(qr[2])
    qc.u(theta_1, pi, pi, qr[1])
    qc.u(theta_2, pi, pi, qr[2])
    qc.cswap(qr[0], qr[1], qr[2])
    qc.h(qr[0])

    qc.measure(qr[0], cr[0])
    qc.reset(qr)

    job = execute(qc,backend=backend, shots=1024)
    result = job.result()
    data = result.data()['counts']
    
    if len(data)==1:
        return 0.0
    else:
        return data['0x1']/1024.0

In [6]:
def get_data(dataset_name,model_name,data_num):
    import datasets
    dataset = datasets.load_from_disk(dataset_name)
    from sentence_transformers import SentenceTransformer 
    model = SentenceTransformer(model_name,cache_folder=r"D:\HF-model\all-MiniLM-L6-v2")
    from random import sample 
    import random
    n = random.randint(1,999999)  
    random.seed(n)
    sentences = sample(dataset['test'][0]['sentences'],data_num) 
    random.seed(n) 
    labels =  sample(dataset['test'][0]['labels'],data_num)
    embeddings = model.encode(sentences)#
    points=embeddings
    #初始化聚类中心
    np.random.seed(96)
    centroids = np.random.random([32,384])
    return points,centroids,sentences,labels

In [7]:
def get_distance(p1, p2):
    return np.sqrt(np.sum((p1-p2)*(p1-p2)))

In [8]:
def find_nearest_neighbour(points,centroids):
    
    n = len(points)
    k = centroids.shape[0]
    centers = np.zeros(n)
    
    for i in range(n):
        min_dis = 10000
        ind = 0
        for j in range(k):
            temp_dis = get_Distance(points[i,:],centroids[j,:])
            
            if temp_dis < min_dis:
                min_dis = temp_dis
                ind = j
        centers[i] = ind
    
    return centers

In [9]:
def find_centroids(points,centers):
    n = len(points)
    k = int(np.max(centers))+1   
    centroids = np.zeros([k,384])
    for i in range(k):
        centroids[i,:] = np.average(points[centers==i])
    return centroids

In [10]:
def preprocess(points):
    from sklearn.preprocessing import MinMaxScaler 
    scaler = MinMaxScaler() 
    normalized_data = scaler.fit_transform(points)
    return points

In [11]:
"""
可选数据集：
'D:\HF dataset\mteb/arxiv-clustering-p2p'
'D:\HF dataset\mteb/arxiv-clustering-s2s'
'D:\HF dataset\mteb/biorxiv-clustering-p2p'
'D:\HF dataset\mteb/biorxiv-clustering-s2s'
'D:\HF dataset\mteb/medrxiv-clustering-s2s'
'D:\HF dataset\mteb/reddit-clustering'
'D:\HF dataset\mteb/reddit-clustering-p2p'
'D:\HF dataset\mteb/stackExchange-clustering'
'D:\HF dataset\mteb/stackExchange-clustering-p2p'
'D:\HF dataset\mteb/twentynewsgroups-clustering'
"""

"\n可选数据集：\n'D:\\HF dataset\\mteb/arxiv-clustering-p2p'\n'D:\\HF dataset\\mteb/arxiv-clustering-s2s'\n'D:\\HF dataset\\mteb/biorxiv-clustering-p2p'\n'D:\\HF dataset\\mteb/biorxiv-clustering-s2s'\n'D:\\HF dataset\\mteb/medrxiv-clustering-s2s'\n'D:\\HF dataset\\mteb/reddit-clustering'\n'D:\\HF dataset\\mteb/reddit-clustering-p2p'\n'D:\\HF dataset\\mteb/stackExchange-clustering'\n'D:\\HF dataset\\mteb/stackExchange-clustering-p2p'\n'D:\\HF dataset\\mteb/twentynewsgroups-clustering'\n"

In [12]:
#range中的t1：重复实验次数（每一遍都是同样大小数据集，但是选择的数据不同）
#t2 为QKmeans算法中迭代次数
t1 = 3
t2 = 3
for qq in range(1):
    output = np.array([['v_measure','标签种类(k)']])
        #选择数据集的模块
    if qq==0:
        dataset_name = 'D:\HF dataset\mteb/biorxiv-clustering-p2p'
    if qq==1:
        dataset_name = 'D:\HF dataset\mteb/biorxiv-clustering-s2s'
    if qq==2:
        dataset_name = 'D:\HF dataset\mteb/medrxiv-clustering-s2s'
    if qq==3:
        dataset_name = 'D:\HF dataset\mteb/reddit-clustering'
    if qq==4:
        dataset_name = 'D:\HF dataset\mteb/reddit-clustering-p2p'
    if qq==5:
        dataset_name = 'D:\HF dataset\mteb/stackExchange-clustering'
    if qq==6:
        dataset_name = 'D:\HF dataset\mteb/stackExchange-clustering-p2p'
    if qq==7:
        dataset_name = 'D:\HF dataset\mteb/twentynewsgroups-clustering'
    if qq==8:
        dataset_name = 'D:\HF dataset\mteb/arxiv-clustering-p2p'
    if qq==9:
        dataset_name = 'D:\HF dataset\mteb/arxiv-clustering-s2s'    
    #QKmeans
    for n1 in range(t1):

        #设置超参数
        model_name = 'all-MiniLM-L6-v2'
        data_num = 100
    
        points,centroids1,sentences,labels = get_data(dataset_name,model_name,data_num)       #dataset

        points = preprocess(points)                # Normalize dataset
        # run k-means algorithm
        for i1 in range(t2):
            centers = find_nearest_neighbour(points,centroids1)       # find nearest centers
            centroids = find_centroids(points,centers)               # find centroids
        #计算这组数据集中有多少不同种标签
        k=[0]*len(labels)
        count = 0
        flag1 = 0
        for i in labels: 
            flag1 = 0 
            for j in k: 
                if j == i: 
                    flag1 = 0
                    break 
                else:  
                    flag1 = 1
            if flag1 == 1:
                for n in range(len(labels)):
                    if k[n] == 0:
                        k[n] = i
                        break
                    else:
                        continue
        for m in k:
            if m == 0:
                break
            else:
                count = count + 1   
    
        #评价部分
        from mteb.evaluation.evaluators import ClusteringEvaluator
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2',cache_folder=r"D:\HF-model\all-MiniLM-L6-v2")
        labels = centers
        clusterer = ClusteringEvaluator(sentences=sentences, labels=labels)
        #result是dic类型
        result = clusterer(model)
        output = np.insert(output,n1+1,[str(result['v_measure']),str(count)],axis=0)
    output = np.insert(output,0,[dataset_name[19:],str(data_num)],axis=0)


    filename = '%s组重复 %s %s.csv'%(t1,dataset_name[19:],model_name)
    np.savetxt(filename,output,fmt='%s',delimiter=',')

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
