In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import classification_report
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, Phrases
pd.options.display.float_format = '{:,.4f}'.format
seed = 42
np.random.seed(seed)

In [2]:
df = pd.read_csv("task-3-dataset.csv") # Считываем данные

In [3]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer(lang='ru')
# Функция загрузки стопслов
def downloads_():
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords

# Функция обработки текта
def foo(review):
    # Обработка текста отзыва. Оставляем только буквы, приводим к нижнему регистру
    review = review.lower()
    review = review.replace('ё','е')
    review = re.sub('\[[^]]*\]', ' ', review)
    review = re.sub('[^а-я]', ' ', review)
    # Отделяем слова
    review = review.split()
    # Избавляемся от стоп-слов(предлоги,союзы, частицы, не несущие семантической нагрузки)
    review = [word for word in review if not word in set(stopwords.words('russian'))]
    # Лемматизируем(для русского языка в явном виде нет, но пока тк)
    # В явном виде лемматизации нет для русского языка,  SNOWBALL STEMMER как вариант
    lemmatized_words = [morph.normal_forms(word)[0] for word in review]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

In [4]:
# Разделим данные, пока нет тестовых
from sklearn.model_selection import train_test_split
df["разметка"].loc[df["разметка"]=="+"]=1
df["разметка"].loc[df["разметка"]=="-"]=0
df["отзывы"] = df["отзывы"].apply(foo)
#dataset_train, dataset_test, train_data_label, test_data_label = train_test_split(df['отзывы'], df['разметка'], test_size=0.2, random_state=42)

In [5]:
#Tokenization of text
tokenizer=ToktokTokenizer()

In [6]:
downloads_()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fayne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(min_df=0.0,max_df=1.0,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(df['отзывы'])
#transformed test reviews
print('Tfidf_train:',tv_train_reviews.shape)

Tfidf_train: (210, 3884)


In [8]:
# Применим метод главных компонент для того, чтобы сократить вектор TF-IDF
from sklearn.decomposition import PCA
def reduce_dimensionality(X, n_components=16): # число кубит
    pca = PCA(n_components=n_components)
    return pca.fit_transform(X)

In [9]:
# Обрежем данные
X = tv_train_reviews.copy()
y = df['разметка']
X_reduced = reduce_dimensionality(X)
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2)

In [10]:
import qiskit
from qiskit_aer import Aer
from qiskit.circuit import QuantumCircuit, Parameter
from qiskit.primitives import Sampler, Estimator
#from qiskit.algorithms.optimizers import COBYLA
from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.circuit.library import QNNCircuit



In [11]:
# Создадим квантовую схему
n_qubits = 4 # Число кубит
circuit = QuantumCircuit(n_qubits)
# Параметры для схемы
params = [Parameter(f'theta_{i}') for i in range(n_qubits * 2)]
# Кодирование данных
for i in range(n_qubits):
    circuit.ry(params[i], i)
# Параметризированные вращения
for i in range(n_qubits - 1):
    circuit.cx(i, i + 1)
# Добавляем измерения
circuit.measure_all()
# Рисуем схему в текстовом формате
print(circuit.draw(output='text'))

        ┌─────────────┐                ░ ┌─┐         
   q_0: ┤ Ry(theta_0) ├──■─────────────░─┤M├─────────
        ├─────────────┤┌─┴─┐           ░ └╥┘┌─┐      
   q_1: ┤ Ry(theta_1) ├┤ X ├──■────────░──╫─┤M├──────
        ├─────────────┤└───┘┌─┴─┐      ░  ║ └╥┘┌─┐   
   q_2: ┤ Ry(theta_2) ├─────┤ X ├──■───░──╫──╫─┤M├───
        ├─────────────┤     └───┘┌─┴─┐ ░  ║  ║ └╥┘┌─┐
   q_3: ┤ Ry(theta_3) ├──────────┤ X ├─░──╫──╫──╫─┤M├
        └─────────────┘          └───┘ ░  ║  ║  ║ └╥┘
meas: 4/══════════════════════════════════╩══╩══╩══╩═
                                          0  1  2  3 


In [12]:
from qiskit.circuit.library import RealAmplitudes, ZZFeatureMap

num_inputs = 4
feature_map = ZZFeatureMap(num_inputs)
ansatz = RealAmplitudes(num_inputs,reps=1)

circuit = QuantumCircuit(num_inputs)
circuit.compose(feature_map, inplace=True)
circuit.compose(ansatz, inplace=True)

def parity(x):
    return "{:b}".format(x).count("1") % 2

In [13]:
from qiskit_machine_learning.neural_networks import SamplerQNN
from qiskit.primitives import Sampler

sampler = Sampler()
sampler_qnn = SamplerQNN(
    circuit=circuit,
    input_params=feature_map.parameters,
    weight_params=ansatz.parameters,
    interpret=parity,
    output_shape=2,
    sampler=sampler,
)

  sampler = Sampler()
  sampler_qnn = SamplerQNN(


In [14]:
from qiskit_machine_learning.algorithms import NeuralNetworkClassifier
classifier = NeuralNetworkClassifier(
    neural_network=sampler_qnn,
    loss="cross_entropy",
    one_hot=True,
)
classifier.fit(np.asarray(X_train), np.asarray(y_train))

QiskitMachineLearningError: 'Input data has incorrect shape, last dimension is not equal to the number of inputs: 4, but got: 16.'

In [None]:
y_pred = classifier.predict(X_test)
y_pred = [int(i) for i in y_pred]
y_true = [int(i) for i in y_test]

NameError: name 'classifier' is not defined

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Вывод точности
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Вывод precision, recall и f1-score
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

NameError: name 'y_true' is not defined

# Pennylane

In [15]:
# Обрежем данные
X = tv_train_reviews.copy()
y = df['разметка']
X_reduced = reduce_dimensionality(X)
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2)

In [16]:
import pennylane as qml
import torch
import numpy as np
from torch.autograd import Variable
import torch.optim as optim

np.random.seed(0)
torch.manual_seed(0)

num_classes = 2
margin = 0.15
feature_size = 4
batch_size = 10
lr_adam = 0.01
train_split = 0.75
# the number of the required qubits is calculated from the number of features
num_qubits = int(np.ceil(np.log2(feature_size)))
num_layers = 6
total_iterations = 100

dev = qml.device("default.qubit", wires=num_qubits)

In [17]:
def layer(W):
    for i in range(num_qubits):
        qml.Rot(W[i, 0], W[i, 1], W[i, 2], wires=i)
    for j in range(num_qubits - 1):
        qml.CNOT(wires=[j, j + 1])
    if num_qubits >= 2:
        # Apply additional CNOT to entangle the last with the first qubit
        qml.CNOT(wires=[num_qubits - 1, 0])

In [18]:
def circuit(weights, feat=None):
    qml.AmplitudeEmbedding(feat, range(num_qubits), pad_with=0.0, normalize=True)

    for W in weights:
        layer(W)

    return qml.expval(qml.PauliZ(0))


qnodes = []
for iq in range(num_classes):
    qnode = qml.QNode(circuit, dev, interface="torch")
    qnodes.append(qnode)

In [19]:
def variational_classifier(q_circuit, params, feat):
    weights = params[0]
    bias = params[1]
    return q_circuit(weights, feat=feat) + bias

In [20]:
def multiclass_svm_loss(q_circuits, all_params, feature_vecs, true_labels):
    loss = 0
    num_samples = len(true_labels)
    for i, feature_vec in enumerate(feature_vecs):
        # Compute the score given to this sample by the classifier corresponding to the
        # true label. So for a true label of 1, get the score computed by classifer 1,
        # which distinguishes between "class 1" or "not class 1".
        s_true = variational_classifier(
            q_circuits[int(true_labels[i])],
            (all_params[0][int(true_labels[i])], all_params[1][int(true_labels[i])]),
            feature_vec,
        )
        s_true = s_true.float()
        li = 0

        # Get the scores computed for this sample by the other classifiers
        for j in range(num_classes):
            if j != int(true_labels[i]):
                s_j = variational_classifier(
                    q_circuits[j], (all_params[0][j], all_params[1][j]), feature_vec
                )
                s_j = s_j.float()
                li += torch.max(torch.zeros(1).float(), s_j - s_true + margin)
        loss += li

    return loss / num_samples

In [21]:
def classify(q_circuits, all_params, feature_vecs, labels):
    predicted_labels = []
    for i, feature_vec in enumerate(feature_vecs):
        scores = np.zeros(num_classes)
        for c in range(num_classes):
            score = variational_classifier(
                q_circuits[c], (all_params[0][c], all_params[1][c]), feature_vec
            )
            scores[c] = float(score)
        pred_class = np.argmax(scores)
        predicted_labels.append(pred_class)
    return predicted_labels

In [22]:
def accuracy(labels, hard_predictions):
    loss = 0
    for l, p in zip(labels, hard_predictions):
        if torch.abs(l - p) < 1e-5:
            loss = loss + 1
    loss = loss / labels.shape[0]
    return loss

In [23]:
def training(features, Y):
    num_data = Y.shape[0]
    feat_vecs_train, feat_vecs_test, Y_train, Y_test = X_train, X_test, y_train, y_test
    num_train = Y_train.shape[0]
    q_circuits = qnodes

    # Initialize the parameters
    all_weights = [
        Variable(0.1 * torch.randn(num_layers, num_qubits, 3), requires_grad=True)
        for i in range(num_classes)
    ]
    all_bias = [Variable(0.1 * torch.ones(1), requires_grad=True) for i in range(num_classes)]
    optimizer = optim.Adam(all_weights + all_bias, lr=lr_adam)
    params = (all_weights, all_bias)
    print("Num params: ", 3 * num_layers * num_qubits * 3 + 3)

    costs, train_acc, test_acc = [], [], []

    # train the variational classifier
    for it in range(total_iterations):
        batch_index = np.random.randint(0, num_train, (batch_size,))
        feat_vecs_train_batch = feat_vecs_train[batch_index]
        Y_train_batch = Y_train[batch_index]

        optimizer.zero_grad()
        curr_cost = multiclass_svm_loss(q_circuits, params, feat_vecs_train_batch, Y_train_batch)
        curr_cost.backward()
        optimizer.step()

        # Compute predictions on train and validation set
        predictions_train = classify(q_circuits, params, feat_vecs_train, Y_train)
        predictions_test = classify(q_circuits, params, feat_vecs_test, Y_test)
        acc_train = accuracy(Y_train, predictions_train)
        acc_test = accuracy(Y_test, predictions_test)

        print(
            "Iter: {:5d} | Cost: {:0.7f} | Acc train: {:0.7f} | Acc test: {:0.7f} "
            "".format(it + 1, curr_cost.item(), acc_train, acc_test)
        )

        costs.append(curr_cost.item())
        train_acc.append(acc_train)
        test_acc.append(acc_test)

    return costs, train_acc, test_acc


# We now run our training algorithm and plot the results. Note that
# for plotting, the matplotlib library is required

features, Y = np.asarray([0,0]),np.asarray([0])
costs, train_acc, test_acc = training(features, Y)

import matplotlib.pyplot as plt

fig, ax1 = plt.subplots()
iters = np.arange(0, total_iterations, 1)
colors = ["tab:red", "tab:blue"]
ax1.set_xlabel("Iteration", fontsize=17)
ax1.set_ylabel("Cost", fontsize=17, color=colors[0])
ax1.plot(iters, costs, color=colors[0], linewidth=4)
ax1.tick_params(axis="y", labelsize=14, labelcolor=colors[0])

ax2 = ax1.twinx()
ax2.set_ylabel("Test Acc.", fontsize=17, color=colors[1])
ax2.plot(iters, test_acc, color=colors[1], linewidth=4)

ax2.tick_params(axis="x", labelsize=14)
ax2.tick_params(axis="y", labelsize=14, labelcolor=colors[1])

plt.grid(False)
plt.tight_layout()
plt.show()

Num params:  111


KeyError: '[87, 70] not in index'

Num params:  250


IndexError: too many indices for tensor of dimension 1