In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import classification_report
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, Phrases
# Render floats with a thousands separator and four decimal places.
pd.set_option("display.float_format", '{:,.4f}'.format)
# Seed NumPy's global random generator so stochastic steps repeat between runs.
seed = 42
np.random.seed(seed)

In [2]:
# Read the dataset from the CSV file in the working directory.
dataset_path = "task-3-dataset.csv"
df = pd.read_csv(dataset_path)

In [3]:
import pymorphy3
# Shared pymorphy3 analyzer for Russian; foo() calls morph.normal_forms() on each word.
morph = pymorphy3.MorphAnalyzer(lang='ru')
# Функция загрузки стоп-слов
def downloads_():
    """Download the NLTK ``stopwords`` corpus.

    Takes no arguments and returns ``None``; the only effect is the call to
    ``nltk.download('stopwords')``.
    """
    # nltk is imported locally because the notebook only imports submodules at the top.
    import nltk
    nltk.download('stopwords')

# Функция обработки текста
# Stop-word sets keyed by language, filled on first use so the corpus is read
# once instead of once per word.
_STOPWORDS_CACHE = {}


def _russian_stopwords():
    """Return the NLTK Russian stop-word list as a frozenset, loading it only once."""
    if 'russian' not in _STOPWORDS_CACHE:
        try:
            words = stopwords.words('russian')
        except LookupError:
            # The corpus is not installed yet: fetch it on demand so that the
            # preprocessing does not depend on a download cell having run first.
            import nltk
            nltk.download('stopwords')
            words = stopwords.words('russian')
        _STOPWORDS_CACHE['russian'] = frozenset(words)
    return _STOPWORDS_CACHE['russian']


def foo(review):
    """Clean and lemmatize one review.

    Steps, in order: lower-case the text, replace 'ё' with 'е', blank out
    anything inside square brackets, blank out every character outside 'а'-'я',
    split on whitespace, drop NLTK Russian stop words, and replace each
    remaining word with the first normal form reported by ``morph``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces (empty string if nothing
        is left).
    """
    review = review.lower()
    review = review.replace('ё', 'е')
    # Raw string: the pattern contains backslashes that are not valid str escapes.
    review = re.sub(r'\[[^]]*\]', ' ', review)
    review = re.sub('[^а-я]', ' ', review)
    # Look the stop-word set up once per call rather than once per word.
    stop_words = _russian_stopwords()
    tokens = [word for word in review.split() if word not in stop_words]
    # Lemmatize with pymorphy3: keep the first normal form of every word.
    lemmatized_words = [morph.normal_forms(word)[0] for word in tokens]
    return ' '.join(lemmatized_words)

In [4]:
# Разделим данные, пока нет тестовых
from sklearn.model_selection import train_test_split
# Encode the labels as integers: "+" -> 1, "-" -> 0.
# A single .loc[rows, column] call writes into df itself. The chained form
# df[col].loc[mask] = value assigns through an intermediate Series, which raises
# SettingWithCopyWarning and does not reach the frame under pandas Copy-on-Write.
df.loc[df["разметка"] == "+", "разметка"] = 1
df.loc[df["разметка"] == "-", "разметка"] = 0
# Clean and lemmatize every review text.
df["отзывы"] = df["отзывы"].apply(foo)
#dataset_train, dataset_test, train_data_label, test_data_label = train_test_split(df['отзывы'], df['разметка'], test_size=0.2, random_state=42)

In [5]:
# Tokenizer instance for splitting text into tokens.
# NOTE(review): `tokenizer` is not referenced by any other cell visible here - confirm it is still needed.
tokenizer=ToktokTokenizer()

In [None]:
# Fetch the NLTK stopwords corpus that foo() filters against.
# NOTE(review): foo() is applied in an earlier cell; if that cell needs the corpus
# to be present already, this call should run before it.
downloads_()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fayne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word uni-, bi- and tri-grams of the review texts in df['отзывы'].
tv = TfidfVectorizer(
    ngram_range=(1, 3),
    use_idf=True,
    min_df=0.0,
    max_df=1.0,
)
# Learn the vocabulary and vectorize the whole review column in one pass.
tv_train_reviews = tv.fit_transform(df['отзывы'])
print('Tfidf_train:', tv_train_reviews.shape)

Tfidf_train: (210, 3884)


In [None]:
# Применим метод главных компонент для того, чтобы сократить вектор TF-IDF
from sklearn.decomposition import PCA
def reduce_dimensionality(X, n_components=4, random_state=None): # number of qubits
    """Project the rows of ``X`` onto their first principal components.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Feature matrix with one row per sample.
    n_components : int, default 4
        Number of components to keep (chosen to match the number of qubits).
    random_state : int or None, default None
        Forwarded to ``PCA`` so that randomized solvers can be made repeatable.

    Returns
    -------
    The transformed data returned by ``PCA.fit_transform``.
    """
    from scipy.sparse import issparse

    # PCA centres the data, and not every scikit-learn release accepts sparse
    # input with the default solver, so convert sparse matrices to dense first.
    if issparse(X):
        X = X.toarray()
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(X)

In [12]:
# Reduce the TF-IDF features and split them into train / test subsets.
X = tv_train_reviews.copy()
y = df['разметка']
X_reduced = reduce_dimensionality(X)
# Pin the shuffle to the notebook-wide seed so the split does not depend on how
# much of the global NumPy random stream earlier cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=seed
)

In [None]:
import qiskit
from qiskit_aer import Aer
from qiskit.circuit import QuantumCircuit, Parameter
from qiskit.primitives import Sampler, Estimator
#from qiskit.algorithms.optimizers import COBYLA
from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.circuit.library import QNNCircuit



In [62]:
# Build a small parameterized circuit.
n_qubits = 4  # number of qubits
circuit = QuantumCircuit(n_qubits)
# Twice as many symbolic angles as qubits are created; only the first n_qubits
# of them are placed on the circuit below.
params = [Parameter(f'theta_{idx}') for idx in range(2 * n_qubits)]
# Rotation layer: one RY gate per qubit, each with its own angle.
for qubit, theta in enumerate(params[:n_qubits]):
    circuit.ry(theta, qubit)
# Entangling layer: a CNOT from every qubit to its next neighbour.
for control in range(n_qubits - 1):
    circuit.cx(control, control + 1)
# Append a measurement of every qubit.
circuit.measure_all()
# Print the circuit as a text drawing.
print(circuit.draw(output='text'))

        ┌─────────────┐                ░ ┌─┐         
   q_0: ┤ Ry(theta_0) ├──■─────────────░─┤M├─────────
        ├─────────────┤┌─┴─┐           ░ └╥┘┌─┐      
   q_1: ┤ Ry(theta_1) ├┤ X ├──■────────░──╫─┤M├──────
        ├─────────────┤└───┘┌─┴─┐      ░  ║ └╥┘┌─┐   
   q_2: ┤ Ry(theta_2) ├─────┤ X ├──■───░──╫──╫─┤M├───
        ├─────────────┤     └───┘┌─┴─┐ ░  ║  ║ └╥┘┌─┐
   q_3: ┤ Ry(theta_3) ├──────────┤ X ├─░──╫──╫──╫─┤M├
        └─────────────┘          └───┘ ░  ║  ║  ║ └╥┘
meas: 4/══════════════════════════════════╩══╩══╩══╩═
                                          0  1  2  3 


In [64]:
from qiskit.circuit.library import RealAmplitudes, ZZFeatureMap

# Width shared by the feature map and the ansatz.
num_inputs = 4
feature_map = ZZFeatureMap(num_inputs)
ansatz = RealAmplitudes(num_inputs, reps=1)

# Stack the two parts, feature map first, onto one empty circuit.
circuit = QuantumCircuit(num_inputs)
for layer in (feature_map, ansatz):
    circuit.compose(layer, inplace=True)

def parity(x):
    """Return 1 if the binary representation of ``x`` has an odd number of ones, else 0."""
    binary_digits = "{:b}".format(x)
    ones = binary_digits.count("1")
    return ones % 2

In [65]:
from qiskit_machine_learning.neural_networks import SamplerQNN
from qiskit.primitives import Sampler

# Sampler primitive handed to the network.
sampler = Sampler()
# Sampler-based QNN: the feature-map parameters are the inputs, the ansatz
# parameters are the weights, and parity() is the interpret function with two
# possible output values.
sampler_qnn = SamplerQNN(
    sampler=sampler,
    circuit=circuit,
    input_params=feature_map.parameters,
    weight_params=ansatz.parameters,
    interpret=parity,
    output_shape=2,
)

  sampler = Sampler()
  sampler_qnn = SamplerQNN(


In [None]:
from qiskit_machine_learning.algorithms import NeuralNetworkClassifier
# Wrap the QNN in a classifier trained with cross-entropy on one-hot targets.
classifier = NeuralNetworkClassifier(
    one_hot=True,
    loss="cross_entropy",
    neural_network=sampler_qnn,
)
# Convert the training split to NumPy arrays and fit.
train_features = np.asarray(X_train)
train_labels = np.asarray(y_train)
classifier.fit(train_features, train_labels)