# Классическое решение

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import classification_report
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
pd.options.display.float_format = '{:,.4f}'.format
seed = 42
np.random.seed(seed)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import time
import re

from sklearn.metrics import classification_report

from bs4 import BeautifulSoup


from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Tools for creating ngrams and vectorizing input data
from gensim.models import Word2Vec, Phrases

# Configs
pd.options.display.float_format = '{:,.4f}'.format
sns.set(style="whitegrid")
seed = 42
np.random.seed(seed)

In [None]:
df = pd.read_csv("task-3-dataset.csv") # Считываем данные
df2 = pd.read_csv("test50.csv")

In [None]:
!pip install pymorphy3



In [None]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer()
# Функция загрузки стопслов
def downloads_():
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
# Функция обработки текта
def foo(review, morph):
    # Обработка текста отзыва. Оставляем только буквы, приводим к нижнему регистру
    review = re.sub('\[[^]]*\]', ' ', review)
    review = re.sub('[^а-яА-Я]', ' ', review)
    review = review.lower()
    # Отделяем слова
    review = review.split()
    # Избавляемся от стоп-слов(предлоги,союзы, частицы, не несущие семантической нагрузки)
    review = [word for word in review if not word in set(stopwords.words('russian'))]
    # Лемматизируем(для русского языка в явном виде нет, но пока тк)
    # В явном виде лемматизации нет для русского языка,  SNOWBALL STEMMER как вариант
    lemmatized_words = [morph.normal_forms(word)[0] for word in review]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

In [None]:
# Разделим данные, пока нет тестовых
from sklearn.model_selection import train_test_split
# dataset_train, dataset_test, train_data_label, test_data_label = train_test_split(df['отзывы'], df['разметка'], test_size=0.2, random_state=42)
dataset_train = df['отзывы']
train_data_label = df['разметка']

dataset_test = df2["Отзывы"]
test_data_label = df2["разметка"]

train_data_label

Unnamed: 0,разметка
0,-
1,+
2,-
3,-
4,+
...,...
205,+
206,+
207,-
208,-


In [None]:
# Сформируем тестовый и трейновый словари
corpus_train = []
corpus_test  = []

downloads_()
for i in range(dataset_train.shape[0]):
    review = dataset_train.iloc[i]
    review = foo(review, morph)
    corpus_train.append(review)

for j in range(dataset_test.shape[0]):
    review = dataset_test.iloc[j]
    review = foo(review, morph)
    corpus_test.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Векторизуем с помощью TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3))

tfidf_vec_train = tfidf_vec.fit_transform(corpus_train)
tfidf_vec_test = tfidf_vec.transform(corpus_test)


In [None]:
# Обучаем
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(C=0.5, random_state=42)
linear_svc.fit(tfidf_vec_train, train_data_label)

predict = linear_svc.predict(tfidf_vec_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Classification Report: \n", classification_report(test_data_label, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict))
print("Accuracy: \n", accuracy_score(test_data_label, predict))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.58      0.83      0.68        23
    Positive       0.76      0.48      0.59        27

    accuracy                           0.64        50
   macro avg       0.67      0.65      0.63        50
weighted avg       0.68      0.64      0.63        50

Confusion Matrix: 
 [[19  4]
 [14 13]]
Accuracy: 
 0.64


In [None]:
# Векторизуем с помощью другого векторизатора
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train = count_vec.fit_transform(corpus_train)
count_vec_test = count_vec.transform(corpus_test)

linear_svc_count = LinearSVC(C=0.5, random_state=42, max_iter=5000)
linear_svc_count.fit(count_vec_train, train_data_label)
predict_count = linear_svc_count.predict(count_vec_test)

print("Classification Report: \n", classification_report(test_data_label, predict_count,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_count))
print("Accuracy: \n", accuracy_score(test_data_label, predict_count))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.52      0.61      0.56        23
    Positive       0.61      0.52      0.56        27

    accuracy                           0.56        50
   macro avg       0.56      0.56      0.56        50
weighted avg       0.57      0.56      0.56        50

Confusion Matrix: 
 [[14  9]
 [13 14]]
Accuracy: 
 0.56


In [None]:
# И снова другой векторизатор
ind_vec = CountVectorizer(ngram_range=(1, 3), binary=True)
ind_vec_train = ind_vec.fit_transform(corpus_train)
ind_vec_test = ind_vec.transform(corpus_test)

linear_svc_ind = LinearSVC(C=0.5, random_state=42)
linear_svc_ind.fit(ind_vec_train, train_data_label)
predict_ind = linear_svc_ind.predict(ind_vec_test)

print("Classification Report: \n", classification_report(test_data_label, predict_ind,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_ind))
print("Accuracy: \n", accuracy_score(test_data_label, predict_ind))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.54      0.61      0.57        23
    Positive       0.62      0.56      0.59        27

    accuracy                           0.58        50
   macro avg       0.58      0.58      0.58        50
weighted avg       0.59      0.58      0.58        50

Confusion Matrix: 
 [[14  9]
 [12 15]]
Accuracy: 
 0.58


In [None]:
# TF_IDF дал лучший результат, используем его, добаим наивный байесовский классификатор
tfidf_vec_NB = TfidfVectorizer(ngram_range=(1, 1))
tfidf_vec_train_NB = tfidf_vec_NB.fit_transform(corpus_train)

tfidf_vec_test_NB = tfidf_vec_NB.transform(corpus_test)

print(tfidf_vec_train_NB.toarray().shape, tfidf_vec_test_NB.toarray().shape)

(210, 857) (50, 857)


In [None]:
from sklearn.feature_selection import SelectKBest, chi2

ch2 = SelectKBest(chi2, k=50000)
tfidf_vec_train_NB = ch2.fit_transform(tfidf_vec_train_NB, train_data_label)
tfidf_vec_test_NB  = ch2.transform(tfidf_vec_test_NB)



In [None]:
feature_names = tfidf_vec_NB.get_feature_names_out()
feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
feature_names = np.asarray(feature_names)

from sklearn.naive_bayes import MultinomialNB
multi_clf = MultinomialNB()
multi_clf.fit(tfidf_vec_train_NB, train_data_label)
predict_NB = multi_clf.predict(tfidf_vec_test_NB)

print("Classification Report: \n", classification_report(test_data_label, predict_NB,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_NB))
print("Accuracy: \n", accuracy_score(test_data_label, predict_NB))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.55      0.96      0.70        23
    Positive       0.90      0.33      0.49        27

    accuracy                           0.62        50
   macro avg       0.73      0.64      0.59        50
weighted avg       0.74      0.62      0.58        50

Confusion Matrix: 
 [[22  1]
 [18  9]]
Accuracy: 
 0.62


In [None]:
count_vec_NB = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train_NB = count_vec_NB.fit_transform(corpus_train)
count_vec_test_NB = count_vec_NB.transform(corpus_test)

multi_clf_count = MultinomialNB()
multi_clf_count.fit(count_vec_train_NB, train_data_label)
predict_NB_count = multi_clf_count.predict(count_vec_test_NB)

print("Classification Report: \n", classification_report(test_data_label, predict_NB_count,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_NB_count))
print("Accuracy: \n", accuracy_score(test_data_label, predict_NB_count))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.61      0.83      0.70        23
    Positive       0.79      0.56      0.65        27

    accuracy                           0.68        50
   macro avg       0.70      0.69      0.68        50
weighted avg       0.71      0.68      0.68        50

Confusion Matrix: 
 [[19  4]
 [12 15]]
Accuracy: 
 0.68


# Квантовое решение

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [43]:
df = pd.read_csv("task-3-dataset.csv") # Считываем данные
df2 = pd.read_csv("test50.csv")

In [5]:
!pip install pymorphy3

Collecting pymorphy3
  Downloading pymorphy3-2.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg-python, pymorphy3
Successfully installed dawg-python-0.7.2 pymorphy3-2.0.2 pymorphy3-dicts-ru-2.4.417150.4580142


In [6]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer(lang='ru')
# Функция загрузки стопслов
def downloads_():
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords

# Функция обработки текта
def foo(review):
    # Обработка текста отзыва. Оставляем только буквы, приводим к нижнему регистру
    review = review.lower()
    review = review.replace('ё','е')
    review = re.sub('\[[^]]*\]', ' ', review)
    review = re.sub('[^а-я]', ' ', review)
    # Отделяем слова
    review = review.split()
    # Избавляемся от стоп-слов(предлоги,союзы, частицы, не несущие семантической нагрузки)
    review = [word for word in review if not word in set(stopwords.words('russian'))]
    # Лемматизируем(для русского языка в явном виде нет, но пока тк)
    # В явном виде лемматизации нет для русского языка,  SNOWBALL STEMMER как вариант
    lemmatized_words = [morph.normal_forms(word)[0] for word in review]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

In [46]:
# Разделим данные, пока нет тестовых
from sklearn.model_selection import train_test_split
df["разметка"].loc[df["разметка"]=="+"]=1
df["разметка"].loc[df["разметка"]=="-"]=0
df["отзывы"] = df["отзывы"].apply(foo)

df2["разметка"].loc[df2["разметка"]=="+"]=1
df2["разметка"].loc[df2["разметка"]=="-"]=0
df2["Отзывы"] = df2["Отзывы"].apply(foo)

#dataset_train, dataset_test, train_data_label, test_data_label = train_test_split(df['отзывы'], df['разметка'], test_size=0.2, random_state=42)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["разметка"].loc[df["разметка"]=="+"]=1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame 

In [10]:
#Tokenization of text
tokenizer=ToktokTokenizer()

In [11]:
downloads_()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(min_df=0.0,max_df=1.0,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(df['отзывы'])
#transformed test reviews
print('Tfidf_train:',tv_train_reviews.shape)

Tfidf_train: (210, 3884)


In [30]:
# Применим метод главных компонент для того, чтобы сократить вектор TF-IDF
from sklearn.decomposition import PCA
def reduce_dimensionality(X, n_components=4): # число кубит
    pca = PCA(n_components=n_components)
    return pca.fit_transform(X)

In [31]:
# Обрежем данные
X = tv_train_reviews.copy()
y = df['разметка']
X_reduced = reduce_dimensionality(X)
X_train = X_reduced.copy()
y_train = y.copy()
#X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2)

In [32]:
!pip install qiskit



In [33]:
!pip install qiskit_aer



In [34]:
!pip install qiskit_machine_learning



In [35]:
import qiskit
from qiskit_aer import Aer
from qiskit.circuit import QuantumCircuit, Parameter
from qiskit.primitives import Sampler, Estimator
#from qiskit.algorithms.optimizers import COBYLA
from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN
from qiskit_machine_learning.connectors import TorchConnector
from qiskit_machine_learning.circuit.library import QNNCircuit

In [36]:
# Создадим квантовую схему
n_qubits = 2 # Число кубит
circuit = QuantumCircuit(n_qubits)
# Параметры для схемы
params = [Parameter(f'theta_{i}') for i in range(n_qubits * 2)]
# Кодирование данных
for i in range(n_qubits):
    circuit.ry(params[i], i)
# Параметризированные вращения
for i in range(n_qubits - 1):
    circuit.cx(i, i + 1)
# Добавляем измерения
circuit.measure_all()
# Рисуем схему в текстовом формате
print(circuit.draw(output='text'))

        ┌─────────────┐      ░ ┌─┐   
   q_0: ┤ Ry(theta_0) ├──■───░─┤M├───
        ├─────────────┤┌─┴─┐ ░ └╥┘┌─┐
   q_1: ┤ Ry(theta_1) ├┤ X ├─░──╫─┤M├
        └─────────────┘└───┘ ░  ║ └╥┘
meas: 2/════════════════════════╩══╩═
                                0  1 


In [37]:
from qiskit.circuit.library import RealAmplitudes, ZZFeatureMap

num_inputs = 4
feature_map = ZZFeatureMap(num_inputs)
ansatz = RealAmplitudes(num_inputs,reps=1)

circuit = QuantumCircuit(num_inputs)
circuit.compose(feature_map, inplace=True)
circuit.compose(ansatz, inplace=True)

def parity(x):
    return "{:b}".format(x).count("1") % 2

In [38]:
from qiskit_machine_learning.neural_networks import SamplerQNN
from qiskit.primitives import Sampler

sampler = Sampler()
sampler_qnn = SamplerQNN(
    circuit=circuit,
    input_params=feature_map.parameters,
    weight_params=ansatz.parameters,
    interpret=parity,
    output_shape=2,
    sampler=sampler,
)

  sampler = Sampler()
  sampler_qnn = SamplerQNN(


In [39]:
from qiskit_machine_learning.algorithms import NeuralNetworkClassifier
classifier = NeuralNetworkClassifier(
    neural_network=sampler_qnn,
    loss="cross_entropy",
    one_hot=True,
)
classifier.fit(np.asarray(X_train), np.asarray(y_train))

<qiskit_machine_learning.algorithms.classifiers.neural_network_classifier.NeuralNetworkClassifier at 0x788291001030>

In [42]:
df = pd.read_csv("test50.csv") # Считываем данные

In [49]:
tv2 = tv.fit_transform(df2['Отзывы'])
X_test = tv2.copy()
y_test = df2['разметка']
X_test = reduce_dimensionality(X_test)

In [50]:
y_pred = classifier.predict(X_test)
y_pred = [int(i) for i in y_pred]
y_true = [int(i) for i in y_test]

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Вывод точности
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Вывод precision, recall и f1-score
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.48
Precision: 0.49
Recall: 0.48
F1 Score: 0.47
