# Лабораторная № 2. Векторное представление текстов

## 1. Скачаем данные отзывов о банковских услугах

In [None]:
import pandas as pd
file_path = 'financial_review.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,bank_href,login,review,bank_name,site,date,message_href,score,status,user_href
0,/kredit-evropa-bank.html,,"Брал кредит в данной организации, после погаше...",кредит европа банк,http://bankireview.ru,,,,,
1,/kredit-evropa-bank.html,,"В 2011 году,мною была оформлена мега карта в Т...",кредит европа банк,http://bankireview.ru,,,,,
2,/kredit-evropa-bank.html,,С родителями увидели рекламу вклада Осеннее пр...,кредит европа банк,http://bankireview.ru,,,,,
3,/kredit-evropa-bank.html,,Связать три года своей жизни с этим непорядочн...,кредит европа банк,http://bankireview.ru,,,,,
4,/kredit-evropa-bank.html,,"Пользуюсь месяц карточкой Card Plus, сегодня п...",кредит европа банк,http://bankireview.ru,,,,,


## 2. Векторное представление

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Преобразование типа данных столбца 'score' в числовой
data['score'] = pd.to_numeric(data['score'], errors='coerce')

In [None]:
# Преобразование оценок в бинарные метки
data['label'] = data['score'].apply(lambda x: 1 if x > 3 else 0)

In [None]:
y = data['label'].tolist()
X = data["review"].tolist()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# BOW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer_bow = CountVectorizer(max_features=100, ngram_range=(1, 1))

In [None]:
vectorizer_bow.fit(X_train)

In [None]:
X_train_bow = vectorizer_bow.transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer_tftidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

In [None]:
vectorizer_tftidf.fit(X_train)

In [None]:
X_train_tfidf = vectorizer_tftidf.transform(X_train)
X_test_tfidf = vectorizer_tftidf.transform(X_test)

# Word2vec

In [None]:
import gensim
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!wget -c "http://vectors.nlpl.eu/repository/20/180.zip" -O rusvectores_w2v.zip


--2024-02-21 13:34:55--  http://vectors.nlpl.eu/repository/20/180.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 484452317 (462M) [application/zip]
Saving to: ‘rusvectores_w2v.zip’


2024-02-21 13:35:15 (24.2 MB/s) - ‘rusvectores_w2v.zip’ saved [484452317/484452317]



In [None]:
!unzip rusvectores_w2v.zip


Archive:  rusvectores_w2v.zip
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


In [None]:
from gensim.models import KeyedVectors

model_path = 'model.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True)


In [None]:
model.vectors.shape

(189193, 300)

In [None]:
list(model.index_to_key)[:20]

['так_ADV',
 'быть_VERB',
 'мочь_VERB',
 'год_NOUN',
 'человек_NOUN',
 'xxxxxx_NUM',
 'сказать_VERB',
 'еще_ADV',
 'один_NUM',
 'говорить_VERB',
 'уже_ADV',
 'другой_ADJ',
 'время_NOUN',
 'xxxxxxxx_NUM',
 'знать_VERB',
 'сам_ADJ',
 'самый_ADJ',
 'делать_VERB',
 'дело_NOUN',
 'день_NOUN']

In [None]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def preprocess_text(text):
    processed_text = []
    for word in text.split():
        p = morph.parse(word)[0]
        pos = p.tag.POS
        word_tag = f"{p.normal_form}_{pos}" if pos else p.normal_form
        processed_text.append(word_tag)
    return processed_text

def text_to_vector(text, model):
    vectors = [model[word] for word in preprocess_text(text) if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X_train_w2v = np.array([text_to_vector(text, model) for text in X_train])
X_test_w2v = np.array([text_to_vector(text, model) for text in X_test])

# 4. Классификация

In [None]:
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve

In [None]:
forest_bow = RandomForestClassifier(n_estimators=100, random_state=0)
forest_bow.fit(X_train_bow, y_train)
y_pred_bow = forest_bow.predict(X_test_bow)
y_pred_prob_bow = forest_bow.predict_proba(X_test_bow)[:, 1]

In [None]:
forest_tfidf = RandomForestClassifier(n_estimators=100, random_state=0)
forest_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = forest_tfidf.predict(X_test_tfidf)
y_pred_prob_tfidf = forest_tfidf.predict_proba(X_test_tfidf)[:, 1]

In [None]:
forest_w2v = RandomForestClassifier(n_estimators=100, random_state=0)
forest_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = forest_w2v.predict(X_test_w2v)
y_pred_prob_w2v = forest_w2v.predict_proba(X_test_w2v)[:, 1]

# 5. Анализ точности

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test, y_pred_bow)

array([[4787,  111],
       [ 612,  443]])

In [None]:
confusion_matrix(y_test, y_pred_tfidf)

array([[4808,   90],
       [ 413,  642]])

In [None]:
confusion_matrix(y_test, y_pred_w2v)

array([[4838,   60],
       [ 683,  372]])

In [None]:
fig = go.Figure()
fig.add_shape(type="line", line=dict(dash="dash"), x0=0, x1=1, y0=0, y1=1)
name = "BoW"
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_bow)
auc_value = round(auc(fpr, tpr), 2)
fig.add_trace(go.Scatter(x=fpr, y=tpr, name=f"{name}: {auc_value}", mode="lines"))

name = "TF-IDF"
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_tfidf)
auc_value = round(auc(fpr, tpr), 2)
fig.add_trace(go.Scatter(x=fpr, y=tpr, name=f"{name}: {auc_value}", mode="lines"))

name = "Word2Vec"
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_w2v)
auc_value = round(auc(fpr, tpr), 2)
fig.add_trace(go.Scatter(x=fpr, y=tpr, name=f"{name}: {auc_value}", mode="lines"))

# Вывод графика
fig.update_layout(
    title="ROC Curves for Different Vector Representations",
    width=600,
    height=600,
    template="plotly_white",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    legend_title="Legend",
    legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=0.99),
)

fig.show()

При анализе точности классификации при использовании разных векторных представлений модель с использованием TF-IDF показала лучшие результаты.
