In [3]:
import pandas as pd
import re
import pymorphy2
from catboost import CatBoostClassifier
import nltk
import spacy
import numpy as np
from collections import Counter
from sklearn import tree
import matplotlib.pyplot as plt
from IPython.display import SVG
from graphviz import Source
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import mean_squared_error, r2_score
import warnings
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    StratifiedKFold,
    learning_curve,
)
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)



pd.options.display.max_columns = 100
morph = pymorphy2.MorphAnalyzer()
nlp = spacy.blank("ru")
nltk.download("vader_lexicon")
nltk.download("punkt")
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("russian")
banks = [
    "Сбербанк",
    "ВТБ",
    "Альфа",
    "Тинькофф",
    "Сити",
    "Россельхозбанк",
    "Росбанк",
    "Райф",
    "МТС",
    "ИКС",
    "Уралсиб",
    "Банк России",
    "Банк Открытие",
]
warnings.filterwarnings("ignore")
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/egorkozlov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/egorkozlov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egorkozlov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv("train.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,sentence,1category,2category,sentiment
0,4754,При этом всегда получал качественные услуги.,Communication,,+
1,4417,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",?,,−
2,3629,"Вот так ""Мой любимый"" банк МКБ меня обманул.",?,,−
3,11640,Отвратительное отношение к клиентам.,Communication,,−
4,5571,"Всегда в любое время дня и ночи помогут, ответ...",Communication,,+


### Делаем наборы слов


In [6]:
def worder(df):
    result = " ".join(df.where(df["sentiment"] == "+").dropna(how="all")["sentence"])
    words = re.findall(r"\b\w+\b", result.lower())
    words = [word for word in words if word not in stopwords]
    words = [morph.parse(word)[0].normal_form for word in words]
    good = Counter(words)

    result = " ".join(df.where(df["sentiment"] == "−").dropna(how="all")["sentence"])
    words = re.findall(r"\b\w+\b", result.lower())
    words = [word for word in words if word not in stopwords]
    words = [morph.parse(word)[0].normal_form for word in words]
    bad = Counter(words)

    bad = bad.most_common(100)

    good = good.most_common(100)

    good_list = []
    bad_list = []
    for i in range(100):
        good_list.append(good[i][0])
        bad_list.append(bad[i][0])
    # Преобразование списков в множества
    set1 = set(good_list)
    set2 = set(bad_list)

    # Получение уникальных элементов каждого множества
    unique_set1 = set1.difference(set2)
    unique_set2 = set2.difference(set1)

    # Преобразование множеств обратно в списки
    only_good = list(unique_set1)
    only_bad = list(unique_set2)
    return only_good, only_bad

In [7]:
only_good, only_bad = worder(df)

### Генерируем признаки

In [8]:
# Функция, которая проверяет наличие даты в формате xx.xx.xxxx в строке
def has_date(date_string):
    patterns = [
        r"\b\d{2}\.\d{2}\.\d{4}\b",  # Формат xx.xx.xxxx
        r"\b\d{2}[-/]\d{2}[-/]\d{2}\b",  # Формат xx-xx-xx или xx/xx/xx
        # Дополнительные паттерны, включая варианты с названиями месяцев на русском языке
        r"\b\d{1,2} [Яя]нвар[яь] \d{4}\b",
        r"\b\d{1,2} [Фф]еврал[яь] \d{4}\b",
        r"\b\d{1,2} [Мм]арт[а] \d{4}\b",
        r"\b\d{1,2} [Аа]прел[яь] \d{4}\b",
        r"\b\d{1,2} [Мм]а[яй] \d{4}\b",
        r"\b\d{1,2} [Ии]юн[яь] \d{4}\b",
        r"\b\d{1,2} [Ии]юл[яь] \d{4}\b",
        r"\b\d{1,2} [Аа]вгуст[а] \d{4}\b",
        r"\b\d{1,2} [Сс]ентябр[яь] \d{4}\b",
        r"\b\d{1,2} [Оо]ктябр[яь] \d{4}\b",
        r"\b\d{1,2} [Нн]оябр[яь] \d{4}\b",
        r"\b\d{1,2} [Дд]екабр[яь] \d{4}\b",
    ]
    for pattern in patterns:
        match = re.search(pattern, date_string)
        if match:
            return 1
    return 0

# Функция, которая проверяет наличие даты в формате xx.xx.xxxx в строке
def has_ad(date_string):
    address_patterns = [
        r"([А-ЯЁ][а-яё]+(?:-+[А-ЯЁ][а-яё]+)*(?: [0-9]+[а-яё]*)(?:/[0-9]+)*)",
        r"([А-ЯЁ][а-яё]+(?:-+[А-ЯЁ][а-яё]+)*)[, ]+([А-ЯЁ][а-яё]+(?:-+[А-ЯЁ][а-яё]+)*(?: [0-9]+[а-яё]*)(?:/[0-9]+)*)",
        r"([А-ЯЁ][а-яё]+(?:-+[А-ЯЁ][а-яё]+)*(?: [0-9]+[а-яё]*)(?:/[0-9]+)*)(?:,| в)+([А-ЯЁ][а-яё]+(?:-+[А-ЯЁ][а-яё]+)*)",
    ]
    for pattern in address_patterns:
        match = re.search(pattern, date_string)
        if match is not None:
            return 1
    return 0

def has_phone_number(date_string):
    patterns = [r"\+7\d{10}", r"8\d{10}", r"8\s?\d{3}\s?\d{3}-?\d{2}-?\d{2}", r"\d{11}"]
    for pattern in patterns:
        match = re.search(pattern, date_string)
        if match is not None:
            return 1
    return 0

def len_review(date_string):
    return len(date_string)

def warning_sign1(date_string):
    if "!" in date_string:
        return 1
    else:
        return 0

def count_warning_signs(s):
    return s.count("!")

def caps_words_count(df):
    caps_words_count = []
    for sentence in df["sentence"]:
        caps_words = [word for word in sentence.split() if word.isupper()]
        caps_words_count.append(len(caps_words))
    df["caps_words_count"] = caps_words_count
    return df

def get_bracket_value(text):
    count_open = text.count("(")
    count_close = text.count(")")
    if count_open > count_close:
        return 1
    elif count_open < count_close:
        return 2
    else:
        return 0

def has_english_letters(s):
    """
    Функция принимает строку s и возвращает 1, если в ней есть английские буквы,
    и возвращает 0 в противном случае.
    """
    for char in s:
        if char.isalpha() and char.isascii():
            # Проверяем, является ли символ буквой и английским символом
            return 1
    return 0

def count_answer_sign(s):
    return s.count("!")

def count_dots(s):
    return s.count(".")

def count_minus(s):
    return s.count("-")

def count_plus(s):
    return s.count("+")

def гub(date_string):
    if "руб" in date_string or "тыс" in date_string:
        return 1
    else:
        return 0
        
df["num_digits"] = df["sentence"].str.count(r"\d")
df["has_date"] = df["sentence"].apply(has_date)
df["has_ad"] = df["sentence"].apply(has_ad)
df["has_phone_number"] = df["sentence"].apply(has_phone_number)
df["len_review"] = df["sentence"].apply(len_review)
df["count_warning_signs"] = df["sentence"].apply(count_warning_signs)
df = caps_words_count(df)
df["brackets"] = df["sentence"].apply(get_bracket_value)
df["has_english_letters"] = df["sentence"].apply(has_english_letters)
df["count_answer_sign"] = df["sentence"].apply(count_answer_sign)
df["count_dots"] = df["sentence"].apply(count_dots)
df["count_minus"] = df["sentence"].apply(count_minus)
df["count_plus"] = df["sentence"].apply(count_plus)
df["гub"] = df["sentence"].apply(гub)

In [9]:
df

Unnamed: 0.1,Unnamed: 0,sentence,1category,2category,sentiment,num_digits,has_date,has_ad,has_phone_number,len_review,count_warning_signs,caps_words_count,brackets,has_english_letters,count_answer_sign,count_dots,count_minus,count_plus,гub
0,4754,При этом всегда получал качественные услуги.,Communication,,+,0,0,0,0,44,0,0,0,0,0,1,0,0,0
1,4417,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",?,,−,2,0,0,0,49,1,0,0,0,1,0,0,0,0
2,3629,"Вот так ""Мой любимый"" банк МКБ меня обманул.",?,,−,0,0,0,0,44,0,1,0,0,0,1,0,0,0
3,11640,Отвратительное отношение к клиентам.,Communication,,−,0,0,0,0,36,0,0,0,0,0,1,0,0,0
4,5571,"Всегда в любое время дня и ночи помогут, ответ...",Communication,,+,0,0,0,0,56,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19356,8004,Никогда и ни в коем случае не открывайте счет ...,Communication,,−,0,0,0,0,109,1,0,0,0,1,1,0,0,0
19357,18182,ТИ откровенно забили на качество и развивают с...,Quality,,−,0,0,0,0,150,0,1,0,0,0,1,1,0,0
19358,744,"Я считаю, это прорыв и лидерство финансовых ус...",?,,+,0,0,0,0,109,1,1,0,0,1,0,0,0,0
19359,6220,"Писал мужчина очень доходчиво, не финансовым я...",Communication,,+,0,0,0,0,130,1,0,0,0,1,0,0,0,0


### Настроение отзыва

In [10]:
def mood(df):
    good_words_count = []
    bad_words_count = []
    result = " ".join(df.where(df["sentiment"] == "+").dropna(how="all")["sentence"])
    words = re.findall(r"\b\w+\b", result.lower())
    words = [word for word in words if word not in stopwords]
    words = [morph.parse(word)[0].normal_form for word in words]
    for sentence in df["sentence"]:
        words = re.findall(r"\b\w+\b", sentence.lower())
        words = [word for word in words if word not in stopwords]
        words = [morph.parse(word)[0].normal_form for word in words]
        good_counts = {word: words.count(word) for word in only_good}
        bad_counts = {word: words.count(word) for word in only_bad}
        good_words_count.append(good_counts)
        bad_words_count.append(bad_counts)
    df["good_words"] = good_words_count
    df["bad_words"] = bad_words_count
    df["good_c"] = 0
    df["bad_c"] = 0
    for i in range(len(df.index)):
        df["good_c"][df.index[i]] = sum(df["good_words"][df.index[i]].values())
        df["bad_c"][df.index[i]] = sum(df["bad_words"][df.index[i]].values())
    return df

df = mood(df)

### Названия банков

In [11]:
def bank_naming(df):
    def find_bank_names(docs):
        bank_names = []
        for doc in docs:
            found = False
            for token in doc:
                if token.text in banks:
                    found = True
                    bank_names.append(token.text)
                    break
            if not found:
                bank_names.append(None)
        return bank_names

    def mark_bank_name(df):
        docs = list(nlp.pipe(df["sentence"]))
        df["bank_name"] = find_bank_names(docs)
        return df

    df = mark_bank_name(df)
    groupby_bank = (
        df.groupby("bank_name")[["1category", "2category", "sentiment"]]
        .agg(lambda x: x.mode().iloc[0])
        .reset_index()
    )
    merged = pd.merge(df, groupby_bank, on="bank_name", how="left")
    for bank in banks:
        col_name = f"{bank}_1category_mode"
        merged[col_name] = merged["1category_y"].where(
            merged["bank_name"] == bank, np.nan
        )
        col_name = f"{bank}_2category_mode"
        merged[col_name] = merged["2category_y"].where(
            merged["bank_name"] == bank, np.nan
        )
        col_name = f"{bank}_sentiment_mode"
        merged[col_name] = merged["sentiment_y"].where(
            merged["bank_name"] == bank, np.nan
        )
    merged = merged.drop(columns=(["1category_y", "2category_y", "sentiment_y"]))
    bank_map = {bank_name: i for i, bank_name in enumerate(banks)}
    df["bank_name"] = df["bank_name"].map(bank_map)
    df["bank_name"].fillna(0, inplace=True)

    return df

df = bank_naming(df)

In [12]:
x = df.drop(
    columns=[
        "Unnamed: 0",
        "sentence",
        "1category",
        "2category",
        "sentiment",
        "good_words",
        "bad_words",
    ]
)
y = pd.DataFrame(df["sentiment"])
y["sentiment"] = y["sentiment"].replace({"+": 1, "−": 0, "?": 2})
y = y.astype(float)

In [13]:
df

Unnamed: 0.1,Unnamed: 0,sentence,1category,2category,sentiment,num_digits,has_date,has_ad,has_phone_number,len_review,count_warning_signs,caps_words_count,brackets,has_english_letters,count_answer_sign,count_dots,count_minus,count_plus,гub,good_words,bad_words,good_c,bad_c,bank_name
0,4754,При этом всегда получал качественные услуги.,Communication,,+,0,0,0,0,44,0,0,0,0,0,1,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",0,0,0.0
1,4417,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",?,,−,2,0,0,0,49,1,0,0,0,1,0,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",0,2,0.0
2,3629,"Вот так ""Мой любимый"" банк МКБ меня обманул.",?,,−,0,0,0,0,44,0,1,0,0,0,1,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",0,0,0.0
3,11640,Отвратительное отношение к клиентам.,Communication,,−,0,0,0,0,36,0,0,0,0,0,1,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",0,1,0.0
4,5571,"Всегда в любое время дня и ночи помогут, ответ...",Communication,,+,0,0,0,0,56,0,0,0,0,0,1,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 1, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19356,8004,Никогда и ни в коем случае не открывайте счет ...,Communication,,−,0,0,0,0,109,1,0,0,0,1,1,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",0,0,0.0
19357,18182,ТИ откровенно забили на качество и развивают с...,Quality,,−,0,0,0,0,150,0,1,0,0,0,1,1,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",1,0,0.0
19358,744,"Я считаю, это прорыв и лидерство финансовых ус...",?,,+,0,0,0,0,109,1,1,0,0,1,0,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",0,1,0.0
19359,6220,"Писал мужчина очень доходчиво, не финансовым я...",Communication,,+,0,0,0,0,130,1,0,0,0,1,0,0,0,0,"{'вежливый': 0, 'огромный': 0, 'ответить': 0, ...","{'итог': 0, 'банкомат': 0, 'написать': 0, 'гор...",3,2,0.0


### Обучение модели

In [14]:
# Разделение данных на тренировочный и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(x, y.values.ravel(), test_size=0.1)

# Создание объекта классификатора RandomForestClassifier
clf = RandomForestClassifier()

# Определение сетки гиперпараметров для GridSearchCV
param_grid = {"n_estimators": [50, 75, 100, 125, 150], "max_depth": [10, 15, 20]}

# Создание объекта GridSearchCV
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10, scoring="roc_auc_ovr")

# Обучение классификатора с использованием GridSearchCV
grid_search.fit(X_train, y_train)

# Вычисление ROC AUC для многоклассовой классификации с методом OvR
y_pred_proba = grid_search.predict_proba(X_test)
roc_auc_ovr = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")

# Вывод результатов
print("Best parameters:", grid_search.best_params_)
print("ROC AUC (OvR):", roc_auc_ovr)

Best parameters: {'max_depth': 20, 'n_estimators': 150}
ROC AUC (OvR): 0.9028571074929083


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=100, max_depth=10)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10)

In [16]:
y_pred_proba = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
print("ROC AUC score:", roc_auc)

ROC AUC score: 0.8381052548153698
