In [29]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import roc_auc_score, accuracy_score
from lightgbm import LGBMClassifier
import numpy as np

import joblib
import bz2
import regex
from tqdm import tqdm

In [20]:
responses = []
with open("../data/banki_responses.json", "r") as f:
    for line in tqdm(f):
        resp = json.loads(line)
        if not resp['rating_not_checked']:
            responses.append(resp)

201030it [00:04, 47162.14it/s]


In [21]:
responses = list(filter(lambda r: r['rating_grade'] is not None, responses))
texts = list(map(lambda r: regex.sub(r'[^\p{Cyrillic}]', ' ', r['text'].lower()), responses))
ratings = list(map(lambda r: r['rating_grade'], responses))

In [24]:
vectorizer = TfidfVectorizer(encoding='utf8', min_df=5)
_ = vectorizer.fit(texts)

In [41]:
joblib.dump(vectorizer, "../src/weights/tfifd.joblib")

['../src/weights/tfifd.joblib']

In [27]:
X = vectorizer.transform(texts)
Y = (np.array(ratings) > 3).astype(int)

In [31]:
cv = ShuffleSplit(n_splits=3, test_size=0.3)
for train_ids, test_ids in cv.split(X):
    gb = LGBMClassifier(n_estimators=500, device_type="gpu")
    gb.fit(X[train_ids], Y[train_ids])
    preds = gb.predict_proba(X[test_ids])[:,1]
    print('ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(Y[test_ids], preds), 
                                        accuracy_score(Y[test_ids], (preds > 0.5).astype(int))))

[LightGBM] [Info] Number of positive: 13012, number of negative: 49048
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1469711
[LightGBM] [Info] Number of data points in the train set: 62060, number of used features: 27180
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 25 dense feature groups (1.66 MB) transferred to GPU in 0.001822 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209668 -> initscore=-1.326927
[LightGBM] [Info] Start training from score -1.326927
ROC-AUC: 0.970, ACC: 0.945
[LightGBM] [Info] Number of positive: 12980, number of negative: 49080
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1470962
[LightGBM] [Info] Number of data points in the train set: 62060, number of used fea

In [32]:
gb = LGBMClassifier(n_estimators=500, device_type="gpu")
gb.fit(X[train_ids], Y[train_ids])

[LightGBM] [Info] Number of positive: 13001, number of negative: 49059
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1470258
[LightGBM] [Info] Number of data points in the train set: 62060, number of used features: 27218
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 25 dense feature groups (1.66 MB) transferred to GPU in 0.002327 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209491 -> initscore=-1.327997
[LightGBM] [Info] Start training from score -1.327997


In [34]:
joblib.dump(gb, "../src/weights/gradboost.joblib")

['gradboost.joblib']