# Load Dataset

In [1]:
# !pip install .

## Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import datasets
from datasets import Dataset, DatasetDict
from datasets import load_dataset

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import load_model

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [2]:
hf_dataset = load_dataset("fthbrmnby/turkish_product_reviews", download_mode="force_redownload")
hf_dataset['train'].features['sentiment']

Generating train split: 100%|██████████| 235165/235165 [00:00<00:00, 977623.32 examples/s]


ClassLabel(names=['negative', 'positive'], id=None)

In [3]:
hf_dataset.set_format(type="pandas")

In [4]:
def label_int2str(row):
    return hf_dataset["train"].features["sentiment"].int2str(row)

In [5]:
from sklearn.model_selection import train_test_split

df_all = hf_dataset["train"][:].sample(frac=0.1).copy()
train_df, test_df = train_test_split(df_all, test_size=0.3, random_state=42, stratify=df_all['sentiment'])
print(train_df['sentiment'].value_counts())
print(test_df['sentiment'].value_counts())
X_train = train_df['sentence'].copy()
y_train = train_df['sentiment'].copy()
X_test = test_df['sentence'].copy()
y_test = test_df['sentiment'].copy()

sentiment
1    15438
0     1023
Name: count, dtype: int64
sentiment
1    6616
0     439
Name: count, dtype: int64


# Modelling

In [6]:
# define model
model = BertClassifier()         # text/text pair classification
# try different options...
model.bert_model = 'bert-base-turkish-128k-uncased'
model.num_mlp_layers = 1
model.max_seq_length = 128
model.epochs = 1
model.learning_rate = 1e-1
model.gradient_accumulation_steps = 1
# finetune model
model.fit(X_train, y_train)

Building sklearn text classifier...
Loading bert-base-turkish-128k-uncased model...
Using mlp with D=768,H=500,K=2,n=1
Loading Pytorch checkpoint


  state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)


train data size: 14815, validation data size: 1646


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha = 1) (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1581.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  : 100%|██████████| 462/462 [09:38<00:00,  1.25s/it, loss=0.441]
Validating: 100%|██████████| 206/206 [00:28<00:00,  7.13it/s]

Epoch 1, Train loss: 0.4414, Val loss: 0.2457, Val accy: 93.32%





In [7]:
# make predictions
y_pred = model.predict(X_test)

# make probabilty predictions
y_pred = model.predict_proba(X_test)

# score model on test data
model.score(X_test, y_test)

Predicting: 100%|██████████| 882/882 [00:56<00:00, 15.55it/s]
Predicting: 100%|██████████| 882/882 [00:55<00:00, 15.77it/s]
Testing: 100%|██████████| 882/882 [00:55<00:00, 16.00it/s]


Loss: 0.2332, Accuracy: 93.78%





93.77746279234586