In [20]:
import re
import torch
import nltk
import numpy as np
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, BertTokenizer, BertModel, AutoModel, AutoTokenizer
from preprocess import preprocess
from skmultilearn.adapt import MLkNN, MLARAM
from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from bert_base_cased import BertBaseCased
from bert_dataset import BertDataset
from bert_test import bert_test
from bert_train import bert_train
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.ensemble import RandomForestRegressor

STOPWORDS = set(stopwords.words('english'))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to /home/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/stepan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
device

'cuda'

In [22]:
df = pd.read_csv('./datasets/electronics_1/final_data.csv')
df = df.dropna()
df = df[df['Category'].str.startswith('Categories')]
df = df.rename(columns={ 'Description': 'description', 'Category': 'category' })
df

Unnamed: 0,description,category
0,"KeePass Password Safe is a free, open source, ...","Categories\nOffice/Business, Database, Securit..."
1,XAMPP is a very easy to install Apache Distrib...,"Categories\nDatabase Engines/Servers, Dynamic ..."
2,Technical analysis library with indicators lik...,"Categories\nInvestment Management, Spreadsheet..."
3,AppServ is an merging open source software ins...,"Categories\nDatabase Engines/Servers, Site Man..."
4,LibreOffice is a free and powerful office suit...,"Categories\nWord Processors, Office Suites, Fr..."
...,...,...
33765,Find recursively corrupted or password protect...,Categories\nFile Managers
33766,*** This repository has been migrated here: ht...,Categories\nEmulators
33767,Phoebetria lets you take control of your BitFe...,"Categories\nHardware, Hardware Drivers"
33768,- English (Thanks Steve for translation this t...,"Categories\nSoftware Distribution, Live CD"


In [16]:
def prepare_category(x):
    result = x.split('\n')
    result.pop(0)
    return ''.join(result).split(', ')

df['category'] = df['category'].apply(prepare_category)

In [17]:
df['category_count'] = df['category'].str.len()

In [18]:
df = df[(df['category_count'] > 1) & (df['category_count'] < 7)]
df = df.reset_index()
df = df.drop(['index', 'category_count'], axis=1)
df

Unnamed: 0,description,category
0,"KeePass Password Safe is a free, open source, ...","[Office/Business, Database, Security, Desktop ..."
1,XAMPP is a very easy to install Apache Distrib...,"[Database Engines/Servers, Dynamic Content, Si..."
2,Technical analysis library with indicators lik...,"[Investment Management, Spreadsheet, Front-End..."
3,AppServ is an merging open source software ins...,"[Database Engines/Servers, Site Management, HT..."
4,LibreOffice is a free and powerful office suit...,"[Word Processors, Office Suites, Front-Ends]"
...,...,...
23675,"The eznixOS is a respin of Debian GNU/Linux, c...","[OS distribution, Live CD]"
23676,Denise is a cycle accurate and platform indepe...,"[Games/Entertainment, Emulators]"
23677,Phoebetria lets you take control of your BitFe...,"[Hardware, Hardware Drivers]"
23678,- English (Thanks Steve for translation this t...,"[Software Distribution, Live CD]"


In [19]:
df['category'].str.len().value_counts()

category
3    10958
2     9156
4     2220
5      752
6      594
Name: count, dtype: int64

In [7]:
categories = sorted(list(set([x for batch in df['category'].tolist() for x in batch])))

for category in categories:
    df[category] = df['category'].apply(lambda x: 1 if category in x else 0)

  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['category'].apply(lambda x: 1 if category in x else 0)
  df[category] = df['cate

In [8]:
X_train, X_test = train_test_split(df, shuffle=True, test_size=0.2)

X_train.shape, X_test.shape

X_train = X_train.reset_index()
X_test = X_test.reset_index()

X_train.shape, X_test.shape

((18944, 502), (4736, 502))

In [9]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 2e-5
NUM_CLASSES = len(categories)

train_dataset = BertDataset(X_train, MAX_LEN, tokenizer, categories)
test_dataset = BertDataset(X_test, MAX_LEN, tokenizer, categories)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          num_workers=4, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         num_workers=4, shuffle=False, pin_memory=True)

In [10]:
bert_model = BertBaseCased(NUM_CLASSES)
# model.load_state_dict(torch.load('Luxury_apparel_BERT.bin', map_location=torch.device(device)))
bert_model = bert_model.to(device)

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.AdamW(params=bert_model.parameters(),
                  lr=LEARNING_RATE, weight_decay=1e-6)

In [12]:
for epoch in range(EPOCHS):
    bert_train(
        model=bert_model,
        train_loader=train_loader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        epoch=epoch,
        device=device,
    )

Batch: 0
Epoch: 0, Loss:  0.7233775854110718
Batch: 1
Batch: 2
Batch: 3
Batch: 4
Batch: 5


KeyboardInterrupt: 