# Классификация товаров
Основная идея данной части проекта — создание модели, классифицирующей текстовые описания товаров по 4 категориям (Electronics, Household, Books, Clothing & Accessories).


## Начало
Подключим необходимые библиотеки, объявим необходимые функции и константы.

In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.dataset import Dataset

import string, nltk
from string import punctuation
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Tokenizer that extracts runs of word characters. The pattern is a raw string
# so that "\w" reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r"[\w]+")
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Unigram TF-IDF features, limited to at most 12000 features.
vectorizer = TfidfVectorizer(max_features=12000, ngram_range=(1, 1))
# Mini-batch size shared by the train/validation/test DataLoaders.
BATCH_SIZE = 64

## Представление текста
Модели работают только с числовым представлением текстов, поэтому тексты необходимо преобразовывать в векторы. Чтобы векторы получались как можно более информативными, текст нужно сначала очистить от «мусора» и привести к унифицированному виду.
Ниже представлены функции для обработки текста:

In [3]:
def text_lower(text):
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered

def text_remove_whitespaces(text):
    """Return *text* without leading and trailing whitespace.

    Whitespace inside the text is left as is.
    """
    trimmed = text.strip()
    return trimmed

def text_stem(text):
    """Tokenize *text*, stem every token and re-join them with single spaces."""
    tokens = tokenizer.tokenize(text)
    stems = (stemmer.stem(token) for token in tokens)
    return " ".join(stems)

def text_lemm(text):
    """Tokenize *text*, lemmatize every token and re-join them with single spaces."""
    tokens = tokenizer.tokenize(text)
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return " ".join(lemmas)

# Lazily filled cache of the English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loaded on first use so that a missing NLTK corpus still surfaces at
        # call time, exactly where the uncached version raised.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_remove_stopwords(text):
    """Return *text* with English stop words removed.

    The text is tokenized, tokens found in NLTK's English stop-word list are
    dropped, and the remaining tokens are joined with single spaces. The
    comparison is case-sensitive, so the text should be lower-cased first.

    Args:
        text: Input string.

    Returns:
        The filtered text as a single space-separated string.
    """
    stoplist = _english_stopwords()
    # Set membership is O(1) per token; the original list lookup was linear
    # and the list was rebuilt on every call.
    kept = [word for word in tokenizer.tokenize(text) if word not in stoplist]
    return " ".join(kept)

def text_remove_punct(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # A translation table mapping a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)


def text_remove_nonalph(text):
    """Keep only the purely alphabetic tokens of *text*, joined by single spaces."""
    alphabetic_tokens = []
    for token in tokenizer.tokenize(text):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return " ".join(alphabetic_tokens)

def text_required_pos(text):
    """Keep only the tokens of *text* whose POS tag is in the allowed list.

    The allowed tags are the noun tags (NN, NNS, NNP, NNPS), the foreign-word
    tag (FW) and the verb tags (VB, VBD, VBG, VBN, VBP, VBZ) as produced by
    ``nltk.pos_tag``. Surviving tokens are joined with single spaces.
    """
    allowed_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'FW',
                    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    tagged_tokens = nltk.pos_tag(tokenizer.tokenize(text))
    kept_tokens = []
    for token, tag in tagged_tokens:
        if tag in allowed_tags:
            kept_tokens.append(token)
    return " ".join(kept_tokens)


Объединим все эти функции в одну — её-то мы и будем использовать для обработки каждого описания товара.

In [4]:
def text_preprocess(text):
    """Run the full cleaning pipeline on one product description.

    The steps are applied in this order: lower-casing, trimming of outer
    whitespace, punctuation removal, stop-word removal, lemmatization,
    removal of non-alphabetic tokens and POS filtering.

    Args:
        text: Raw description string.

    Returns:
        The cleaned description as a space-separated string.
    """
    # Order matters: each step consumes the output of the previous one.
    # Stemming (text_stem) is intentionally left out; lemmatization is used.
    pipeline = (
        text_lower,
        text_remove_whitespaces,
        text_remove_punct,
        text_remove_stopwords,
        text_lemm,
        text_remove_nonalph,
        text_required_pos,
    )
    for step in pipeline:
        text = step(text)
    return text

## Подготовка данных
В задаче данные подают в виде csv файла, нам нужно из него получить датасет для обучения и тестирования.
Ниже представлена функция, которая открывает csv-файл и конвертирует его в датафрейм pandas; далее мы будем представлять его в виде датасета.

In [5]:
def prepare_data(csv_path):
    """Load the product CSV and return a cleaned, label-encoded DataFrame.

    Rows with missing values and exact duplicate rows are dropped, the index
    is reset, the two columns are renamed to ``category`` and
    ``description``, and category names are replaced by integer class ids.

    Args:
        csv_path: Path of a two-column CSV file (category, description).

    Returns:
        pandas.DataFrame with an int64 ``category`` column and a
        ``description`` column.

    Raises:
        ValueError: If the file contains a category that has no class id
            (pandas also raises ValueError when the file does not have
            exactly two columns).
    """
    category_dict = {'Household': 0, 'Books': 1, 'Clothing & Accessories': 2, 'Electronics' : 3}

    # NOTE(review): read_csv treats the first line of the file as a header
    # row; confirm the CSV really has one, otherwise the first sample is lost.
    data = pd.read_csv(csv_path)
    data = data.dropna().drop_duplicates().reset_index(drop=True)
    data.columns = ['category', 'description']

    # Series.map gives a proper numeric column and marks labels missing from
    # the mapping as NaN. DataFrame.replace left such labels in place as
    # strings and relied on implicit dtype downcasting.
    codes = data['category'].map(category_dict)
    unknown = data.loc[codes.isna(), 'category'].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Unknown categories in {csv_path!r}: {sorted(map(str, unknown))}"
        )
    data['category'] = codes.astype('int64')
    return data


In [6]:
# Load and clean the raw dataset; the relative path is resolved against the
# current working directory.
data = prepare_data(csv_path='ecommerceDataset.csv')

Обработаем описания товаров, сохраним отдельно входы (описание товара) и выходы (категория товара).

In [7]:
# Inputs: cleaned description strings. Targets: the encoded category column.
x = [text_preprocess(description) for description in data['description']]
y = data['category'].tolist()

Разобьем наши данные на тренировочные и тестовые.

In [8]:
# 80/20 train/test split over a random permutation of the row positions.
# NOTE: the shuffle is not seeded, so the split differs from run to run.
data_size = len(data)
indexs = list(range(data_size))
np.random.shuffle(indexs)
train_size = int(data_size * 0.8)
# Slicing the shuffled positions replaces the manual index loop.
train_indices = indexs[:train_size]
test_indices = indexs[train_size:]

In [9]:
# Pick the inputs and labels selected by the shuffled train/test positions.
x_train, y_train = [x[i] for i in train_indices], [y[i] for i in train_indices]
x_test, y_test = [x[i] for i in test_indices], [y[i] for i in test_indices]

Наконец, конвертируем обработанные текстовые данные в векторы.

In [10]:
# Fit the TF-IDF vocabulary on the training texts only and reuse it for the
# test texts, so the test split does not influence the features.
# The sparse matrices are cast to float32 before densifying: the model casts
# its input to float32 anyway, and this halves the size of the dense arrays.
vect_x_train = vectorizer.fit_transform(x_train).astype(np.float32).toarray()
vect_x_test = vectorizer.transform(x_test).astype(np.float32).toarray()

In [11]:
# Number of TF-IDF features; used as the input size of the network's first layer.
feature_names = vectorizer.get_feature_names_out()
inputlen = len(feature_names)

In [12]:
class EcommerceDataset(Dataset):
    """Dataset pairing vectorized descriptions with their category labels.

    Item ``i`` is the tuple ``(x_vectorized[i], y[i])``; the length is the
    number of vectorized descriptions.
    """

    def __init__(self, x_vectorized, y):
        # The two sequences are stored as given and indexed in lockstep.
        self.y = y
        self.x_vectorized = x_vectorized

    def __len__(self):
        return len(self.x_vectorized)

    def __getitem__(self, index):
        features = self.x_vectorized[index]
        label = self.y[index]
        return features, label

In [16]:
# Wrap the vectorized splits, then hold out 15% of the training portion for
# validation. Note that train_size is rebound here to the size of the final
# training subset.
test_dataset = EcommerceDataset(vect_x_test, y_test)
train_dataset = EcommerceDataset(vect_x_train, y_train)
trainval_size = len(train_dataset)
val_size = int(trainval_size * 0.15)
train_size = trainval_size - val_size
split_lengths = [train_size, val_size]
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, split_lengths)

In [17]:
# Only the training loader shuffles; validation and test are read in order.
loader_kwargs = {"batch_size": BATCH_SIZE}
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = torch.utils.data.DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_kwargs)

## Модель
Для модели классификации текста за основу выберем полносвязную (dense) нейронную сеть.

In [18]:
class MyDenseNet(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Args:
        input_dim: Size of one input feature vector. When omitted, the
            module-level ``inputlen`` is used (looked up at construction
            time), which keeps ``MyDenseNet()`` working as before.
        hidden_dims: Sizes of the two hidden layers.
        n_classes: Number of output classes.
    """

    def __init__(self, input_dim=None, hidden_dims=(1024, 256), n_classes=4):
        super().__init__()
        if input_dim is None:
            input_dim = inputlen
        first_hidden, second_hidden = hidden_dims
        # Attribute names are unchanged so existing state_dicts still load.
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.prediction = nn.Linear(second_hidden, n_classes)

    def forward(self, x):
        """Return log-probabilities over the classes for a 2-D input batch."""
        # Inputs may arrive as float64 (e.g. from NumPy); the layers are float32.
        x = x.to(torch.float)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.log_softmax(self.prediction(x), dim=1)

In [19]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [20]:
# Turn the device name into a torch.device object for the .to(device) calls.
device = torch.device(device)

In [21]:
# Instantiate the classifier and move its parameters to the selected device.
model = MyDenseNet().to(device)

### Обучение модели и тестирование

In [22]:
# Adam over all model parameters with a learning rate of 1e-4.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
# NOTE(review): CrossEntropyLoss applies log-softmax itself, while the model's
# forward already ends in log_softmax. Applying it twice leaves the values
# unchanged, so the loss is correct, but NLLLoss would be the matching pair.
criterion = nn.CrossEntropyLoss()

Напишем функции тестирования и обучения модели:

In [23]:
def evaluate(model, device, dataloader, loss_fn):
    """Compute the accuracy and mean batch loss of *model* on *dataloader*.

    The model is switched to evaluation mode for the duration of the call and
    its previous train/eval mode is restored afterwards, so the function can
    be called from inside a training loop. No gradients are recorded.

    Args:
        model: ``torch.nn.Module`` returning one score per class.
        device: Device the batches are moved to before the forward pass.
        dataloader: Iterable of ``(x_batch, y_batch)`` tensor pairs.
        loss_fn: Callable ``loss_fn(outputs, y_batch)`` returning a scalar tensor.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a percentage rounded
        to two decimals and ``mean_loss`` is the unweighted mean of the
        per-batch losses. Both are NaN when the dataloader yields no samples.
    """
    losses = []
    num_correct = 0
    num_elements = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x_batch, y_batch in dataloader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                outputs = model(x_batch)
                losses.append(loss_fn(outputs, y_batch).item())
                y_pred = torch.argmax(outputs, dim=1)
                num_correct += torch.sum(y_pred == y_batch).item()
                num_elements += y_batch.size(0)
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    if num_elements == 0:
        # Nothing was evaluated: report "undefined" instead of dividing by zero.
        return float("nan"), float("nan")

    accuracy = round(100 * num_correct / num_elements, 2)
    return accuracy, np.mean(losses)

In [24]:
def train(model, train_loader, device, loss_fn, optimizer, n_epoch=6, eval_loader=None):
    """Train *model* and report validation metrics after every epoch.

    Args:
        model: ``torch.nn.Module`` to optimise (updated in place).
        train_loader: Iterable of ``(x_batch, y_batch)`` training batches.
        device: Device the batches are moved to.
        loss_fn: Callable ``loss_fn(outputs, y_batch)`` returning a scalar tensor.
        optimizer: Optimizer built over ``model``'s parameters.
        n_epoch: Number of passes over ``train_loader``.
        eval_loader: Loader evaluated after each epoch. When omitted, the
            module-level ``val_loader`` is used, as before.

    Returns:
        The same ``model`` instance after training.
    """
    for epoch in range(1, n_epoch + 1):
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            # Clear gradients before the backward pass so that leftovers from
            # earlier work cannot leak into this step.
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = loss_fn(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # The metrics come from the validation loader, so they are labelled
        # as validation metrics (they were previously printed as "train_*").
        loader = val_loader if eval_loader is None else eval_loader
        val_accuracy, val_loss = evaluate(model, device, loader, loss_fn)
        print(f"Epoch {epoch}/{n_epoch} finished: val_accuracy = {val_accuracy}%, val_loss = {val_loss}")

    return model

In [25]:
# Run the 6-epoch training loop; train() returns the model it was given.
model = train(model, train_loader, device, criterion, optimizer, n_epoch=6)

Epoch 1/6 finished: train_accuracy = 93.74%, train_loss = 0.2726059578499704
Epoch 2/6 finished: train_accuracy = 94.63%, train_loss = 0.19439372799868854
Epoch 3/6 finished: train_accuracy = 95.14%, train_loss = 0.18123840393041665
Epoch 4/6 finished: train_accuracy = 95.38%, train_loss = 0.17818583850309533
Epoch 5/6 finished: train_accuracy = 95.41%, train_loss = 0.18597044366991744
Epoch 6/6 finished: train_accuracy = 95.17%, train_loss = 0.1945536150333454


Протестируем:

In [26]:
# Final evaluation on the test split.
test_metrics = evaluate(model, device, test_loader, criterion)
test_accuracy, test_loss = test_metrics
print(f'Testing finished: Accuracy =  {test_accuracy}%, Loss = {test_loss}')

Testing finished: Accuracy =  95.29%, Loss = 0.20141273951050878


## Заключение
Мы получили оригинальную модель, которая работает хорошо, если к нам добавляют только продукты из описанных выше категорий :)

In [27]:
# Category name -> class id; the same encoding that prepare_data applies.
mydict = {'Household': 0, 'Books': 1, 'Clothing & Accessories': 2, 'Electronics' : 3}


def predict_single(text):
    """Predict the category name for one raw product description.

    The text goes through the same preprocessing and TF-IDF vectorizer as the
    training data and is then classified by the module-level ``model``.

    Args:
        text: Raw product description.

    Returns:
        The predicted category name (one of the keys of ``mydict``).
    """
    cleaned = text_preprocess(text)
    features = vectorizer.transform([cleaned]).toarray()
    # Build a dense float32 batch on the model's own device; the previous
    # CPU-only sparse tensor failed as soon as the model lived on the GPU.
    model_device = next(model.parameters()).device
    batch = torch.as_tensor(features, dtype=torch.float32, device=model_device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        class_id = int(model(batch).argmax(dim=1).item())
    # Invert the mapping instead of searching a list with a tensor operand.
    id_to_category = {idx: name for name, idx in mydict.items()}
    return id_to_category[class_id]

In [28]:
# Smoke test: classify the first description of the dataset and print the
# predicted category together with the first 39 characters of that description.
print(f'Prediction data\'s category based on desription is: {predict_single(data["description"][0])} ({data["description"][0][:39]}...) ')

Prediction data's category based on desription is: Household (Paper Plane Design Framed Wall Hanging ...) 
