## CATEGORIES CLASSIFICATION:

### DATA LISTS:

In [1]:
base_eating_categories = [8, 10000, 10007, 10027, 10036, 10039, 10046, 10050, 10060, 10061, 10062, 10096, 10126,
                          10252, 10679, 11136, 11839, 11847, 11861, 11868, 12700, 13806, 14138, 50018, 50104, 50106,
                          50111, 50129, 50138, 50158, 50159, 50198, 50244, 50254, 50273, 50277, 50285, 50289, 50293, 
                          50305, 50308, 51179, 51397, 54059] 

eating_categories = [6, 50070, 50313, 13, 50312]
#----------------------------------------------------------------------------------------------------------------------------------------------
base_noneating_categories = [10107, 10134, 10141, 10370, 10500, 10843, 50144, 50155]

noneating_categories = [50081, 2939, 50313, 2939, 50015]
#----------------------------------------------------------------------------------------------------------------------------------------------
base_intermediate_categories = [10, 14, 3033, 10022, 10158, 10259, 10421, 10486, 14518, 14808, 50102]

intermediate_categories = [54191, 50271]

### IMPORT MODULES:

In [2]:
import csv
import collections

import math as mh
import numpy as np
import pandas as pd
import string as st
import networkx as nx
import matplotlib.pyplot as plt

from time import time
from itertools import chain
from itertools import islice

### READ CSVS:

In [3]:
with open("../data/vkusvill_items.csv", 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    catalog_df = pd.DataFrame(reader)

catalog_df = catalog_df.rename(columns = {"": "shop_id", "item_composition_txt": "item_composition", "nutrion_value_txt" : "nutrion_value"})

In [4]:
categories_df = pd.read_csv("../data/vkusvill_categories.csv")
categories_df = categories_df.rename( columns = {"Unnamed: 0": "shop_id"})

### FIND BASE CATEGORIES:

In [5]:
def find_base_categories(categories_df):
    base_categories = []
    
    for i in range(len(categories_df.index)):
        if pd.isna(categories_df.iloc[i]["categorie_parent_id"]):
            base_categories.append(categories_df.iloc[i]["categorie_id"])

    base_categories = sorted(base_categories)
    return base_categories

base_categories = find_base_categories(categories_df)

In [6]:
base_categories[:10]

[10, 14, 3033, 10000, 10007, 10022, 10027, 10036, 10039, 10046]

### FIND USED CATEGORIES:

In [7]:
def clear_categorie_string(catergorie_string):
    answer = ""
    
    for i in catergorie_string:
        if i in ['[', ']', ' ', '"']:
            continue
        elif i == ',':
            answer += ' '
        else:
            answer += i

    return answer

In [8]:
def find_used_categories(catalog_df):
    used_categories = []
    
    for i in catalog_df["categories_array"]:
        used_categories.extend(clear_categorie_string(i).split())

    s_used_categories = set(used_categories)
    return [int(x) for x in s_used_categories]

used_categories = sorted(find_used_categories(catalog_df))

In [9]:
used_categories[:10]

[8, 12, 2929, 2930, 2931, 2937, 2940, 2941, 2942, 2943]

### ADD NAMES TO BASE CATEGORIES:

In [10]:
def find_categorie_name(categories_df, categorie_id):
    for i in range(len(categories_df.index)):
        if categories_df.iloc[i]["categorie_id"] == categorie_id:
            return categories_df.iloc[i]["categorie_name"]

In [11]:
def find_full_base_categories(categories_df, base_categories):
    full_base_categories = {}
    
    for i in base_categories:
        full_base_categories[i] = find_categorie_name(categories_df, i)

    return full_base_categories

full_base_categories = find_full_base_categories(categories_df, base_categories)

In [12]:
collections.OrderedDict(sorted(islice(full_base_categories.items(), 10)))

OrderedDict([(10, 'Новинки'),
             (14, 'Аптека'),
             (3033, 'Добрые товары'),
             (10000, 'Молочные продукты, яйцо'),
             (10007, 'Сладости и десерты'),
             (10022, 'Детское питание, гигиена и развитие'),
             (10027, 'Напитки'),
             (10036, 'Хлеб и выпечка'),
             (10039, 'Мясо, птица, шашлык'),
             (10046, 'Колбасы и мясные деликатесы')])

### FIND CATEGORIES PAIRS:

In [13]:
def find_categorie_pairs(categories_df):
    categorie_pairs = {}
    
    for i in range(len(categories_df.index)):
        if categories_df.iloc[i][1] not in categorie_pairs and categories_df.iloc[i][1] in used_categories:
            categorie_pairs[categories_df.iloc[i][1]] = categories_df.iloc[i][2]

    return categorie_pairs

categorie_pairs = find_categorie_pairs(categories_df)

In [14]:
collections.OrderedDict(sorted(islice(categorie_pairs.items(), 10)))

OrderedDict([(8, 6.0),
             (12, 6.0),
             (2943, 6.0),
             (2949, 6.0),
             (2957, 6.0),
             (2960, 6.0),
             (3029, 6.0),
             (50197, 6.0),
             (50203, 6.0),
             (54191, 14.0)])

In [15]:
def find_final_categorie_pair(categories_df, categorie):
    for i in range(len(categories_df.index)):
        if categorie == categories_df.iloc[i][1]:
            answer = categorie_pairs[categorie]
            
            while answer in categorie_pairs:
                answer = categorie_pairs[answer]
                
            return answer

In [16]:
def find_final_categorie_pairs(categories_df):
    final_categorie_pairs = {}
    
    for i in range(len(categories_df.index)):
        if categories_df.iloc[i][1] not in final_categorie_pairs and categories_df.iloc[i][1] in used_categories:
            final_categorie_pairs[categories_df.iloc[i][1]] = find_final_categorie_pair(categories_df, categories_df.iloc[i][1])

    return final_categorie_pairs

final_categorie_pairs = find_final_categorie_pairs(categories_df)

In [17]:
collections.OrderedDict(sorted(islice(final_categorie_pairs.items(), 10)))

OrderedDict([(8, 6.0),
             (12, 6.0),
             (2943, 6.0),
             (2949, 6.0),
             (2957, 6.0),
             (2960, 6.0),
             (3029, 6.0),
             (50197, 6.0),
             (50203, 6.0),
             (54191, 14.0)])

### ADD NAMES TO CATEGORIES PAIRS:

In [18]:
def find_full_final_categorie_pairs(categories_df, final_categorie_pairs):
    full_final_categorie_pairs = {}
    
    for key in final_categorie_pairs:
        categories_with_names = []
        categories_with_names.append(str(final_categorie_pairs[key]))
        categories_with_names.append(find_categorie_name(categories_df, key))
        categories_with_names.append(find_categorie_name(categories_df, final_categorie_pairs[key]))
        full_final_categorie_pairs[key] = categories_with_names

    return full_final_categorie_pairs
    
full_final_categorie_pairs = find_full_final_categorie_pairs(categories_df, final_categorie_pairs)

In [19]:
collections.OrderedDict(sorted(islice(full_final_categorie_pairs.items(), 10)))

OrderedDict([(8, ['6.0', 'Витаминно-минеральные комплексы', 'Витамины, БАДы']),
             (12, ['6.0', 'Витамин D', 'Витамины, БАДы']),
             (2943, ['6.0', 'Витамин C', 'Витамины, БАДы']),
             (2949, ['6.0', 'Витамин E', 'Витамины, БАДы']),
             (2957, ['6.0', 'Витамин B', 'Витамины, БАДы']),
             (2960, ['6.0', 'Гематоген, батончики', 'Витамины, БАДы']),
             (3029, ['6.0', 'Витамин A', 'Витамины, БАДы']),
             (50197, ['6.0', 'Дерматология', 'Витамины, БАДы']),
             (50203, ['6.0', 'Гепатопротекторы', 'Витамины, БАДы']),
             (54191, ['14.0', 'Аптечные товары со скидкой', 'Аптека'])])

### CLASSIFICATION ITEMS:

In [20]:
def check_categorie(categories, categorie_id):
    if categorie_id in full_final_categorie_pairs:
        return True

    return False

In [21]:
def check_categories(categories_list, categories):
    for categorie in categories:
        if check_categorie(categories_list, categorie):
            return True

    return False

In [22]:
def find_item_categories(item_categories):
    categories = []
    categories.extend(clear_categorie_string(item_categories).split())
    return [int(x) for x in categories]

used_categories = sorted(find_item_categories(catalog_df.iloc[1]["categories_array"]))

In [23]:
eating_count = 0
noneating_count = 0
intermediate_count = 0
undefined_count = 0
categories_count = 0
nutrion_count = 0

def is_eating(catalog_df, categories_df, item_id, noneating_categories, intermediate_categories, eating_categories):

    global eating_count
    global noneating_count
    global intermediate_count
    global undefined_count
    global categories_count
    global nutrion_count
    
    index = catalog_df[catalog_df["item_id"] == str(item_id)]["nutrion_value"].index[0]
    item_categories = find_item_categories(catalog_df.iloc[index]["categories_array"])

    if not catalog_df[catalog_df["item_id"] == str(item_id)]["nutrion_value"][index] == "":
        eating_count += 1
        nutrion_count += 1
        return "Eating"
    elif check_categories(noneating_categories, item_categories):
        categories_count += 1
        noneating_count += 1
        return "Noneating"
    elif check_categories(intermediate_categories, item_categories):
        categories_count += 1
        intermediate_count += 1
        return "Unknow"
    elif check_categories(eating_categories, item_categories):
        categories_count += 1
        eating_count += 1
        return "Eating"
    else:
        undefined_count += 1
        return "Undefined"

In [24]:
target = pd.Series()

def binary_classification_products(catalog_df, categories_df, target):
    for i in range(len(catalog_df.index)):
        target = pd.concat([target, pd.Series(is_eating(catalog_df, categories_df, catalog_df.iloc[i]["item_id"], 
                                                        noneating_categories, intermediate_categories, eating_categories))], 
                           ignore_index=True)

    catalog_df = pd.concat([catalog_df.item_name, target], axis=1, ignore_index=True)
    return catalog_df

classification = binary_classification_products(catalog_df, categories_df, target)
classification_with_data = pd.concat([catalog_df, classification], axis = 1)
classification_with_data = classification_with_data.rename(columns = {1 : "eating"})
classification_with_data = classification_with_data[["item_id", "item_name", "item_composition", "eating"]]

In [25]:
def number_to_percents(all, part):
    return round(part * 100 / all, 3)

In [26]:
def calculate_result(classification_with_data, eating_count, noneating_count, intermediate_count, 
                    undefined_count, categories_count, nutrion_count):
                        
    all = classification_with_data.shape[0]
    print(f"All items: {all}")
    print(f"Eating products: {number_to_percents(all, eating_count)} %")
    print(f"Noneating products: {number_to_percents(all, noneating_count)} %")
    print(f"Intermediate products: {number_to_percents(all, intermediate_count)} %")
    print(f"Undefined products: {number_to_percents(all, undefined_count)} %") 
    print(f"Classified products by nutrion: {number_to_percents(all, nutrion_count)} %")
    print(f"Classified products by categories: {number_to_percents(all, categories_count)} %")
    
calculate_result(classification_with_data, eating_count, noneating_count, intermediate_count, 
                    undefined_count, categories_count, nutrion_count)

All items: 11418
Eating products: 57.234 %
Noneating products: 42.766 %
Intermediate products: 0.0 %
Undefined products: 0.0 %
Classified products by nutrion: 57.234 %
Classified products by categories: 42.766 %


In [27]:
step = 55
count = 0

classification_with_data[step * count:step * (count + 1)]

Unnamed: 0,item_id,item_name,item_composition,eating
0,38960,Конфета жевательная с ароматом клубники,"сахар, патока крахмальная карамельная фермента...",Eating
1,38962,Смесь овощная \Три капусты\(Продукт замороженный),"цветная капуста, капуста брокколи, капуста ром...",Eating
2,38973,"Чипсы из морской капусты с анчоусами, 20 г","анчоусы, морская капуста сушёная (ламинария по...",Eating
3,38977,Салат свекольный с грецким орехом,"свекла столовая отварная, майонез м. д. ж. 50 ...",Eating
4,38983,"Кета малосоленая филе-ломтики, 150 г","филе кеты, соль пищевая.\nПродукт может содерж...",Eating
5,38984,"Тунец холодного копчения карпаччо с кунжутом, ...","Тунец желтоперый, семена кунжута, масло оливко...",Eating
6,38995,Филе окорочка цыпленка-бройлера с овощами,"филе окорочка цыплят-бройлеров охл., овощи бла...",Eating
7,38999,Нут жареный,"нут жареный (нут свежий), масло подсолнечное р...",Eating
8,39006,Хлебцы ржаные с паприкой,"Мука ржаная сеяная, вода питьевая, мука ржаная...",Eating
9,39015,Пирожное \Ежик\,"сметана м. д. ж. 25 % (сливки нормализованные,...",Eating
