## CLUSTERING:

### IMPORT MODULES:

In [1]:
import os
import re
import csv
import nltk
import mpld3
import codecs
import collections

import math as mh
import numpy as np
import pandas as pd
import string as st
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt

from time import time
from itertools import chain
from itertools import islice
from sklearn.cluster import KMeans
from sklearn import feature_extraction
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/deniz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/deniz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### READ CSVS:

In [2]:
with open("../data/vkusvill_items.csv", 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    catalog_df = pd.DataFrame(reader)

catalog_df = catalog_df.rename(columns = {"": "shop_id", "item_composition_txt": "item_composition", "nutrion_value_txt" : "nutrion_value"})
catalog = catalog_df.drop(["shop_id", "price", "vat", "measure_unit", "measure_value", "protein_value", "fat_value", "carb_value", 
                           "energy_value", "measure_quantum"], axis=1)

In [3]:
categories_df = pd.read_csv("../data/vkusvill_categories.csv")
categories_df = categories_df.rename( columns = {"Unnamed: 0": "shop_id"})

### Prepare data:

In [4]:
def check_categorie(df, categorie):
    result = pd.DataFrame();
    
    for line in range(len(df.index)):
        if str(categorie) in df["categories_array"][line]:
            df_string = pd.DataFrame({"item_id": [df["item_id"][line]], "categories_array": [df["categories_array"][line]],
                                     "item_name": [df["item_name"][line]], "item_composition": [df["item_composition"][line]],
                                     "nutrion_value": [df["nutrion_value"][line]]})
            
            result = pd.concat([result, df_string], ignore_index=True)

    return result
    
result = check_categorie(catalog, 13152)
result = result.drop(["categories_array", "item_name", "nutrion_value"], axis=1)

In [5]:
result

Unnamed: 0,item_id,item_composition
0,43499,"мясо кролика (50%), мясо индейки (50%)"
1,61936,"мясо кур, мясо индеек, мясо кролика."
2,62798,говядина
3,62804,Телятина\nПродукция производится на предприяти...
4,62806,"Свинина, говядина"
5,62809,Говядина\nПродукция производится на предприяти...
6,69576,"Свинина (50%), говядина (50%)\nПроизводится на..."
7,71029,"Говядина, филе грудки куриной\nПродукция произ..."
8,20336,мясо кролика
9,21194,ЯРОСЛАВСКИЙ БРОЙЛЕР АО: мясо цыплят-бройлеров ...


### PAIRS ITEM NAMES:

In [6]:
def clear_item_name(item_name):
    answer = ""
    
    for symbol in item_name:
        if symbol == '_':
            answer += ' '
            continue  
        elif symbol in [
            '\\', 
            '(', 
            ')', 
            '.', 
            ',', 
            '%'
        ] or symbol.isdigit():
            answer += ' '
            continue
        else:
            answer += symbol

    answer = answer.replace(" шт", ' ')
    answer = answer.replace(" мл", ' ')
    answer = answer.replace(" мг", ' ')
    answer = answer.replace(" мг", ' ')
    answer = answer.replace(" премиум", ' ')
    answer = answer.replace(" вес", ' ')
    answer = answer.replace(" из ", ' ')
    answer = answer.replace(" спб ", ' ')
    answer = answer.replace(" по ", ' ')
    answer = answer.replace(" вкусвилл ", ' ')
    answer = answer.replace(" стандарту ", ' ')
    answer = answer.replace(" вв", ' ')
    answer = answer.replace(" и ", ' ')
    answer = answer.replace(" постный ", ' ')
    answer = answer.replace(" мяса ", ' ')
    answer = answer.replace(" охлажденный", ' ')
    answer = answer.replace(" филе", ' ')
    
    while "  " in answer:
        answer = answer.replace("  ", ' ')
    
    return answer.lower()

In [7]:
prepare_catalog_df = pd.Series()

for i in range(len(result.index)):
    for i in range(10):
        tmp = clear_item_name(result["item_composition"][i])
        
    prepare_catalog_df = pd.concat([prepare_catalog_df, pd.Series(tmp[:-1])], ignore_index=True)

prepare_catalog_df

final_result = pd.concat([result, prepare_catalog_df], ignore_index=True, axis=1)
final_result = final_result.drop([1], axis=1)
final_result = final_result.rename( columns = {0: "item_id", 2: "clear_composition"})

In [8]:
final_result

Unnamed: 0,item_id,clear_composition
0,43499,ярославский бройлер ао: мясо цыплят-бройлеров ...
1,61936,ярославский бройлер ао: мясо цыплят-бройлеров ...
2,62798,ярославский бройлер ао: мясо цыплят-бройлеров ...
3,62804,ярославский бройлер ао: мясо цыплят-бройлеров ...
4,62806,ярославский бройлер ао: мясо цыплят-бройлеров ...
5,62809,ярославский бройлер ао: мясо цыплят-бройлеров ...
6,69576,ярославский бройлер ао: мясо цыплят-бройлеров ...
7,71029,ярославский бройлер ао: мясо цыплят-бройлеров ...
8,20336,ярославский бройлер ао: мясо цыплят-бройлеров ...
9,21194,ярославский бройлер ао: мясо цыплят-бройлеров ...


### CLUSTERING:

In [9]:
print(str(len(final_result)) + ' запросов считано')

stemmer = SnowballStemmer("russian")

def token_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    
    for token in tokens:
        if re.search('[а-яА-Я]', token):
            filtered_tokens.append(token)
    
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def token_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    
    for token in tokens:
        if re.search('[а-яА-Я]', token):
            filtered_tokens.append(token)
    
    return filtered_tokens

#Создаем словари (массивы) из полученных основ
totalvocab_stem = []
totalvocab_token = []

for i in range(len(result.index)):
    allwords_stemmed = token_and_stem(final_result["clear_composition"][i])
    # print(allwords_stemmed)
    totalvocab_stem.extend(allwords_stemmed)    
    allwords_tokenized = token_only(final_result["clear_composition"][i])
    totalvocab_token.extend(allwords_tokenized)

24 запросов считано


In [10]:
stopwords = nltk.corpus.stopwords.words('russian')
#можно расширить список стоп-слов
stopwords.extend(['как', 'в', 'к', 'на', "из", "по", 'г', "вв", "вкуссвилл", "вес", "спб"])

n_featur=2000000

tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=10000,
                                 min_df=0.15, stop_words=stopwords,
                                 use_idf=True, tokenizer=token_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(final_result["clear_composition"])
get_ipython().magic('time tfidf_matrix = tfidf_vectorizer.fit_transform(final_result["clear_composition"])')
print(tfidf_matrix.shape)



ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [None]:
num_clusters = 20

# Метод к-средних - KMeans

km = KMeans(n_clusters=num_clusters)
get_ipython().magic('time km.fit(tfidf_matrix)')
idx = km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# # MiniBatchKMeans
# from sklearn.cluster import MiniBatchKMeans

# mbk  = MiniBatchKMeans(init='random', n_clusters=num_clusters) #(init='k-means++', ‘random’ or an ndarray)
# mbk.fit_transform(tfidf_matrix)
# %time mbk.fit(tfidf_matrix)
# miniclusters = mbk.labels_.tolist()
# print (mbk.labels_)

# # DBSCAN
# from sklearn.cluster import DBSCAN
# get_ipython().magic('time db = DBSCAN(eps=0.3, min_samples=10).fit(tfidf_matrix)')
# labels = db.labels_
# labels.shape
# print(labels)

# # Аггломеративная класстеризация
# from sklearn.cluster import AgglomerativeClustering

# agglo1 = AgglomerativeClustering(n_clusters=num_clusters, affinity='euclidean') #affinity можно выбрать любое или попробовать все по очереди: cosine, l1, l2, manhattan
# get_ipython().magic('time answer = agglo1.fit_predict(tfidf_matrix.toarray())')
# answer.shape

In [None]:
print(clusters)
print (km.labels_)

In [None]:
#k-means
clusterkm = km.labels_.tolist()
# #minikmeans
# clustermbk = mbk.labels_.tolist()
# #dbscan
# clusters3 = labels
#  #agglo
# #clusters4 = answer.tolist()

frame = pd.DataFrame(final_result["clear_name"], index = [clusterkm])

#k-means
out = { 'title': range(len(result.index)), 'cluster': clusterkm }
frame1 = pd.DataFrame(out, index = [clusterkm], columns = ['title', 'cluster'])
frame1 = frame1.reset_index(drop=True)

# #mini
# out = { 'title': final_result["clear_name"], 'cluster': clustermbk }
# frame_minik = pd.DataFrame(out, index = [clustermbk], columns = ['title', 'cluster'])

# frame1['cluster'].value_counts()

dict = {}

for i in range(len(frame1.index)):
    dict[i] = frame1['cluster'][i]   

dict = pd.Series(dict)
dict

final = pd.concat([final_result, dict], ignore_index= True, axis=1)

final = final.rename(columns = {0: "item_id", 1: "categories", 2: "composition", 3: "nutriotion", 4: "clear_name", 5: "cluster"})
# result
# frame_minik['cluster'].value_counts()
final = final.sort_values(by='cluster')

In [None]:
final