In [1]:
import numpy as np
import pandas as pd
import warnings
from nerus import load_nerus
import nerus
from nltk import word_tokenize
import nltk
from ipymarkup import show_span_ascii_markup
from scipy import stats
import string
from os.path import dirname, basename, isfile, join
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path  
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import glob
from sklearn.model_selection import train_test_split
import re

from sklearn.pipeline import Pipeline
# pipeline позволяет объединить в один блок трансформер и модель, что упрощает написание кода и улучшает его читаемость
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer преобразует тексты в числовые вектора, отражающие важность использования каждого слова из некоторого набора слов (количество слов набора определяет размерность вектора) в каждом тексте
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
# линейный классификатор и классификатор методом ближайших соседей
from sklearn import metrics
# набор метрик для оценки качества модели
from sklearn.model_selection import GridSearchCV
# модуль поиска по сетке параметров

stemmer = SnowballStemmer("russian") 
spec_chars = string.punctuation + '\n\xa0«»\t—…' 
russian_stopwords = stopwords.words("russian")
df_seller_price1 = pd.read_excel("df_seller_price1.xlsx")

def intersection_list(list1, list2): 
    return list(set(list1) & set(list2))

def remove_chars_from_text(text, chars = spec_chars):
    return "".join([ch for ch in text if ch not in chars]) 
warnings.simplefilter(action='ignore', category=FutureWarning)

def find_pop_country(df, n = 5):
    most_pop = df['country_code'].value_counts()
    n = min(len(most_pop), n)
    most_pop = pd.DataFrame(most_pop)
    most_pop['imp'] = most_pop['country_code'] / most_pop['country_code'].sum()
    most_pop = most_pop.reset_index()
    most_pop.rename(columns = { 'country_code' : 'quantity'}, inplace = True)
    most_pop.rename(columns = { 'index' : 'country_code'}, inplace = True)
    if len([i for i in df['country_code'] if i == -1]):
        most_pop['imp'][most_pop['country_code'] == -1] = 0
    return most_pop[0:n]

def get_normal_form(text):
    text = text.lower()
    text = remove_chars_from_text(text, spec_chars)
    text = text.split()
    text = [word for word in text if word not in russian_stopwords]
    text = ' '.join(text)
    text = re.sub(r"\d",'', text) # Подумать, нужно ли удаление чисел
    return text

def str_corpus(corpus):
    str_corpus = ''
    for i in corpus:
        str_corpus += ' ' + i
    str_corpus = str_corpus.strip()
    return str_corpus
# Получение списка всех слов в корпусе
def get_corpus(data):
    corpus = []
    for phrase in data:
        for word in phrase.split():
            corpus.append(word)
    return corpus
column_to_ml = 'product_name'
# Приведение столбца в нормальную форму
def clean_data(df_whole,column_to_ml = column_to_ml):
    df_whole[column_to_ml] = df_whole[column_to_ml].map(lambda x: x.lower())
    df_whole[column_to_ml] = df_whole[column_to_ml].map(lambda x: remove_chars_from_text(x, spec_chars))
    df_whole[column_to_ml] = df_whole[column_to_ml].map(lambda x: x.split())
    df_whole[column_to_ml] = df_whole[column_to_ml].map(lambda words: [word for word in words if word not in russian_stopwords])
    df_whole[column_to_ml] = df_whole[column_to_ml].map(lambda x: ' '.join(x))
    df_whole[column_to_ml] = df_whole[column_to_ml].map(lambda x: re.sub(r"\d",'', x))
    return df_whole
#============================================================================================================================

In [2]:
# Считывание дата сетов
df_conract = pd.read_csv("data/Контракты 44ФЗ.csv", sep = ";")
df_dict = pd.read_csv("data/Справочник пром производства.csv", sep = ";")
df_seller_price = pd.read_csv("data/Ценовые предложения поставщиков.csv", sep = ";").astype({'country_code':'object'})
df_whole = df_conract.append(df_dict).append(df_seller_price)

In [3]:
def learn_model(df_whole, column_to_ml):
    df_whole['main_category'] = df_whole['okpd2_code'].apply(lambda x:x.split('.')[0])
    df_whole = df_whole.dropna(subset = [column_to_ml])
    df_whole = df_whole[[column_to_ml, 'main_category']]
    df_whole = clean_data(df_whole)
    # Создание тренировочных данных
    X_train, X_valid, y_train, y_valid = train_test_split(df_whole[column_to_ml], df_whole ['main_category'], test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    sgd_ppl_clf = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('sgd_clf', SGDClassifier(random_state=42))])
    sgd_ppl_clf.fit(X_train, y_train)
    predicted_sgd = sgd_ppl_clf.predict(X_test)
    return sgd_ppl_clf

In [4]:
def make_predict(model, name, df):
    df['main_category'] = df['okpd2_code'].apply(lambda x:x.split('.')[0])
    sug = model.predict([name])
    return df[df['main_category'] == sug[0]].head()

In [5]:
# df_seller_price1 датасет с обработанными данными
def exact_search(name, df_seller_price1, df_normal = df_seller_price):
    text = get_normal_form(name)
    text = text.split()

    df_seller_price1['product_name'] = df_seller_price1['product_name'].str.lower()
    df_seller_price1['country_code'] =  df_seller_price1['country_code'].fillna(-1)
    df_seller_price1['country_code'] = df_seller_price1['country_code'].astype('str').str.split("|")
    all_find = df_seller_price1.explode('country_code')
    i = 0
    # Проверка наличия цены
    for name in text:
        goods_with_price = all_find.shape[0] - all_find['price'].isna().sum()
        if i == 0:
            all_find = all_find[all_find['product_name'].str.contains(name)]
            all_find = all_find[~all_find['price'].isna()] 
            all_find = all_find.dropna()
            county_asses = find_pop_country(all_find) # Самые популярные страны
            all_find = all_find.merge(county_asses, on = 'country_code')
            all_find['score'] = all_find['imp'] + (1 - all_find['price']/ all_find['price'].max())
        elif goods_with_price > 0:
            all_find['score'][all_find['product_name'].str.contains(name)] *= 3
        i += 1        
    all_find['score'][all_find['country_code'] == "643.0" ] *= 1.5  # Россия в этом слове огонь и сила
    all_find = all_find.sort_values(by = 'score', ascending = False)
    all_find = df_normal.loc[all_find['Unnamed: 0']]
    return all_find.head(10)


In [6]:
model = learn_model(df_whole, column_to_ml)

In [11]:
def make_final_df(name, model, df_seller_price, df_seller_price1):
    ml_pred = make_predict(model, name, df_seller_price, )
    df_exact = exact_search(name, df_seller_price1)
    df_res = ml_pred.append(df_exact)
    df_res = df_res[['product_name', 'price', 'inn', 'country_code']]
    return df_res.head(20)
#make_final_df("стол дуб aboba", model, df_seller_price, df_seller_price1)

In [8]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1200)

In [None]:
from tkinter import *
from tkinter import messagebox

def poisk(x):
    test=text.insert(
        'end',
        f"\nРезультаты поиска:\n {make_final_df(x.get(), model, df_seller_price, df_seller_price1)}")
    return test
def Start():
    messagebox.showinfo(
        "Подтвердите данные","Выполнить поиск товара  " + str(
            x.get()))
    Start = poisk(x)
    
def exit_app():
    root.destroy()
root = Tk()
root.title("ZakupkiHack")
#Задаем размер главного окна и его расположение
root.geometry('1200x600+400+400')
#Зафиксировать размер окна
root.resizable(width=False, height=False)
#Задем тип переменных
x = StringVar()

#Коффицент
label1 = Label(text="Поиск")
#Расположение элемента с наименованием
label1.place(x=50,y=10)
#
#Задание поля ввода
message_entry = Entry(textvariable=x)
#Расположение поля ввода
message_entry.place(x=100,y=10)

text = Text(width=300, height=150)
text.place(x=10,y=50)
text.insert('end', u'zakupki')

#Расположение кнопки и ее обновление
Start_build = Button(text="ОК", command=Start)

#Расположение кнопки обработки
Start_build.place(x=245,y=8)
root.mainloop()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_find['score'][all_find['product_name'].str.contains(name)] *= 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_find['score'][all_find['product_name'].str.contains(name)] *= 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_find['score'][all_find['country_code'] == "643.0" ] *= 1.5  # Россия в этом слове огонь и сила
