# Подсчет частотности пар товаров в продуктовых чеках

В файле содержится информация о покупках людей.

* id – идентификатор покупки (в одну покупку входят все товары, купленные пользователем за один поход в магазин)
* Товар – наименование товара
* Количество – число единиц купленного товара

Воспользуйтесь этими данными и выясните, какие пары товаров пользователи чаще всего покупают вместе. По сути, вам необходимо найти паттерны покупок, что позволит оптимизировать размещение продуктов в магазине для удобства пользователей и увеличения выручки.

# **Худший вариант по производительности. Не получилось провести необходимые вычисления на всем массиве данных за разумный отрезок времени!**

In [1]:
%%capture
# Install and start a local PostgreSQL server (cell output is suppressed by %%capture)
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql
!sudo service postgresql start

# Set the password `postgres` for the user `postgres`
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';"

# Recreate the database named `db` from scratch (drop it if present, then create it)
!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS db;'
!sudo -u postgres psql -U postgres -c 'CREATE DATABASE db;'

In [2]:
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Connection parameters for the local PostgreSQL instance
user = 'postgres'
password = 'postgres'
host = 'localhost'
post = 5432  # port number; NOTE(review): name looks like a typo for `port`, but the URL-building cell reads `post`
name = 'db'  # database name

In [4]:
# Connection URL: postgresql://<user>:<password>@<host>:<port>/<database>
point = f'postgresql://{user}:{password}@{host}:{post}/{name}'

In [5]:
# Create the SQLAlchemy engine for the connection URL stored in `point`
con = create_engine(point)

  """)


In [167]:
# Load the source CSV into a DataFrame
df_test = pd.read_csv('/content/data_two_columns.csv')

In [168]:
# Keep only the first 1000 rows for the experiments below
df_test = df_test.iloc[:1000]

In [169]:
# Write the sample into the `sales_test` table, replacing the table if it already exists;
# index=False leaves the DataFrame index out, method='multi' sends multi-row INSERT statements
df_test.to_sql("sales_test", con, if_exists='replace', index=False, method='multi')

In [170]:
def select_postgresql(sql):
    """Execute ``sql`` through the module-level ``con`` engine and return the rows as a DataFrame."""
    result = pd.read_sql(sql, con=con)
    return result

In [171]:
# Preview query: up to 10 rows of sales_test
sql = """select s.* from sales_test as s limit 10"""

In [172]:
# Run the query currently stored in `sql` and print the resulting DataFrame
print(select_postgresql(sql))

  invoiceno stockcode
0    536365    85123A
1    536365     71053
2    536365    84406B
3    536365    84029G
4    536365    84029E
5    536365     22752
6    536365     21730
7    536366     22633
8    536366     22632
9    536367     84879


In [173]:
# Pair-frequency query, array-based approach:
#   1. tbl_stockcode    - distinct stock codes;
#   2. tbl_couple_codes - every unordered pair of codes (cross join, stockcode1 < stockcode2);
#   3. tbl_group_codes  - per invoice, an array of its distinct stock codes.
# A pair is matched to every invoice whose array contains both codes; the matches
# are counted per pair and the 10 most frequent pairs are returned (ties ordered
# by stockcode1, stockcode2).
sql = """with tbl_stockcode as (select distinct s.stockcode from sales_test as s),

              tbl_couple_codes as (select t1.stockcode as stockcode1, 
                                          t2.stockcode as stockcode2
                                   from tbl_stockcode as t1 cross join tbl_stockcode as t2 
                                   where t1.stockcode < t2.stockcode),

              tbl_group_codes as (select t.list_stockcode
                                  from (select s.invoiceno, array_agg(distinct s.stockcode) as list_stockcode
                                       from sales_test as s
                                       group by s.invoiceno) as t)

              select t1.stockcode1, t1.stockcode2, count(t2.list_stockcode)
              from tbl_couple_codes as t1 left join tbl_group_codes as t2
                                                   on t1.stockcode1 = any(t2.list_stockcode) 
                                                      and t1.stockcode2 = any(t2.list_stockcode)
              where t2.list_stockcode is not null
              group by t1.stockcode1, t1.stockcode2
              order by count(t2.list_stockcode) desc,
                      t1.stockcode1, t1.stockcode2
              limit 10;"""

In [174]:
%%time
# Time the execution of the query currently stored in `sql` and print its result
print(select_postgresql(sql))

  stockcode1 stockcode2  count
0      22632      22633      8
1     84029G     85123A      6
2      21730      22752      5
3      21730      71053      5
4      21730     84029E      5
5      21730     84029G      5
6      21730     84406B      5
7      21730     85123A      5
8      22752      71053      5
9      22752     84029E      5
CPU times: user 44.9 ms, sys: 5.3 ms, total: 50.2 ms
Wall time: 6.05 s


In [175]:
# Pair-frequency query, function-based approach:
#   * func_count(code1, code2) returns the number of invoices in sales_test that
#     contain both stock codes (invoices filtered to the two codes, grouped, and
#     kept when they hold at least 2 distinct codes);
#   * the main query builds every unordered pair of distinct stock codes
#     (cross join, stockcode1 < stockcode2), calls func_count for each pair and
#     returns the 10 pairs with the highest count (ties ordered by the codes).
# DROP uses IF EXISTS so the script also runs when the function has not been
# created yet (e.g. right after the database was recreated); a plain
# DROP FUNCTION raises an error in that case.
sql = """  DROP FUNCTION IF EXISTS func_count(code1 text, code2 text);
           CREATE OR REPLACE FUNCTION func_count(code1 text, code2 text)
                        RETURNS bigint AS
                 $BODY$
                        	select count(s.invoiceno)						 
                            from   (select s.invoiceno 
				                            from sales_test  as s
				                            where s.stockcode = code1 or s.stockcode = code2
				                            group by s.invoiceno
				                            having count(distinct s.stockcode)>=2) as s;
                 $BODY$
                       LANGUAGE 'sql';

          with tbl_stockcode_unique  as (select distinct s.stockcode
      							                    from sales_test as s),
	             tbl_stockcode_couple as (select t1.stockcode as stockcode1,
							                                t2.stockcode as stockcode2
							                        from tbl_stockcode_unique as t1 cross join tbl_stockcode_unique as t2
							                       where t1.stockcode < t2.stockcode)

        select t.stockcode1, 
              t.stockcode2, 
              func_count(t.stockcode1, t.stockcode2) as total_amount
        from  tbl_stockcode_couple as t
        order by  func_count(t.stockcode1, t.stockcode2) desc,
                  t.stockcode1, t.stockcode2            
        limit 10"""

In [176]:
%%time
# Time the execution of the query currently stored in `sql` and print its result
print(select_postgresql(sql))

  stockcode1 stockcode2  total_amount
0      22632      22633             8
1     84029G     85123A             6
2      21730      22752             5
3      21730      71053             5
4      21730     84029E             5
5      21730     84029G             5
6      21730     84406B             5
7      21730     85123A             5
8      22752      71053             5
9      22752     84029E             5
CPU times: user 138 ms, sys: 17.6 ms, total: 156 ms
Wall time: 26.2 s
