# **Подсчет количества пар товаров в продуктовых чеках**

# Вариант 1. Python

In [1]:
import pandas as pd
import itertools
import hashlib
from tqdm import tqdm_notebook

In [2]:
# Load the invoice/stock-code rows from the CSV export and preview the first
# five records.
df = pd.read_csv(filepath_or_buffer='/content/data_two_columns.csv')
df.head(n=5)

Unnamed: 0,invoiceno,stockcode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


In [3]:
# Collapse the rows to one record per invoice; building a set per invoice
# removes repeated stock codes within the same invoice.
stockcodes_by_invoice = df.groupby(by=['invoiceno'])['stockcode']
df_group = stockcodes_by_invoice.apply(set).reset_index(name='list_stockcode')
df_group.head()

Unnamed: 0,invoiceno,list_stockcode
0,536365,"{21730, 84406B, 71053, 84029E, 85123A, 22752, ..."
1,536366,"{22632, 22633}"
2,536367,"{22623, 21777, 22749, 22622, 84879, 48187, 227..."
3,536368,"{22960, 22914, 22913, 22912}"
4,536369,{21756}


In [4]:
%%time
# Count, across all invoices, how many invoices contain each unordered pair
# of stock codes.
#
# Both dictionaries share the same key: the pair as a sorted 2-tuple.
#   dict_couple_products_hash   -> the pair as a two-element list
#   dict_couple_products_amount -> number of invoices containing the pair
#
# A tuple is already hashable, so no MD5 digest is needed to build a key;
# this removes the per-pair encode/hash cost and any chance of two different
# pairs sharing a digest.
dict_couple_products_hash = {}
dict_couple_products_amount = {}

for current_set in df_group['list_stockcode']:
    # Sort once per invoice: combinations() of a sorted sequence yields pairs
    # that are already ordered, so ('a', 'b') and ('b', 'a') can never both
    # occur. It also makes the first-seen order of pairs independent of set
    # iteration order.
    for pair in itertools.combinations(sorted(current_set), 2):
        if pair in dict_couple_products_amount:
            dict_couple_products_amount[pair] += 1
        else:
            dict_couple_products_hash[pair] = list(pair)
            dict_couple_products_amount[pair] = 1

CPU times: user 1min 25s, sys: 2 s, total: 1min 27s
Wall time: 1min 29s


In [6]:
%%time
# Turn the two per-pair dictionaries into parallel lists: position i of both
# lists describes the same pair.
list_couple_products = [
    dict_couple_products_hash.get(pair_key)
    for pair_key in dict_couple_products_hash
]
list_amount = [
    dict_couple_products_amount.get(pair_key)
    for pair_key in dict_couple_products_hash
]

CPU times: user 4.59 s, sys: 20.4 ms, total: 4.61 s
Wall time: 5.15 s


In [7]:
# Assemble the result table (pair, number of invoices) and order it so the
# most frequent pairs come first.
dict_data = {
    'two_products': list_couple_products,
    'amount': list_amount,
}
df_final = pd.DataFrame(data=dict_data).sort_values(by=['amount'], ascending=False)
df_final.head()

Unnamed: 0,two_products,amount
10862,"[22386, 85099B]",833
40238,"[22697, 22699]",784
5653,"[21931, 85099B]",733
17321,"[22411, 85099B]",683
7810,"[20725, 22383]",663


# Вариант 2. Spark

In [8]:
%%capture
!pip install pyspark

In [9]:
import itertools
from typing import List

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, size,explode

In [10]:
# Start (or reuse) a Spark session for the "CoupleCodes" application with the
# 'local[*]' master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("CoupleCodes")
    .getOrCreate()
)

In [11]:
# Read the same CSV with Spark: the first line is the header, fields are
# comma-separated, and column types are inferred from the data.
df = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('delimiter', ',')
    .option('header', 'true')
    .load('/content/data_two_columns.csv')
)

In [13]:
# Preview five rows. Limiting on the Spark side means only those rows are
# converted to pandas, instead of collecting the whole DataFrame to the driver
# just to discard all but the first five.
df.limit(5).toPandas()

Unnamed: 0,invoiceno,stockcode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


In [14]:
# One row per invoice with the distinct stock codes it contains (collect_set
# drops duplicates); marked for caching because later cells reuse it.
df_group = (
    df
    .groupBy('invoiceno')
    .agg(F.collect_set('stockcode').alias('list_stockcode'))
    .cache()
)

In [15]:
# Preview five grouped rows. Limiting in Spark first avoids pulling every
# invoice to the driver only to show five of them.
df_group.limit(5).toPandas()

Unnamed: 0,invoiceno,list_stockcode
0,536596,"[22900, 22114, 84926A, 22802, 21624, 21967]"
1,536938,"[22112, 21931, 84997A, 21258, 22386, 22837, 20..."
2,537252,[22197]
3,537691,"[22505, 46000R, 22147, 22470, 84580, 84755, 22..."
4,538041,[22145]


In [16]:
def create_combinations_stockcode(list_stockcode: List) -> List[str]:
    """Return every unordered pair of stock codes as an ``"a, b"`` string.

    The codes are sorted before pairing, so within each returned string the
    smaller code comes first and the same two codes always produce the same
    string, whatever order they arrived in.

    Args:
        list_stockcode: Stock codes of a single invoice. ``None`` or an empty
            collection is accepted and produces no pairs.

    Returns:
        A list of ``"<code_a>, <code_b>"`` strings, one per pair; empty when
        fewer than two codes are supplied.
    """
    # A null array reaches a Python UDF as None; treat it like an empty invoice
    # instead of letting sorted(None) raise inside the executor.
    if not list_stockcode:
        return []
    return [
        f'{first}, {second}'
        for first, second in itertools.combinations(sorted(list_stockcode), 2)
    ]

In [17]:
# Wrap the pair generator as a Spark UDF that returns an array of strings.
combinations_stockcode_udf = udf(
    create_combinations_stockcode,
    returnType=T.ArrayType(T.StringType()),
)

In [21]:
# Attach the pair strings of every invoice together with their number.
df_group_combinations = (
    df_group
    .withColumn("combinations_stockcode", combinations_stockcode_udf(col("list_stockcode")))
    .withColumn("size_array", size(col("combinations_stockcode")))
)
# Keep only invoices that produced at least one pair, and only the pair column.
df_final = (
    df_group_combinations
    .filter(col("size_array") > 0)
    .select("combinations_stockcode")
)

In [23]:
%%time
# One row per (invoice, pair), then count the invoices per pair and show the
# five most frequent pairs.
# limit(5) is applied in Spark after the sort, so only five rows are converted
# to pandas instead of transferring every distinct pair to the driver and
# discarding all but the first five there.
(
    df_final
    .select(explode(col('combinations_stockcode')).alias("combination_stockcode"))
    .groupBy('combination_stockcode')
    .count()
    .sort(col("count").desc())
    .limit(5)
    .toPandas()
)

CPU times: user 23.3 s, sys: 1.97 s, total: 25.3 s
Wall time: 4min 49s


Unnamed: 0,combination_stockcode,count
0,"22386, 85099B",833
1,"22697, 22699",784
2,"21931, 85099B",733
3,"22411, 85099B",683
4,"20725, 22383",663


# Вариант 3. PostgreSQL

In [24]:
%%capture
# Install postgresql server
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql
!sudo service postgresql start

# Setup a password `postgres` for username `postgres`
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';"

# Set up a database named `db` to be used
!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS db;'
!sudo -u postgres psql -U postgres -c 'CREATE DATABASE db;'

In [25]:
from sqlalchemy import create_engine

In [26]:
# Connection settings for the PostgreSQL server used by the cells below.
user = 'postgres'
password = 'postgres'
host = 'localhost'
port = 5432
post = port  # original (misspelled) name, kept so existing references still work
name = 'db'

# SQLAlchemy connection URL: postgresql://<user>:<password>@<host>:<port>/<database>
point = 'postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, name)

# Engine shared by the cells that write to and query the database.
con = create_engine(point)


In [27]:
# Load the CSV and write it to the `sales` table, replacing the table if it
# already exists; the DataFrame index is not stored, and rows are sent as
# multi-row INSERT statements.
df_sql = pd.read_csv('/content/data_two_columns.csv')
df_sql.to_sql(
    name="sales",
    con=con,
    if_exists='replace',
    index=False,
    method='multi',
)

In [28]:
def select_postgresql(sql):
    """Run ``sql`` on the module-level connection ``con``.

    Args:
        sql: Query text handed to ``pandas.read_sql``.

    Returns:
        The query result as a pandas DataFrame.
    """
    query_result = pd.read_sql(sql=sql, con=con)
    return query_result

In [29]:
# Sanity-check query: fetch at most ten rows from the `sales` table.
sql = """select s.* from sales as s limit 10"""

In [30]:
# Run the query currently stored in `sql` and display the first five rows of
# the result.
select_postgresql(sql).head()

Unnamed: 0,invoiceno,stockcode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


In [31]:
# Pair-count query, evaluated entirely inside PostgreSQL:
#   1. tbl_no_duplicates: GROUP BY (invoiceno, stockcode) collapses repeated
#      rows, so each stock code appears once per invoice.
#   2. tbl_list_code_combinations: for every invoice, cross-joins the invoice's
#      stock codes with themselves, keeps only the combinations where the
#      first code is less than the second (each unordered pair once), and
#      aggregates them into an array of "a, b" strings.
#   3. Final SELECT: unnests those arrays, counts the occurrences of each pair
#      string, and returns the ten most frequent pairs.
sql = """with tbl_no_duplicates as (select s.invoiceno, 
                                           s.stockcode 
                                    from sales as s 
                                    group by s.invoiceno, 
                                             s.stockcode),
      
              tbl_list_code_combinations as (select t.invoiceno, (select array_agg(concat(cast(t1.* as text),', ',cast(t2.* as text)))
                                                                  from unnest(array_agg(t.stockcode)) as t1 
                                                                                     cross join unnest(array_agg(t.stockcode)) as t2
                                                                  where t1.* < t2.*) as agg
                                             from tbl_no_duplicates as t
                                            group by t.invoiceno)

select unnest(t.agg) as couple_stockcode, count(*)
from tbl_list_code_combinations as t
 group by  unnest(t.agg) 
 order by count(*) desc
 limit 10"""

In [33]:
%%time
# Run the query currently stored in `sql` and display the first five rows of
# the result.
select_postgresql(sql).head()

CPU times: user 315 ms, sys: 47.6 ms, total: 363 ms
Wall time: 58.8 s


Unnamed: 0,couple_stockcode,count
0,"22386, 85099B",833
1,"22697, 22699",784
2,"21931, 85099B",733
3,"22411, 85099B",683
4,"20725, 22383",663
