# **Подсчет количества пар товаров в продуктовых чеках**

# Вариант 1. Python

In [1]:
import itertools
from collections import Counter

import pandas as pd

In [2]:
# Load the receipt lines (one row per invoice number / stock code) and preview them.
df = pd.read_csv('/content/data_two_columns.csv')
df.head()

Unnamed: 0,invoiceno,stockcode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


In [3]:
# Collapse every invoice into the set of distinct stock codes it contains.
df_group = (
    df.groupby(['invoiceno'])['stockcode']
      .apply(set)
      .reset_index(name='list_stockcode')
)
df_group.head()

Unnamed: 0,invoiceno,list_stockcode
0,536365,"{85123A, 22752, 71053, 84406B, 84029E, 84029G,..."
1,536366,"{22632, 22633}"
2,536367,"{21777, 22745, 84969, 48187, 21754, 22749, 217..."
3,536368,"{22912, 22960, 22914, 22913}"
4,536369,{21756}


In [4]:
%%time
# Tally, for every unordered pair of stock codes, how many invoices contain both.
dict_couple_products_amount = Counter()

for current_set in df_group['list_stockcode']:
  # Sorting the basket once makes combinations() emit each pair as (smaller, larger),
  # so a pair always maps to the same key regardless of set iteration order.
  # Baskets with fewer than two products simply yield no pairs.
  dict_couple_products_amount.update(itertools.combinations(sorted(current_set), 2))

CPU times: user 37.4 s, sys: 345 ms, total: 37.7 s
Wall time: 37.8 s


In [5]:
# Turn the pair counts into a table ordered from the most to the least frequent pair.
df_final = (
    pd.DataFrame(dict_couple_products_amount.items(), columns=['couple_stockcode', 'count'])
      .sort_values(by=['count'], ascending=False)
)
df_final.head()

Unnamed: 0,couple_stockcode,count
11618,"(22386, 85099B)",833
84747,"(22697, 22699)",784
6119,"(21931, 85099B)",733
17336,"(22411, 85099B)",683
7438,"(20725, 22383)",663


# Вариант 2. Spark

In [6]:
%%capture
!pip install pyspark

In [7]:
import itertools
from typing import List

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import udf, col, size,explode

In [8]:
# Start (or reuse) a Spark session that runs locally inside this machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("CoupleCodes")
    .getOrCreate()
)

In [9]:
# Read the same receipts file as a Spark DataFrame, letting Spark infer column types.
df = (
    spark.read.format('csv')
    .options(inferSchema='true', delimiter=',', header='true')
    .load('/content/data_two_columns.csv')
)

In [10]:
# Preview only: limit inside Spark so that five rows, not the whole dataset,
# are collected to the driver and converted to pandas.
df.limit(5).toPandas()

Unnamed: 0,invoiceno,stockcode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


In [11]:
# One row per invoice holding the array of its distinct stock codes.
df_group = (
    df.groupBy('invoiceno')
    .agg(F.collect_set('stockcode').alias('list_stockcode'))
)

In [12]:
# Preview only: limit inside Spark so that five invoices, not the whole
# aggregated table, are collected to the driver and converted to pandas.
df_group.limit(5).toPandas()

Unnamed: 0,invoiceno,list_stockcode
0,536366,"[22632, 22633]"
1,536367,"[22310, 22622, 21755, 22623, 84879, 84969, 217..."
2,536371,[22086]
3,536374,[21258]
4,536375,"[84029E, 21730, 82483, 82482, 20679, 71053, 37..."


In [13]:
# An invoice with a single distinct product cannot form a pair, so keep only larger ones.
df_group = df_group.where(size(col('list_stockcode')) > 1)
df_group.show()

+---------+--------------------+
|invoiceno|      list_stockcode|
+---------+--------------------+
|   536366|      [22632, 22633]|
|   536367|[22310, 22622, 21...|
|   536375|[84029E, 21730, 8...|
|   536377|      [22632, 22633]|
|   536384|[21324, 82484, 22...|
|   536385|[22663, 22961, 85...|
|   536386|[85099B, 84880, 8...|
|   536387|[21731, 22466, 22...|
|   536389|[22941, 22191, 22...|
|   536392|[22502, 21889, 84...|
|   536394|[22866, 22865, 85...|
|   536395|[21212, 22867, 22...|
|   536396|[84029E, 21730, 8...|
|   536398|[22449, 48185, 22...|
|   536399|      [22632, 22633]|
|   536402|[22910, 22837, 22...|
|   536403|       [22867, POST]|
|   536404|[21212, 22805, 85...|
|   536407|      [22632, 22633]|
|   536408|[21212, 22813, 84...|
+---------+--------------------+
only showing top 20 rows



In [14]:
def create_combinations_stockcode(list_stockcode: List) -> List:
    """Build every unordered pair of stock codes found in one invoice.

    The codes are sorted first, so each pair is always rendered with the
    smaller code on the left and a given pair of products maps to exactly
    one string.

    Args:
        list_stockcode: Stock codes of a single invoice. ``None`` (a NULL
            array handed over by Spark) is treated as an empty invoice.

    Returns:
        List of strings of the form ``"<code_a>, <code_b>"``; empty when the
        invoice holds fewer than two codes.
    """
    if list_stockcode is None:
        return []
    ordered_codes = sorted(list_stockcode)
    return [', '.join(map(str, pair)) for pair in itertools.combinations(ordered_codes, 2)]

In [15]:
# Expose the pair builder to Spark as a UDF that returns an array of strings.
combinations_stockcode_udf = udf(
    create_combinations_stockcode,
    T.ArrayType(T.StringType()),
)

In [16]:
# Attach to every invoice the array of "code_a, code_b" pairs built by the UDF.
df_group_combinations = df_group.withColumn(
    "combinations_stockcode",
    combinations_stockcode_udf(col("list_stockcode")),
)

In [18]:
%%time
# One row per (invoice, pair), then count invoices per pair. The result is
# sorted and truncated inside Spark so that only the five most frequent pairs
# are transferred to the driver, instead of collecting every pair first.
(
    df_group_combinations
    .select(explode(col('combinations_stockcode')).alias("couple_stockcode"))
    .groupBy('couple_stockcode')
    .count()
    .sort(col("count").desc())
    .limit(5)
    .toPandas()
)

CPU times: user 17.5 s, sys: 584 ms, total: 18.1 s
Wall time: 2min 43s


Unnamed: 0,couple_stockcode,count
0,"22386, 85099B",833
1,"22697, 22699",784
2,"21931, 85099B",733
3,"22411, 85099B",683
4,"20725, 22383",663


# Вариант 3. PostgreSQL

In [19]:
%%capture
# Install postgresql server
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql
!sudo service postgresql start

# Setup a password `postgres` for username `postgres`
!sudo -u postgres psql -U postgres -c "ALTER USER postgres PASSWORD 'postgres';"

# Setup a database with name `db` to be used
!sudo -u postgres psql -U postgres -c 'DROP DATABASE IF EXISTS db;'
!sudo -u postgres psql -U postgres -c 'CREATE DATABASE db;'

In [20]:
from sqlalchemy import create_engine

In [21]:
# Connection settings for the PostgreSQL server running on this machine.
user = 'postgres'
password = 'postgres'
host = 'localhost'
post = 5432  # port segment of the connection URL
name = 'db'

# SQLAlchemy connection URL assembled from the settings above.
point = f'postgresql://{user}:{password}@{host}:{post}/{name}'

con = create_engine(point)

  """)


In [22]:
# Read the receipts with pandas and (re)create the "sales" table from them.
df_sql = pd.read_csv('/content/data_two_columns.csv')
df_sql.to_sql(name="sales", con=con, if_exists='replace', index=False, method='multi')

In [23]:
def select_postgresql(sql, connection=None):
    """Run a SQL query and return its result set as a pandas DataFrame.

    Args:
        sql: Text of the query to execute.
        connection: Optional connectable passed to ``pd.read_sql``. When
            omitted, the notebook-level ``con`` engine is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        pandas.DataFrame holding the rows produced by the query.
    """
    if connection is None:
        # Fall back to the engine created earlier in the notebook.
        connection = con
    return pd.read_sql(sql, connection)

In [24]:
# Smoke-test query: fetch at most 10 rows from the "sales" table.
sql = """select s.* from sales as s limit 10"""

In [25]:
# Run the query stored in `sql` and preview the first rows of the result.
select_postgresql(sql).head()

Unnamed: 0,invoiceno,stockcode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


In [26]:
# Pair-count query, in three steps:
#   1. tbl_no_duplicates: distinct (invoiceno, stockcode) rows, so that a product
#      repeated inside one invoice is taken into account only once.
#   2. tbl_list_code_combinations: per invoice, the stock codes are aggregated into
#      an array, the array is cross-joined with itself, and only the combinations
#      with t1 < t2 are kept, so each pair appears once as the text "code_a, code_b".
#   3. Final select: the per-invoice pair arrays are unnested and the rows are
#      counted per pair; the 10 pairs with the highest counts are returned.
sql = """with tbl_no_duplicates as (select s.invoiceno, 
                                           s.stockcode 
                                    from sales as s 
                                    group by s.invoiceno, 
                                             s.stockcode),
      
              tbl_list_code_combinations as (select t.invoiceno, (select array_agg(concat(cast(t1.* as text),', ',cast(t2.* as text)))
                                                                  from unnest(array_agg(t.stockcode)) as t1 
                                                                                     cross join unnest(array_agg(t.stockcode)) as t2
                                                                  where t1.* < t2.*) as agg
                                             from tbl_no_duplicates as t
                                            group by t.invoiceno)

select unnest(t.agg) as couple_stockcode, count(*)
from tbl_list_code_combinations as t
 group by  unnest(t.agg) 
 order by count(*) desc
 limit 10"""

In [27]:
%%time
# Execute the pair-count query held in `sql` and show the first rows of its result.
select_postgresql(sql).head()

CPU times: user 210 ms, sys: 25.5 ms, total: 235 ms
Wall time: 45.9 s


Unnamed: 0,couple_stockcode,count
0,"22386, 85099B",833
1,"22697, 22699",784
2,"21931, 85099B",733
3,"22411, 85099B",683
4,"20725, 22383",663
