In [33]:
library(bigrquery)

# Store the project id
projectid = "ceska-televize"

# Set your query
sql <- "SELECT eventDate, deviceID, sessionID, eventLabel from L1.CT_HITS_FACT where eventDate >= '2020-05-01' and eventCategory = 'Video' and eventAction='Prehrat'"

# Run the query and store the data in a dataframe
df <- query_exec(sql, projectid, use_legacy_sql = FALSE, allow_large_results = TRUE, max_pages=Inf)

# Print the query result
nrow(df)

0 bytes processed



In [34]:
library(dplyr)
library(tidyr)

names(df) <- c("date","user","session","porad") 
data <- df %>% group_by(user,porad) %>%
  summarise(
     shlednuti=length(porad)
  ) %>%
  arrange(desc(shlednuti))


head(data)



user,porad,shlednuti
<chr>,<chr>,<int>
2130452816.1573567,Sametová revoluce,434
625748391.1589549,Méďové | Méďové se rodí v lednu,382
434504640.1531561,Herbář | Herbář VII,371
2130452816.1573567,Václav III.,347
697703908.1574501,UčíTelka | Český jazyk,326
300753701.1555777,Výživa,319


In [35]:

datam <- data[1:10000,] %>% spread(porad, shlednuti)

datam

user,1. Newtonův zákon,10 let od ničivé tsunami v Indonésii (2004),100 let od pandemie španělské chřipky,13. komnata | 13. komnata Daniela Landy,13. komnata | 13. komnata Michala Davida,168 hodin | Neděle 10. května,168 hodin | Neděle 16. června 2019,168 hodin | Neděle 17. května,168 hodin | Neděle 24. května,⋯,Znehodnocení peněz,Zprávy | Pondělí 11. května,Zprávy v českém znakovém jazyce | Zprávy ve znakové řeči,Zrádci | Všechno bude OK (6/6),Zrádci | Ze života pozůstalých (3/6),Zrak,Ztracená brána | 3/3,Zuby: Dentální hygiena,Zvuk,她教 | 捷克语
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1000089823.1586371805,,,,,,,,,,⋯,,,,,,,,,,
1000258059.1587452450,,,,,,,,,,⋯,,,,,,,,,,
1000382949.1585159734,,,,,,,,,,⋯,,,,,,,,,,
1001038010.1508251126,,,,,,,,,,⋯,,,,,,,,,,
1001075286.1590577080,,,,,,,,,,⋯,,,,,,,,,,
1001245213.1588498494,,,,,,,,,,⋯,,,,,,,,,,
1001385987.1522166989,,,,,,,,,,⋯,,,,,,,,,,
100147631.1547663562,,,,,,,,,,⋯,,,,,,,,,,
1001833701.1590931493,,,,,,,,,,⋯,,,,,,,,,,
100183888.1441304812,,,,,,,,,,⋯,,,,,,,,,,


In [36]:
library("arules")
library("recommenderlab")

In [37]:

# Create a temporary directory

 
# Write our data.frame to a csv
write.csv(data[,c(1,2)], "/tmp/tall_transactions.csv")
 
# Read that csv back in
relations <- read.transactions(
        file = "/tmp/tall_transactions.csv",
        format = "single",
        sep = ",",
        cols=c("user","porad"),
        rm.duplicates = T,
        header=TRUE
)
summary(relations)


transactions as itemMatrix in sparse format with
 911987 rows (elements/itemsets/transactions) and
 56152 columns (items) and a density of 4.926816e-05 

most frequent items:
                        Polopatě | Polopatě 
                                      31766 
Všechno, co mám ráda | Všechno, co mám ráda 
                                      21701 
                       Anatomie zrady | 2/2 
                                      16306 
                     UčíTelka | Český jazyk 
                                      15254 
              168 hodin | Neděle 10. května 
                                      14637 
                                    (Other) 
                                    2423353 

element (itemset/transaction) length distribution:
sizes
     1      2      3      4      5      6      7      8      9     10     11 
565547 134618  62143  35311  23215  16865  11859   9705   7352   5781   4693 
    12     13     14     15     16     17     18     19     20     21  

In [39]:
rules <- apriori(relations, parameter = list(support = 0.001, confidence = 0.6))

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.6    0.1    1 none FALSE            TRUE       5   0.001      1
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 911 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[56152 item(s), 911987 transaction(s)] done [1.57s].
sorting and recoding items ... [513 item(s)] done [0.05s].
creating transaction tree ... done [0.54s].
checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.34s].
writing ... [53104 rule(s)] done [0.01s].
creating S4 object  ... done [0.37s].


In [40]:
inspect(head(rules, n = 50, by = "confidence"))

     lhs                     rhs                      support confidence    coverage     lift count
[1]  {Labyrint | 1/7,                                                                              
      Labyrint | 2/7,                                                                              
      Labyrint | 3/7,                                                                              
      Labyrint | 5/7,                                                                              
      Labyrint | 6/7}     => {Labyrint | 4/7}     0.001006593  0.9967427 0.001009883 603.5965   918
[2]  {Labyrint | 1/7,                                                                              
      Labyrint | 2/7,                                                                              
      Labyrint | 3/7,                                                                              
      Labyrint | 5/7}     => {Labyrint | 4/7}     0.001083349  0.9959677 0.001087735 603.1272   988


In [65]:
library(stringr)

data$serie <- str_split_fixed(data$porad,"\\|",2)[,1]
head(data)


user,porad,shlednuti,serie
<chr>,<chr>,<int>,<chr>
2130452816.1573567,Sametová revoluce,434,Sametová revoluce
625748391.1589549,Méďové | Méďové se rodí v lednu,382,Méďové
434504640.1531561,Herbář | Herbář VII,371,Herbář
2130452816.1573567,Václav III.,347,Václav III.
697703908.1574501,UčíTelka | Český jazyk,326,UčíTelka
300753701.1555777,Výživa,319,Výživa


In [66]:


data_serie <- data %>% group_by(user,serie) %>%
  summarise(
     shlednuti=sum(shlednuti)
  ) %>%
  arrange(desc(shlednuti))


head(data_serie)

user,serie,shlednuti
<chr>,<chr>,<int>
1381557243.1562097,AZ-kvíz,940
697703908.1574501,UčíTelka,707
277334440.1502122,AZ-kvíz,657
1255427073.158533,Události v kultuře,653
1531236250.158842,AZ-kvíz,650
593345317.1572793,Buly hokej živě,634


In [67]:

 
# Write our data.frame to a csv
write.csv(data_serie[,c(1,2)], "/tmp/tall_serie.csv")
 
# Read that csv back in
relations <- read.transactions(
        file = "/tmp/tall_serie.csv",
        format = "single",
        sep = ",",
        cols=c("user","serie"),
        rm.duplicates = T,
        header=TRUE
)
summary(relations)

transactions as itemMatrix in sparse format with
 911987 rows (elements/itemsets/transactions) and
 8555 columns (items) and a density of 0.0002008416 

most frequent items:
           Události            168 hodin             Polopatě  
               43012                41373                31766 
Případy 1. oddělení             UčíTelka               (Other) 
               29446                26666              1394713 

element (itemset/transaction) length distribution:
sizes
     1      2      3      4      5      6      7      8      9     10     11 
679453 117086  45625  22738  13152   8562   5789   4207   3055   2272   1711 
    12     13     14     15     16     17     18     19     20     21     22 
  1366   1026    887    732    521    448    421    323    291    241    215 
    23     24     25     26     27     28     29     30     31     32     33 
   178    166    149    139     94     78     81     78     68     68     75 
    34     35     36     37     38     39   

In [71]:
rules <- apriori(relations, parameter = list(support = 0.0005, confidence = 0.5))

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.5    0.1    1 none FALSE            TRUE       5   5e-04      1
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 455 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[8555 item(s), 911987 transaction(s)] done [0.73s].
sorting and recoding items ... [465 item(s)] done [0.03s].
creating transaction tree ... done [0.77s].
checking subsets of size 1 2 3 4 done [0.03s].
writing ... [27 rule(s)] done [0.00s].
creating S4 object  ... done [0.34s].


In [72]:
inspect(head(rules, n = 50, by = "confidence"))

     lhs                                 rhs                              support confidence     coverage      lift count
[1]  {Labyrint ,                                                                                                         
      Labyrint III }                  => {Labyrint II. }             0.0008388277  0.8351528 0.0010044003 319.34949   765
[2]  {Otázky Václava Moravce ,                                                                                           
      Reportéři ČT ,                                                                                                     
      Události, komentáře }           => {168 hodin }                0.0005230338  0.7871287 0.0006644832  17.35072   477
[3]  {Studio ČT24 ,                                                                                                      
      Události za okamžik a počasí }  => {Události }                 0.0005394814  0.7500000 0.0007193085  15.90231   492
[4]  {168 hodin ,       