In [1]:
#!pip install sqlalchemy pandas mlxtend


# Notebook for apriori

## Code

In [2]:
from sqlalchemy import create_engine, text, MetaData, Table, select, and_
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

from mlxtend.frequent_patterns import apriori


class V30:
    """
    Helper around a SQLite database (accessed through SQLAlchemy) that fetches
    grouped transaction rows and feeds them to the mlxtend Apriori algorithm.

    On construction every table listed in ``sqlite_master`` is reflected and
    stored in ``self._t`` under its table name. ``get_transactions`` expects the
    tables ``transaction_head`` and ``transaction`` to exist.

    The instance owns an open connection; release it with ``close()`` or use the
    instance as a context manager (``with V30(path) as v30: ...``).
    """

    # Placeholders only: the real values are created per instance in __init__,
    # so two V30 objects never share a table registry or an encoder.
    _engine = None
    _conn = None
    _metadata = None
    _te = None
    _t = None

    def __init__(self, file_path):
        """
        Open the SQLite database and reflect its tables.

        Parameters:
        - file_path (str): Path of the SQLite file; it is interpolated into a
          ``sqlite:///{file_path}`` URL (relative path).
        """
        self._engine = create_engine(f'sqlite:///{file_path}')
        self._metadata = MetaData()
        self._te = TransactionEncoder()
        self._t = {}
        self._conn = self._engine.connect()
        try:
            self.__init_tables()
        except Exception:
            # Do not leave the connection open when reflection fails.
            self.close()
            raise

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Close the database resources when the ``with`` block ends."""
        self.close()

    def close(self):
        """Close the connection and dispose of the engine, if they were created."""
        if self._conn is not None:
            self._conn.close()
        if self._engine is not None:
            self._engine.dispose()

    def __init_tables(self):
        """Reflect every table named in ``sqlite_master`` into ``self._t``."""
        q = text("SELECT name FROM sqlite_master WHERE type='table'")
        table_names = [str(row[0]) for row in self._conn.execute(q).all()]
        for name in table_names:
            self._t[name] = Table(name, self._metadata, autoload_with=self._engine)

    def execute_text(self, q):
        """Execute the raw SQL string ``q`` on the open connection and return the result."""
        return self._conn.execute(text(q))

    def execute(self, stmt):
        """Execute the SQLAlchemy statement ``stmt`` on the open connection and return the result."""
        return self._conn.execute(stmt)

    @staticmethod
    def _as_list(values):
        """Normalise an optional filter argument (None, a single string, or any iterable) to a list."""
        if values is None:
            return []
        if isinstance(values, str):
            # A bare string means one value, not an iterable of characters.
            return [values] if values else []
        return list(values)

    @staticmethod
    def _deprel_filters(deprel_column, skip_deprels=None, include_deprels=None):
        """Build the WHERE conditions for the deprel column; include_deprels wins over skip_deprels."""
        include = V30._as_list(include_deprels)
        if include:
            return [deprel_column.in_(include)]
        skip = V30._as_list(skip_deprels)
        if skip:
            return [deprel_column.notin_(skip)]
        return []

    @staticmethod
    def _group_rows(rows, columns=None):
        """Group row mappings by their 'head_id' value, keeping only the keys listed in ``columns`` (all keys if empty)."""
        wanted = set(V30._as_list(columns))
        grouped = {}
        for row in rows:
            row = dict(row)
            if wanted:
                # Keys keep the order they have in the row, not the order of ``columns``.
                item = {key: value for key, value in row.items() if key in wanted}
            else:
                item = row
            grouped.setdefault(row["head_id"], []).append(item)
        # dicts preserve insertion order, so groups come out in first-seen order.
        return list(grouped.values())

    def get_transactions(self, verb, verb_compound, columns=None, skip_deprels=None, include_deprels=None):
        """
        Fetches transactions from the database and returns them grouped by 'head_id'.

        Parameters:
        - verb (str): The main verb to filter transactions by.
        - verb_compound (str): Additional compound verb information for filtering.
        - columns (iterable of str, optional): Specifies which columns to include in each returned dictionary.
            If empty or None, all columns are included. A single string is treated as one column name.
        - skip_deprels (iterable of str, optional): Dependency relations to exclude from the results.
        - include_deprels (iterable of str, optional): Dependency relations to include in the results.
            If both skip_deprels and include_deprels are provided, include_deprels takes precedence
            and skip_deprels is ignored.

        Returns:
        - list of list of dicts: One inner list per 'head_id'; each dictionary is one row of the
        'transaction' table, restricted to 'columns' when given. Rows are ordered by
        'head_id' and then 'loc'.

        Raises:
        - KeyError: if the 'transaction_head' or 'transaction' table was not found in the database.
        """
        TransactionHead = self._t['transaction_head']
        Transaction = self._t['transaction']

        where_filters = [
            TransactionHead.c.verb == verb,
            TransactionHead.c.verb_compound == verb_compound,
        ]
        where_filters.extend(
            self._deprel_filters(Transaction.c.deprel, skip_deprels, include_deprels)
        )

        # maybe some transaction_head fields should also be included in the results, e.g. TransactionHead.deprel
        stmt = (
            select(Transaction)
            .join(TransactionHead, TransactionHead.c.id == Transaction.c.head_id)
            .where(and_(*where_filters))
            .order_by(Transaction.c.head_id, Transaction.c.loc)
        )

        return self._group_rows(self.execute(stmt).mappings(), columns)

    def dict_to_apriori(self, transactions):
        """
        Converts transaction data into a format suitable for Apriori algorithm processing.

        Parameters:
        - transactions (list of list of dicts): The transaction data, where each transaction is a list of row dictionaries.

        Returns:
        - list of list of tuples: A dataset where each transaction is represented as a list of tuples,
        with each tuple containing the row values in the dictionary's key order.
        """
        return [[tuple(row.values()) for row in tr] for tr in transactions]

    def apriori(self, dataset, min_support=0.5, use_colnames=True):
        """
        Applies the Apriori algorithm on the dataset to find frequent itemsets based on a minimum support threshold.

        Parameters:
        - dataset (list of list of items): The transaction dataset for itemset generation.
        - min_support (float, optional): The minimum support threshold for itemsets to be considered frequent. Default is 0.5.
        - use_colnames (bool, optional): Passed through to mlxtend's apriori. Default is True.

        Displays:
        - A DataFrame of frequent itemsets sorted by their support values in descending order.
        - The transformed dataset DataFrame used for Apriori algorithm.

        Returns nothing; the results are only displayed.
        """
        print('min_support:', min_support)
        if self._te is None:
            # Instance was created without __init__; build the encoder on first use.
            self._te = TransactionEncoder()
        te_ary = self._te.fit(dataset).transform(dataset)
        df = pd.DataFrame(te_ary, columns=self._te.columns_)
        # `display` is the notebook built-in; inside this method `apriori` resolves to the
        # module-level mlxtend function, not to this method.
        display(apriori(df, min_support=min_support, use_colnames=use_colnames).sort_values('support', ascending=False))
        display(df)
        
    

## Examples

In [3]:
# Open the SQLite database (relative path) and reflect its tables; V30.__init__ keeps a connection open.
v30 = V30("v30_koondkorpus_sentences_verb_pattern_obl_20240327-194533.db")

  self._t[t] = Table(t, self._metadata, autoload_with=self._engine)
  self._t[t] = Table(t, self._metadata, autoload_with=self._engine)
  self._t[t] = Table(t, self._metadata, autoload_with=self._engine)
  self._t[t] = Table(t, self._metadata, autoload_with=self._engine)


### alla kirjutama  **deprel** ja **feats** põhjal

In [4]:


# Fetch the transactions as a list of lists of dicts.
transactions = v30.get_transactions(
    verb='kirjutama',
    verb_compound='alla',
    columns=['deprel', 'feats'],
)
print('transactions')
display(transactions[:3])

# Convert the list of lists of dicts into the shape the Apriori algorithm
# needs: a list of lists of tuples.
dataset = v30.dict_to_apriori(transactions)
print('dataset')
display(dataset[:3])

# Run the Apriori algorithm; the result is displayed on screen.
v30.apriori(
    dataset,
    min_support=0.05,
)



transactions


[[{'deprel': 'obl', 'feats': 'all,com,sg'},
  {'deprel': 'aux', 'feats': 'af,aux,indic,pres,ps,ps3,sg'},
  {'deprel': 'compound:prt', 'feats': ''},
  {'deprel': 'nsubj', 'feats': 'com,nom,sg'},
  {'deprel': 'punct', 'feats': ''}],
 [{'deprel': 'obl', 'feats': 'com,gen,sg'},
  {'deprel': 'nsubj', 'feats': 'nom,prop,sg'},
  {'deprel': 'compound:prt', 'feats': ''},
  {'deprel': 'obl', 'feats': 'all,com,sg'},
  {'deprel': 'punct', 'feats': ''}],
 [{'deprel': 'punct', 'feats': ''},
  {'deprel': 'nsubj', 'feats': 'com,nom,pl'},
  {'deprel': 'obl', 'feats': 'com,in,sg'},
  {'deprel': 'compound:prt', 'feats': ''},
  {'deprel': 'obl', 'feats': 'all,com,pl'}]]

dataset


[[('obl', 'all,com,sg'),
  ('aux', 'af,aux,indic,pres,ps,ps3,sg'),
  ('compound:prt', ''),
  ('nsubj', 'com,nom,sg'),
  ('punct', '')],
 [('obl', 'com,gen,sg'),
  ('nsubj', 'nom,prop,sg'),
  ('compound:prt', ''),
  ('obl', 'all,com,sg'),
  ('punct', '')],
 [('punct', ''),
  ('nsubj', 'com,nom,pl'),
  ('obl', 'com,in,sg'),
  ('compound:prt', ''),
  ('obl', 'all,com,pl')]]

min_support: 0.05


Unnamed: 0,support,itemsets
2,0.958303,"((compound:prt, ))"
16,0.896548,"((punct, ))"
35,0.858334,"((punct, ), (compound:prt, ))"
11,0.683522,"((obl, all,com,sg))"
30,0.657131,"((compound:prt, ), (obl, all,com,sg))"
...,...,...
29,0.053309,"((compound:prt, ), (obl, all,com,pl))"
14,0.052359,"((obl, com,in,sg))"
32,0.051832,"((compound:prt, ), (obl, com,gen,sg))"
51,0.051198,"((punct, ), (obl, com,gen,sg))"


Unnamed: 0,"(advcl, )","(advcl, abes,aux,ps,sup)","(advcl, abes,main,ps,sup)","(advcl, abes,mod,ps,sup)","(advcl, ad,com,sg)","(advcl, ad,nominal,sg)","(advcl, ad,pl)","(advcl, ad,sg)","(advcl, af,aux,cond,past,ps)","(advcl, af,aux,cond,pres,ps)",...,"(xcomp, aux,inf)","(xcomp, com,sg,tr)","(xcomp, ill,main,ps,sup)","(xcomp, ill,mod,ps,sup)","(xcomp, inf,main)","(xcomp, inf,mod)","(xcomp, main,partic,past,ps)","(xcomp, mod,partic,past,ps)","(xcomp, pos)","(xcomp, pos,sg,tr)"
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9468,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9469,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9470,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9471,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### alla kirjutama  **deprel** ja **feats** põhjal, filtreeritakse välja deprel = 'punct', 'cc'

In [5]:


# Fetch the transactions as a list of lists of dicts, leaving out the
# 'punct' and 'cc' dependency relations.
transactions = v30.get_transactions(
    verb='kirjutama',
    verb_compound='alla',
    columns=['deprel', 'feats'],
    skip_deprels=['punct', 'cc'],
)
print('transactions')
display(transactions[:3])

# Convert the list of lists of dicts into the shape the Apriori algorithm
# needs: a list of lists of tuples.
dataset = v30.dict_to_apriori(transactions)
print('dataset')
display(dataset[:3])

# Run the Apriori algorithm; the result is displayed on screen.
v30.apriori(
    dataset,
    min_support=0.05,
)



transactions


[[{'deprel': 'obl', 'feats': 'all,com,sg'},
  {'deprel': 'aux', 'feats': 'af,aux,indic,pres,ps,ps3,sg'},
  {'deprel': 'compound:prt', 'feats': ''},
  {'deprel': 'nsubj', 'feats': 'com,nom,sg'}],
 [{'deprel': 'obl', 'feats': 'com,gen,sg'},
  {'deprel': 'nsubj', 'feats': 'nom,prop,sg'},
  {'deprel': 'compound:prt', 'feats': ''},
  {'deprel': 'obl', 'feats': 'all,com,sg'}],
 [{'deprel': 'nsubj', 'feats': 'com,nom,pl'},
  {'deprel': 'obl', 'feats': 'com,in,sg'},
  {'deprel': 'compound:prt', 'feats': ''},
  {'deprel': 'obl', 'feats': 'all,com,pl'}]]

dataset


[[('obl', 'all,com,sg'),
  ('aux', 'af,aux,indic,pres,ps,ps3,sg'),
  ('compound:prt', ''),
  ('nsubj', 'com,nom,sg')],
 [('obl', 'com,gen,sg'),
  ('nsubj', 'nom,prop,sg'),
  ('compound:prt', ''),
  ('obl', 'all,com,sg')],
 [('nsubj', 'com,nom,pl'),
  ('obl', 'com,in,sg'),
  ('compound:prt', ''),
  ('obl', 'all,com,pl')]]

min_support: 0.05


Unnamed: 0,support,itemsets
1,0.958303,"((compound:prt, ))"
10,0.683522,"((obl, all,com,sg))"
26,0.657131,"((compound:prt, ), (obl, all,com,sg))"
4,0.395967,"((nsubj, com,nom,sg))"
20,0.380239,"((nsubj, com,nom,sg), (compound:prt, ))"
34,0.29526,"((nsubj, com,nom,sg), (obl, all,com,sg))"
43,0.284809,"((nsubj, com,nom,sg), (compound:prt, ), (obl, ..."
0,0.245751,"((advmod, ))"
15,0.238678,"((compound:prt, ), (advmod, ))"
8,0.194342,"((obl, ad,com,sg))"


Unnamed: 0,"(advcl, )","(advcl, abes,aux,ps,sup)","(advcl, abes,main,ps,sup)","(advcl, abes,mod,ps,sup)","(advcl, ad,com,sg)","(advcl, ad,nominal,sg)","(advcl, ad,pl)","(advcl, ad,sg)","(advcl, af,aux,cond,past,ps)","(advcl, af,aux,cond,pres,ps)",...,"(xcomp, aux,inf)","(xcomp, com,sg,tr)","(xcomp, ill,main,ps,sup)","(xcomp, ill,mod,ps,sup)","(xcomp, inf,main)","(xcomp, inf,mod)","(xcomp, main,partic,past,ps)","(xcomp, mod,partic,past,ps)","(xcomp, pos)","(xcomp, pos,sg,tr)"
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9468,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9469,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9470,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9471,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### ärkama  **deprel** ja **lemma** põhjal, filtreeritakse välja deprel = 'punct', 'cc'

In [6]:

# Fetch the transactions as a list of lists of dicts, leaving out the
# 'punct' and 'cc' dependency relations.
transactions = v30.get_transactions(
    verb='ärkama',
    verb_compound='',
    columns=['deprel', 'lemma'],
    skip_deprels=['punct', 'cc'],
)
print('transactions')
display(transactions[:3])

# Convert the list of lists of dicts into the shape the Apriori algorithm
# needs: a list of lists of tuples.
dataset = v30.dict_to_apriori(transactions)
print('dataset')
display(dataset[:3])

# Run the Apriori algorithm; the result is displayed on screen.
v30.apriori(
    dataset,
    min_support=0.05,
)

transactions


[[{'deprel': 'mark', 'lemma': 'kui'},
  {'deprel': 'nsubj', 'lemma': 'mina'},
  {'deprel': 'obl', 'lemma': 'õlg'},
  {'deprel': 'conj', 'lemma': 'ütlema'}],
 [{'deprel': 'mark', 'lemma': 'kui'},
  {'deprel': 'nsubj', 'lemma': 'tema'},
  {'deprel': 'obl', 'lemma': 'hommik'},
  {'deprel': 'conj', 'lemma': 'küsima'}],
 [{'deprel': 'mark', 'lemma': 'kus'},
  {'deprel': 'nsubj', 'lemma': 'mina'},
  {'deprel': 'conj', 'lemma': 'mõistma'}]]

dataset


[[('mark', 'kui'), ('nsubj', 'mina'), ('obl', 'õlg'), ('conj', 'ütlema')],
 [('mark', 'kui'), ('nsubj', 'tema'), ('obl', 'hommik'), ('conj', 'küsima')],
 [('mark', 'kus'), ('nsubj', 'mina'), ('conj', 'mõistma')]]

min_support: 0.05


Unnamed: 0,support,itemsets
3,0.1213,"((mark, kui))"
7,0.118051,"((obl, hommik))"
6,0.076895,"((obl, elu))"
8,0.076895,"((obl, kell))"
5,0.075632,"((nsubj, tema))"
4,0.07509,"((nsubj, mina))"
2,0.066245,"((mark, et))"
1,0.05722,"((aux, olema))"
0,0.05704,"((advmod, siis))"


Unnamed: 0,"(acl, muutma)","(acl, surema)","(acl:relcl, võitma)","(advcl, Mõõdukas)","(advcl, Perot)","(advcl, Saaremaa)","(advcl, abielluma)","(advcl, aeg)","(advcl, aitama)","(advcl, ajama)",...,"(xcomp, viril)","(xcomp, voodi)","(xcomp, väike)","(xcomp, väljapuhanud)","(xcomp, väljendama)","(xcomp, väsinud)","(xcomp, õhtusöök)","(xcomp, õnn)","(xcomp, õnnelik)","(xcomp, õudus)"
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5535,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5536,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5537,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5538,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Close the database connection that V30.__init__ opened (stored on the private `_conn` attribute).
v30._conn.close()