In [1]:
import sqlite3
import pandas as pd

In [2]:
# Open the SQLite database file that contains the `tagged_phrases` table.
# NOTE: sqlite3.connect() silently creates an empty database if the file is missing,
# so a wrong working directory shows up later as "no such table", not here.
con = sqlite3.connect("tagged_noun_phrases2.db")
# Cursor used to run the aggregate query in the next cell.
cur = con.cursor()

In [3]:
# Count the occurrences of every distinct phrase text, most frequent first.
# Cursor.execute() hands back the cursor itself, so the fetch can be chained.
rows = cur.execute(
    "SELECT raw_text, count(*) FROM tagged_phrases GROUP BY raw_text ORDER BY count(*) DESC"
).fetchall()

In [4]:
# The query result was fully materialised with fetchall(), so the connection
# can be released before the data is processed further.
con.close()

In [5]:
# Pivot the (raw_text, count) result tuples into column-oriented lists so the
# dict can be handed straight to pandas.
data = {
    'count': [freq for text, freq in rows],
    'phrase': [text for text, freq in rows],
}

In [6]:
# A dict of equal-length lists maps one-to-one onto DataFrame columns.
df = pd.DataFrame(data)

In [7]:
# Show the phrase-frequency table (pandas elides the middle rows of a long frame).
df

Unnamed: 0,count,phrase
0,33,Euroopa tunnetamine kodu
1,33,Euroopa tunnetamine
2,21,ca 5%
3,19,Balti riik
4,14,Nõukogu Liit
...,...,...
2205,1,12. juuni
2206,1,10. oktoober
2207,1,10. jaanuar
2208,1,10. aprill


In [8]:
# Store the phrase-frequency table in the database as `phrases_counts`,
# replacing any copy left over from an earlier run.
con = sqlite3.connect('tagged_noun_phrases2.db')
try:
    # index=False: the positional row index carries no information worth storing.
    df.to_sql('phrases_counts', con, if_exists='replace', index=False)
finally:
    # Release the connection even if the write raises, so it is never leaked.
    con.close()

### Phrases with extraction and NER patterns

In [9]:
def get_extraction_pattern(con):
    """Return each extraction pattern together with its number of rows.

    Args:
        con: Open ``sqlite3`` connection to a database that has a
            ``tagged_phrases`` table.

    Returns:
        List of ``(extraction_pattern, count)`` tuples, ordered by
        descending count.
    """
    query = "SELECT extraction_pattern, count(*) FROM tagged_phrases GROUP BY extraction_pattern ORDER BY count(*) DESC"
    # execute() returns the cursor, so the whole lookup is a single expression.
    return con.cursor().execute(query).fetchall()

def get_patterns_ner(con):
    """Return each NER pattern together with its number of rows.

    Args:
        con: Open ``sqlite3`` connection to a database that has a
            ``tagged_phrases`` table.

    Returns:
        List of ``(ner_pattern, count)`` tuples, ordered by descending count.
    """
    # Create the cursor from `con` and keep it in a local name, so the query
    # always runs on the connection that was passed in and never on whatever
    # global `cur` happens to exist in the notebook.
    cur = con.cursor()
    cur.execute("SELECT ner_pattern, count(*) FROM tagged_phrases GROUP BY ner_pattern ORDER BY count(*) DESC")

    rows = cur.fetchall()
    return rows

def get_phrases(con):
    """Return each distinct phrase text together with its number of rows.

    Args:
        con: Open ``sqlite3`` connection to a database that has a
            ``tagged_phrases`` table.

    Returns:
        List of ``(raw_text, count)`` tuples, ordered by descending count.
    """
    # Create the cursor from `con` and keep it in a local name, so the query
    # always runs on the connection that was passed in and never on whatever
    # global `cur` happens to exist in the notebook.
    cur = con.cursor()
    cur.execute("SELECT raw_text, count(*) FROM tagged_phrases GROUP BY raw_text ORDER BY count(*) DESC")

    rows = cur.fetchall()
    return rows

In [10]:
con = sqlite3.connect('tagged_noun_phrases2.db')
cur = con.cursor()

# Column-oriented accumulator: the four lists are always appended to together.
data = {'count': [], 'extraction_pattern': [], 'ner_pattern': [], 'phrase': []}
extraction_patterns = get_extraction_pattern(con)
patterns_ner = get_patterns_ner(con)

# Probe every (extraction pattern, NER pattern) combination, most frequent
# patterns first. Combinations that never co-occur return no rows and therefore
# contribute nothing to `data`.
for pattern_text, pattern_total in extraction_patterns:
    for ner_text, ner_total in patterns_ner:
        cur.execute(
            "SELECT raw_text, count(*) FROM tagged_phrases WHERE extraction_pattern = ? AND ner_pattern = ? GROUP BY raw_text ORDER BY count(*) DESC",
            (pattern_text, ner_text),
        )
        # One output row per phrase seen with this exact pattern pair.
        for raw_text, freq in cur.fetchall():
            data['count'].append(freq)
            data['extraction_pattern'].append(pattern_text)
            data['ner_pattern'].append(ner_text)
            data['phrase'].append(raw_text)

In [11]:
# All query results have been copied into `data`; release the connection.
con.close()

In [12]:
# One row per (phrase, extraction pattern, NER pattern) combination.
df2 = pd.DataFrame(data)

In [13]:
# Show the phrase/pattern table in insertion order (most frequent pattern pairs first).
df2

Unnamed: 0,count,extraction_pattern,ner_pattern,phrase
0,12,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,riigikontroll kontroll
1,10,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,suhe juht
2,9,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,eestlane naasmine
3,7,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,häälteenamus konventsioon
4,5,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,politsei pressiesindaja
...,...,...,...,...
2216,2,"1 2 nmod,2 0 root,3 2 nmod,H-H-S",ORG-ORG-OTHER,Eesti Draamateater toetus
2217,2,"1 0 root,2 1 flat,3 1 flat,Y-Y-S",OTHER-OTHER-OTHER,is in the
2218,1,"1 0 root,2 1 flat,3 1 flat,S-S-Y",OTHER-OTHER-OTHER,per ardua ad
2219,1,"1 0 root,2 1 flat,3 1 flat,S-S-Y",OTHER-OTHER-OTHER,don worry be


In [14]:
# Fewer distinct phrases than rows means that some phrases occur with more than
# one extraction and/or NER pattern.
# dropna=False keeps a missing phrase in the tally, exactly like len(unique()).
df2['phrase'].nunique(dropna=False)

2210

In [15]:
# Order rows by phrase text so repeated phrases end up next to each other.
# This rebinds `df2`; the sorted frame is what later cells display and store.
# NOTE: strings compare by Unicode code point, not Estonian collation -- in the
# output below digits come first and 'Š...' sorts after 'ü...'.
df2 = df2.sort_values('phrase')

In [16]:
# Show the table again, now sorted by phrase (the original row index is kept).
df2

Unnamed: 0,count,extraction_pattern,ner_pattern,phrase
1335,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,01. majandusaasta
1334,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,10. aprill
1333,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,10. jaanuar
1332,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,10. oktoober
1331,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,12. juuni
...,...,...,...,...
52,1,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,üür hind
1856,1,"1 2 nmod,2 0 root,3 2 nmod,S-S-H",OTHER-OTHER-OTHER,üür hind Draamateater
842,1,"1 2 nmod,2 0 root,H-H",OTHER-OTHER,Šveits Zürich
2099,1,"1 2 nmod,2 0 root,H-N",LOC-OTHER,Šveits kümme


In [17]:
# Store the phrase/pattern table in the database as `phrases_patterns`,
# replacing any copy left over from an earlier run.
con = sqlite3.connect('tagged_noun_phrases2.db')
try:
    # index=False: the (shuffled) row index is an artefact of sorting, not data.
    df2.to_sql('phrases_patterns', con, if_exists='replace', index=False)
finally:
    # Release the connection even if the write raises, so it is never leaked.
    con.close()

### Phrases with NER context

In [26]:
con = sqlite3.connect('tagged_noun_phrases2.db')
cur = con.cursor()

# Column-oriented accumulator. Despite the '%' in their names, 'pat_%' and
# 'ner_%' hold fractions (1.0 means 100 %).
data = {'count': [], 'phrase': [], 'extraction_pattern': [], 'pat_%': [], 'ner_pattern': [], 'ner_%': []}
phrases = get_phrases(con)

for phrase_text, phrase_total in phrases:
    # Extraction patterns this phrase was found with, most frequent first.
    cur.execute(
        "SELECT extraction_pattern, count(*) FROM tagged_phrases WHERE raw_text = ? GROUP BY extraction_pattern ORDER BY count(*) DESC",
        (phrase_text,),
    )
    # fetchall() runs once, before the loop body reuses `cur` for the inner query.
    for ex_text, ex_total in cur.fetchall():
        # NER sequences seen for this phrase under this extraction pattern.
        cur.execute(
            "SELECT ner_pattern, count(*) FROM tagged_phrases WHERE raw_text = ? AND extraction_pattern = ? GROUP BY ner_pattern ORDER BY count(*) DESC",
            (phrase_text, ex_text),
        )
        for ner_text, ner_total in cur.fetchall():
            data['count'].append(phrase_total)
            data['phrase'].append(phrase_text)
            data['extraction_pattern'].append(ex_text)
            # share of the phrase's occurrences that use this extraction pattern
            data['pat_%'].append(ex_total / phrase_total)
            data['ner_pattern'].append(ner_text)
            # share of this extraction pattern's occurrences with this NER sequence
            data['ner_%'].append(ner_total / ex_total)

In [27]:
# All query results have been copied into `data`; release the connection.
con.close()

In [28]:
# One row per (phrase, extraction pattern, NER pattern) with its relative shares.
df3 = pd.DataFrame(data)

In [29]:
# Show the phrase context table; rows follow phrase frequency, most frequent first.
df3

Unnamed: 0,count,phrase,extraction_pattern,pat_%,ner_pattern,ner_%
0,33,Euroopa tunnetamine kodu,"1 2 nmod,2 0 root,3 2 nmod,H-S-S",1.0,LOC-OTHER-OTHER,1.0
1,33,Euroopa tunnetamine,"1 2 nmod,2 0 root,H-S",1.0,LOC-OTHER,1.0
2,21,ca 5%,"1 2 nmod,2 0 root,Y-N",1.0,OTHER-OTHER,1.0
3,19,Balti riik,"1 2 amod,2 0 root,H-S",1.0,OTHER-OTHER,1.0
4,14,Nõukogu Liit,"1 2 nmod,2 0 root,H-H",1.0,OTHER-OTHER,1.0
...,...,...,...,...,...,...
2216,1,12. juuni,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0
2217,1,10. oktoober,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0
2218,1,10. jaanuar,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0
2219,1,10. aprill,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0


In [30]:
# Store the phrase context table in the database file as `phrases_context`,
# replacing any copy left over from an earlier run.
con = sqlite3.connect('tagged_noun_phrases2.db')
try:
    # index=False: the positional row index carries no information worth storing.
    df3.to_sql('phrases_context', con, if_exists='replace', index=False)  # writes to file
finally:
    # Release the connection even if the write raises, so it is never leaked.
    con.close()