In [30]:
import sqlite3
import pandas as pd

### Phrase counts

In [72]:
# Open the SQLite database that holds the tagged_phrases table and create a
# cursor for the queries in the following cells. The connection stays open
# until a later cell calls con.close().
con = sqlite3.connect("tagged_noun_phrases.db")
cur = con.cursor()

In [65]:
# Occurrence count per distinct phrase text, most frequent first.
# Cursor.execute() returns the cursor itself, so the fetch can be chained.
rows = cur.execute("SELECT raw_text, count(*) FROM tagged_phrases GROUP BY raw_text ORDER BY count(*) DESC").fetchall()

In [73]:
con.close()

In [67]:
# Transpose the (raw_text, count) result rows into column lists for pandas.
data = {
    'count': [occurrences for _text, occurrences in rows],
    'phrase': [text for text, _occurrences in rows],
}

In [69]:
# Column-oriented dict of equal-length lists -> one DataFrame column per key.
df = pd.DataFrame(data)

In [70]:
df

Unnamed: 0,count,phrase
0,32,Euroopa tunnetamine kodu
1,32,Euroopa tunnetamine
2,21,ca 5%
3,19,Euroopa Liit
4,19,Balti riik
...,...,...
2166,1,12aastane Alan
2167,1,12. juuni
2168,1,11. juuni
2169,1,10. minut


In [71]:
# Persist the phrase counts as table 'phrases_counts' in the same database.
# to_sql only needs the connection, so no cursor is created here.
con = sqlite3.connect('tagged_noun_phrases.db')
try:
    # if_exists='replace' overwrites any existing table of that name;
    # index=False keeps the DataFrame index out of the table.
    df.to_sql('phrases_counts', con, if_exists='replace', index=False)
finally:
    # Release the database handle even if the write fails.
    con.close()

### Phrases with extraction and NER patterns

In [48]:
def get_extraction_pattern(con):
    """Return every extraction pattern in tagged_phrases with its row count.

    Args:
        con: an open sqlite3 connection to a database containing the
            tagged_phrases table.

    Returns:
        A list of (extraction_pattern, count) tuples, highest count first.
    """
    sql = "SELECT extraction_pattern, count(*) FROM tagged_phrases GROUP BY extraction_pattern ORDER BY count(*) DESC"
    # Use a fresh cursor per call; execute() hands the cursor back for chaining.
    return con.cursor().execute(sql).fetchall()

def get_patterns_ner(con):
    """Return every NER pattern in tagged_phrases with its row count.

    Args:
        con: an open sqlite3 connection to a database containing the
            tagged_phrases table.

    Returns:
        A list of (ner_pattern, count) tuples, highest count first.
    """
    # Bind the cursor locally: previously the result of con.cursor() was
    # discarded and the query ran on whatever global `cur` happened to exist.
    cur = con.cursor()
    cur.execute("SELECT ner_pattern, count(*) FROM tagged_phrases GROUP BY ner_pattern ORDER BY count(*) DESC")

    return cur.fetchall()

def get_phrases(con):
    """Return every distinct phrase text in tagged_phrases with its row count.

    Args:
        con: an open sqlite3 connection to a database containing the
            tagged_phrases table.

    Returns:
        A list of (raw_text, count) tuples, highest count first.
    """
    # Bind the cursor locally: previously the result of con.cursor() was
    # discarded and the query ran on whatever global `cur` happened to exist.
    cur = con.cursor()
    cur.execute("SELECT raw_text, count(*) FROM tagged_phrases GROUP BY raw_text ORDER BY count(*) DESC")

    return cur.fetchall()

In [49]:
# Collect phrase frequencies for every (extraction pattern, NER pattern) pair.
# Pairs are visited in the order the helper queries return them (most frequent
# pattern first); pairs with no matching rows contribute nothing.
con = sqlite3.connect('tagged_noun_phrases.db')
cur = con.cursor()

extraction_patterns = get_extraction_pattern(con)
patterns_ner = get_patterns_ner(con)
data = {'count': [], 'extraction_pattern': [], 'ner_pattern': [], 'phrase': []}

phrase_sql = "SELECT raw_text, count(*) FROM tagged_phrases WHERE extraction_pattern = ? AND ner_pattern = ? GROUP BY raw_text ORDER BY count(*) DESC"

for ex_name, _ex_total in extraction_patterns:
    for ner_name, _ner_total in patterns_ner:
        cur.execute(phrase_sql, (ex_name, ner_name))

        # One output row per phrase seen with this exact pattern pair.
        for raw_text, occurrences in cur.fetchall():
            data['count'].append(occurrences)
            data['extraction_pattern'].append(ex_name)
            data['ner_pattern'].append(ner_name)
            data['phrase'].append(raw_text)

In [50]:
con.close()

In [51]:
# Column-oriented dict of equal-length lists -> one DataFrame column per key.
df2 = pd.DataFrame(data)

In [52]:
df2

Unnamed: 0,count,extraction_pattern,ner_pattern,phrase
0,12,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,häälteenamus konventsioon
1,11,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,suhe juht
2,11,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,eestlane naasmine
3,5,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,suu maigutamine
4,5,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,riigikontroll kontroll
...,...,...,...,...
2177,1,"1 2 amod,2 0 root,H-H",OTHER-OTHER,Vaba Euroopa
2178,1,"1 2 amod,2 0 root,H-H",OTHER-OTHER,Rahvusvaheline Tööorganisatsioon
2179,2,"1 0 root,2 1 flat,3 1 flat,Y-Y-S",OTHER-OTHER-OTHER,is in the
2180,1,"1 0 root,2 1 flat,3 1 flat,S-S-Y",OTHER-OTHER-OTHER,per ardua ad


In [54]:
# Number of distinct phrases in df2. If this is smaller than the number of
# rows in df2, some phrases occur with more than one extraction and/or NER
# pattern combination.
len(df2['phrase'].unique())

2171

In [55]:
# Reorder rows by the 'phrase' column (ascending, the sort_values default) so
# that rows for the same phrase sit next to each other. The name df2 is
# rebound to the sorted frame; the original index labels are kept.
df2 = df2.sort_values('phrase')

In [62]:
df2

Unnamed: 0,count,extraction_pattern,ner_pattern,phrase
1526,1,"1 2 amod,2 0 root,A-S",OTHER-OTHER,000kroonine alghind
1311,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,10. minut
1310,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,11. juuni
1309,1,"1 2 amod,2 0 root,O-S",OTHER-OTHER,12. juuni
1697,1,"1 2 amod,2 0 root,A-H",OTHER-OTHER,12aastane Alan
...,...,...,...,...
578,1,"1 2 nmod,2 0 root,Y-S",OTHER-OTHER,üx valgusfoor
45,1,"1 2 nmod,2 0 root,S-S",OTHER-OTHER,üür hind
1803,1,"1 2 nmod,2 0 root,3 2 nmod,S-S-H",OTHER-OTHER-OTHER,üür hind Draamateater
2127,1,"1 2 nmod,2 0 root,H-N",LOC-OTHER,Šveits kümme


In [63]:
# Persist the phrase/pattern table as 'phrases_patterns' in the same database.
# to_sql only needs the connection, so no cursor is created here.
con = sqlite3.connect('tagged_noun_phrases.db')
try:
    # if_exists='replace' overwrites any existing table of that name;
    # index=False keeps the DataFrame index out of the table.
    df2.to_sql('phrases_patterns', con, if_exists='replace', index=False)
finally:
    # Release the database handle even if the write fails.
    con.close()

### Phrases with pattern context

In [82]:
# For each phrase, break its occurrences down by extraction pattern and, within
# each extraction pattern, by NER pattern. One output row is produced per
# (phrase, extraction pattern, NER pattern) combination.
con = sqlite3.connect('tagged_noun_phrases.db')
cur = con.cursor()

phrases = get_phrases(con)
data = {'count': [], 'phrase': [], 'extraction_pattern': [], 'pat_%': [], 'ner_pattern': [], 'ner_%': []}

for raw_text, phrase_total in phrases:
    cur.execute("SELECT extraction_pattern, count(*) FROM tagged_phrases WHERE raw_text = ? GROUP BY extraction_pattern ORDER BY count(*) DESC", (raw_text,))

    # fetchall() materialises the outer result before the cursor is reused below.
    for ex_name, ex_total in cur.fetchall():
        cur.execute("SELECT ner_pattern, count(*) FROM tagged_phrases WHERE raw_text = ? AND extraction_pattern = ? GROUP BY ner_pattern ORDER BY count(*) DESC", (raw_text, ex_name))

        for ner_name, ner_total in cur.fetchall():
            data['count'].append(phrase_total)
            data['phrase'].append(raw_text)
            data['extraction_pattern'].append(ex_name)
            # Fraction of this phrase's occurrences that use this extraction pattern.
            data['pat_%'].append(ex_total / phrase_total)
            data['ner_pattern'].append(ner_name)
            # Fraction of this (phrase, extraction pattern) group that has this NER pattern.
            data['ner_%'].append(ner_total / ex_total)

In [83]:
con.close()

In [84]:
# Column-oriented dict of equal-length lists -> one DataFrame column per key.
df3 = pd.DataFrame(data)

In [85]:
df3

Unnamed: 0,count,phrase,extraction_pattern,pat_%,ner_pattern,ner_%
0,32,Euroopa tunnetamine kodu,"1 2 nmod,2 0 root,3 2 nmod,H-S-S",1.0,LOC-OTHER-OTHER,1.0
1,32,Euroopa tunnetamine,"1 2 nmod,2 0 root,H-S",1.0,LOC-OTHER,1.0
2,21,ca 5%,"1 2 nmod,2 0 root,Y-N",1.0,OTHER-OTHER,1.0
3,19,Euroopa Liit,"1 2 nmod,2 0 root,H-H",1.0,OTHER-OTHER,1.0
4,19,Balti riik,"1 2 amod,2 0 root,H-S",1.0,OTHER-OTHER,1.0
...,...,...,...,...,...,...
2177,1,12aastane Alan,"1 2 amod,2 0 root,A-H",1.0,OTHER-OTHER,1.0
2178,1,12. juuni,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0
2179,1,11. juuni,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0
2180,1,10. minut,"1 2 amod,2 0 root,O-S",1.0,OTHER-OTHER,1.0


In [86]:
# Persist the phrase-context table as 'phrases_context' in the same database
# file. to_sql only needs the connection, so no cursor is created here.
con = sqlite3.connect('tagged_noun_phrases.db')
try:
    # if_exists='replace' overwrites any existing table of that name;
    # index=False keeps the DataFrame index out of the table.
    df3.to_sql('phrases_context', con, if_exists='replace', index=False)
finally:
    # Release the database handle even if the write fails.
    con.close()