In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from IPython.display import HTML, display

# Stretch the notebook container to 95% of the browser width.
display(HTML("<style>.container { width: 95% !important; }</style>"))
# Remove the column-width limit so long cell values are shown in full.
pd.set_option('display.max_colwidth', None)

In [None]:
from collections import defaultdict
from tqdm.autonotebook import tqdm
import sys
from pathlib import Path
from os.path import exists, getsize, join
import logging
import logging.config
# import time
# import random

In [None]:
# Logging setup: the root logger sends everything from DEBUG upwards both to
# the console stream and to wm2log.txt in the user's home directory.
homedir = Path.home()
wm2logconfig = dict(
    version=1,
    disable_existing_loggers=False,
    root=dict(
        handlers=['console', 'file_handler'],
        level='DEBUG',
    ),
    formatters=dict(
        default_formatter=dict(
            format='%(asctime)s %(levelname)s %(message)s',
            datefmt='%d.%m.%Y %H:%M:%S',
        ),
    ),
    # 'class' is a Python keyword, so the handler entries stay dict literals.
    handlers=dict(
        console={
            'class': 'logging.StreamHandler',
            'formatter': 'default_formatter',
            'level': 'DEBUG',
        },
        file_handler={
            'class': 'logging.FileHandler',
            'formatter': 'default_formatter',
            'filename': join(homedir, 'wm2log.txt'),
            'level': 'DEBUG',
        },
    ),
)

logging.config.dictConfig(wm2logconfig)
logger = logging.getLogger('wm2')
# Emit one message per level so both handlers can be checked right away.
logger.info('info log')
logger.debug('debug log')

In [None]:
from lib import corpus, dbutil

In [None]:
# Database file behind the `dbc` connection that the query cells pass to dbutil.
# Previously used files, kept for reference:
#   data/s24_c2.db, data/ns24_6.db, data/ns24_7.db
dbfile = 'data/ns24_freq2.db'
dbc = dbutil.DatabaseConnection(dbfile)

#### Errors (malformed query handling)

In [None]:
# NOTE(review): this query looks deliberately malformed (`foo = 0`, `len >   a`,
# `naat compound`, `len ~= 10`), presumably to check how dbutil reports
# query errors — confirm intent.
errquery1 = "nouncase in Ill,Gen and pos in PROPN,NOUN and foo = 0 and len >   a and naat compound and len ~= 10 and len<10"
# Three values are returned; only the first (the frame) is kept.
errdf1, _, _ = dbutil.get_frequency_dataframe(
    dbc,
    query=errquery1,
    grams=False,
    lemmas=False,
)
errdf1

#### Basic information queries.

In [None]:
# Basic lookups by form, lemma, part of speech and frequency.
#
# Exactly one query is active: the single uncommented assignment below.
# The commented lines are alternative example queries; to run one, swap it
# in for the active assignment instead of stacking assignments, where only
# the last one would silently win.
# bgquery1 = "pos = 'NOUN' and form = 'silmäsi' and frequency > 10"
# bgquery1 = "pos = NOUN"
# bgquery1 = "nouncase in ine,gen"
# bgquery1 = "pos = NOUN and len > 10"
# bgquery1 = "pos = VERB and frequency > 100"
# bgquery1 = "pos not in NOUN,ADJ"
# bgquery1 = "pos = NOUN and frequency > 100"
# bgquery1 = "form like 'tai%' and bigramfreq > 18000000"
# bgquery1 = "form = 'tai' and pos = 'CCONJ'"
# bgquery1 = "form = 'auto'"
# bgquery1 = "form = voit"
# bgquery1 = "fOrm = VOI"
# bgquery1 = "fOrm like ett%"
# bgquery1 = "end not in a"
# bgquery1 = "form like '%ssa'"
# bgquery1 = "end = ssa"
# bgquery1 = "form not like a%"
# bgquery1 = "start = a"
# bgquery1 = "form like 'auto%' and compound"
# bgquery1 = "lemma = 'voi'"
# bgquery1 = "lemma in voi,voida"
# bgquery1 = "lemma like voi%"
# bgquery1 = "lemma not in voi,voida"
# bgquery1 = 'form = ja'
# bgquery1 = "form = 'on' and pos in AUX,VERB"
# bgquery1 = 'form = on'
# bgquery1 = 'lemma = voida and form like voi%'
# bgquery1 = 'form = se'
bgquery1 = "lemma = autotalli"

# Call variant from an earlier revision (it referenced a connection name that
# is no longer defined here; verify `orderby` is still supported before use):
# bgdf1, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery1, grams=True, lemmas=False, orderby='w.form ASC')

# Three values are returned; only the first (the frame) is kept.
bgdf1, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery1, grams=True, lemmas=True)
# The frame is passed through add_relative_frequencies and rebound to the same name.
bgdf1 = dbutil.add_relative_frequencies(dbc, bgdf1)
bgdf1

#### Feature queries.

In [None]:
# Queries on morphological features (possessive, case, derivation, clitic).
#
# Exactly one query is active: the single uncommented assignment below.
# The commented lines are alternative example queries; to run one, swap it
# in for the active assignment instead of stacking assignments, where only
# the last one would silently win.
# bgquery2 = "posspers != '_'"
# bgquery2 = "posspers = 1"
# bgquery2 = "nouncase = 'Gen'"
# bgquery2 = "nouncase not in Ill,Gen"
# bgquery2 = "nouncase in Ill,Gen"
# bgquery2 = "nouncase in Ill,Gen and pos in PROPN,NOUN"
# bgquery2 = "nouncase in Ill,Gen and pos in PRON,PROPN and frequency > 1000"
# bgquery2 = "pos = NOUN and derivation in Inen,Lainen"
# bgquery2 = "pos in aux,verb"
# bgquery2 = "clitic not in Han,Ko"
# bgquery2 = "clitic in Han,Ko"
# bgquery2 = "clitic in Han,Ko and frequency > 10000"
# bgquery2 = "clitic in Pa"
# bgquery2 = "clitic = Han"
# bgquery2 = "clitic != Han"
# bgquery2 = "clitic != _"
# bgquery2 = "clitic = _"
# bgquery2 = "clitic in _"
# bgquery2 = "clitic not in Kin"
# bgquery2 = "clitic != Kin"
bgquery2 = "derivation != Inen"

# Call variant from an earlier revision (it referenced an undefined connection
# and the wrong query variable; verify `orderby` is still supported before use):
# bgdf2, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery2, grams=True, lemmas=False, orderby='w.form ASC')

# Three values are returned; only the first (the frame) is kept.
bgdf2, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery2, grams=False, lemmas=False)
bgdf2

#### Form start/middle/end queries.

In [None]:
# Queries on the start, middle and end of a form.
#
# Exactly one query is active: the single uncommented assignment below.
# The commented lines are alternative example queries; to run one, swap it
# in for the active assignment instead of stacking assignments, where only
# the last one would silently win.
# bgquery3 = "form like '%ssa'"
# bgquery3 = "form like 'aut%'"
# bgquery3 = "lemma = 'auto' and start = aut"
# bgquery3 = "start = aut"
# bgquery3 = "start != aut"
# bgquery3 = "start in auv,aus"
# bgquery3 = "middle = tta and lemma like 'v%'"
# bgquery3 = "middle = tta"
# bgquery3 = "middle = ta"
# bgquery3 = "middle in ta,sa"
# bgquery3 = "middle not in ta,sa"
# bgquery3 = "middle != tta"
# bgquery3 = "end != ssa"
# bgquery3 = "end = ssa"
# bgquery3 = "end in ssa"
# bgquery3 = "end not in ssa,sta"
bgquery3 = "end in ssa,ssä"

# Call variants from an earlier revision (they referenced an undefined
# connection and the wrong query variable; verify `orderby` is still supported):
# bgdf3, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery3, grams=True, lemmas=True)
# bgdf3, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery3, grams=True, lemmas=False, orderby='w.form ASC')

# Three values are returned; only the first (the frame) is kept.
bgdf3, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery3, grams=False, lemmas=True)
bgdf3

#### Lemma/form information queries.

In [None]:
# Queries on compound status, lengths and lemma-level values.
#
# Exactly one query is active: the single uncommented assignment below.
# The commented lines are alternative example queries; to run one, swap it
# in for the active assignment instead of stacking assignments, where only
# the last one would silently win.
# bgquery4 = "form like 'auto%' and compound"
# bgquery4 = "compound"
# bgquery4 = "pos = 'NOUN' and len = 5"
# bgquery4 = "lemmalen > 10"
# bgquery4 = "lemmafreq < 40"
# bgquery4 = "amblemma < 40"
bgquery4 = "not compound"

# Call variants from an earlier revision (they referenced a connection name
# that is no longer defined here; verify `orderby` is still supported):
# bgdf4, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery4, grams=True, lemmas=True)
# bgdf4, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery4, grams=True, lemmas=False, orderby='w.form ASC')

# Three values are returned; only the first (the frame) is kept.
bgdf4, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery4, grams=False, lemmas=True)
bgdf4

#### Frequency queries

In [None]:
# Queries on frequency and ambiguity values.
#
# Exactly one query is active: the single uncommented assignment below.
# The commented lines are alternative example queries; to run one, swap it
# in for the active assignment instead of stacking assignments, where only
# the last one would silently win.
# bgquery5 = "ambform < 0.99"
# bgquery5 = "form = 'silmäsi' and frequency > 10"
# bgquery5 = "form = voi and frequency > 100"
# bgquery5 = "form = 'silmäsi' and frequency > 10 and relfrequency > 2"
# bgquery5 = "ambform < x"
bgquery5 = "form = voi and frequency > 10 and amblemma < 0.99"

# Three values are returned; only the first (the frame) is kept.
bgdf5, _, _ = dbutil.get_frequency_dataframe(dbc, query=bgquery5, grams=True, lemmas=True)
bgdf5

#### Input files

In [None]:
# Build the query from a file instead of a query string: get_wordinput reads
# samples/formlist.txt and its result is passed as `query`.
filename = 'samples/formlist.txt'
wordinput = dbutil.get_wordinput(filename)
# Three values are returned; only the first (the frame) is kept.
formdf, _, _ = dbutil.get_frequency_dataframe(
    dbc,
    query=wordinput,
    grams=False,
    lemmas=True,
)
formdf

In [None]:
# Same file-based input, but fed to get_unword_bigrams.
# NOTE(review): presumably returns bigram data for strings that are not words
# in the database — confirm against dbutil.
filename = 'samples/nonwordlist.txt'
wordinput = dbutil.get_wordinput(filename)
nonworddf = dbutil.get_unword_bigrams(
    dbc,
    wordinput,
)
nonworddf

In [None]:
# NOTE(review): the variable name suggests samples/winform2.txt is a
# deliberately problematic input file — confirm what is broken about it.
# This cell rebinds `wordinput` and `formdf` set by earlier cells.
brokenfilename = 'samples/winform2.txt'
wordinput = dbutil.get_wordinput(brokenfilename)
# Three values are returned; only the first (the frame) is kept.
formdf, _, _ = dbutil.get_frequency_dataframe(
    dbc,
    query=wordinput,
    grams=False,
    lemmas=True,
)
formdf