### **Data and LFs used here are for illustrative purposes only**
These labeling functions (LFs) were not created with any target in mind, so the outputs are *not interpretable* as probabilities. Replace them with your own data and LFs.

In [1]:
import pandas as pd

from utils import load_toy_data, load_lf_yaml, lf_from_dict, find_maxent_ab
from levi import LEVI, VotesData

In [2]:
# (1) Load the toy dataset (20 newsgroups) and preview its first rows.
data = load_toy_data()
data.head(n=5)

Unnamed: 0,text,header,category
0,I was wondering if anyone out there could enl...,From: lerxst@wam.umd.edu (where's my thing)\nS...,rec.autos
1,A fair number of brave souls who upgraded thei...,From: guykuo@carson.u.washington.edu (Guy Kuo)...,comp.sys.mac.hardware
2,"well folks, my mac plus finally gave up the gh...",From: twillis@ec.ecn.purdue.edu (Thomas E Will...,comp.sys.mac.hardware
3,Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...,From: jgreen@amber (Joe Green)\nSubject: Re: W...,comp.graphics
4,"From article <C5owCB.n3p@world.std.com>, by to...",From: jcm@head-cfa.harvard.edu (Jonathan McDow...,sci.space


In [3]:
# (2) Read the sample LF definitions (as dicts) and let each one vote on the data.
LFs = load_lf_yaml('sample_LFs.yaml')
votes_data = VotesData(data)

# apply() needs a LabelingFunction object, so each dict is converted first.
for lf_spec in LFs:
    votes_data.apply(lf_from_dict(lf_spec))

# Preview the collected votes as a dataframe.
votes_data.df.head()

Unnamed: 0,[+] includes-quote,[+] edu-not-sci,[+] is-reply,[-] includes-phone-number,[-] is-rec-or-talk,[-] more-than-30-lines
0,0,1,0,0,1,0
1,0,1,0,0,0,0
2,1,1,0,0,0,1
3,1,0,1,0,0,0
4,1,0,1,0,0,0


In [4]:
# (3) Default priors via maxent (unrealistic when the sample_LFs are used).
# Each find_maxent_ab call returns an (a, b) pair for the given
# max_plausible_value; the pairs are then gathered under the keys LEVI is given.
rho_a, rho_b = find_maxent_ab(max_plausible_value=0.5)
pos_tpr_a, pos_tpr_b = find_maxent_ab(max_plausible_value=0.5)
pos_fpr_a, pos_fpr_b = find_maxent_ab(max_plausible_value=0.1)

priors = dict(
    rho_a=rho_a,
    rho_b=rho_b,
    pos_tpr_a=pos_tpr_a,
    pos_tpr_b=pos_tpr_b,
    pos_fpr_a=pos_fpr_a,
    pos_fpr_b=pos_fpr_b,
)

In [5]:
# (4) Build LEVI from the LF definitions and the priors, then score the votes.
levi = LEVI(LFs=LFs, ab_priors=priors)
p = levi.predict_proba(votes_data.df)

In [6]:
# (5) Put the original data, the LF votes and the LEVI output side by side.
results = pd.concat([votes_data.data, votes_data.df, p], axis="columns")

# Show one reproducibly sampled row of the merged table.
results.sample(n=1, random_state=42)

Unnamed: 0,text,header,category,[+] includes-quote,[+] edu-not-sci,[+] is-reply,[-] includes-phone-number,[-] is-rec-or-talk,[-] more-than-30-lines,p
7492,\nCould someone please post any info on these ...,From: rrn@po.CWRU.Edu (Robert R. Novitskey)\nS...,comp.sys.mac.hardware,0,1,0,0,0,0,0.64919
