# Load raw files into data frames

In [1]:
import re
import numpy as np
import pandas as pd

In [77]:
def stripFN(lines):
    """Extract the source file name from each tregex header line.

    Every element of ``lines`` is searched for a backslash followed by a
    name made of upper-case letters, digits, ``-``, ``.`` and ``_`` and
    ending in ``.txt.stp``; the part before ``.txt.stp`` is collected.

    Parameters
    ----------
    lines : iterable of str
        Header lines containing a backslash-separated path.

    Returns
    -------
    list of str
        One extracted name per input line, in input order.

    Raises
    ------
    AttributeError
        If a line contains no match (``search`` returns ``None``).
    """
    name_re = re.compile(r'\\([A-Z0-9-._]*)\.txt\.stp')
    return [name_re.search(entry).group(1) for entry in lines]

In [7]:
# Load the LOB tregex results.
# The paths are raw strings: in a normal literal "\l" is an invalid escape
# sequence that only happens to be kept verbatim. The files are read inside
# `with` blocks so the handles are closed as soon as the lines are loaded.

# lobImpfs.txt is consumed as two-line records: a header line (passed to
# stripFN for the file name) followed by the matched verb form.
with open(r'tregex-results\lobImpfs.txt') as impf_file:
    impfRaw = [line.rstrip('\n') for line in impf_file]
fni = stripFN(impfRaw[::2])
impfs = pd.DataFrame(data={'vbp':impfRaw[1::2], 'fn':fni})

# lobProgs.txt is consumed as three-line records: header line, the -ing
# form, then the accompanying form of "be".
with open(r'tregex-results\lobProgs.txt') as prog_file:
    progRaw = [line.rstrip('\n') for line in prog_file]
fnp = stripFN(progRaw[::3])
progs = pd.DataFrame(data={'vbg':progRaw[1::3], 'beform':progRaw[2::3], 'fn':fnp})

In [78]:
# Load the FLOB tregex results. The files are read inside `with` blocks so
# the handles are closed as soon as the lines are loaded.

# flobImpfs.txt is consumed as two-line records: a header line (passed to
# stripFN for the file name) followed by the matched verb form.
with open(r"tregex-results\flobImpfs.txt") as impf_file:
    impfFLOBRaw = [line.rstrip('\n') for line in impf_file]
fniflob = stripFN(impfFLOBRaw[::2])
impfsflob = pd.DataFrame(data={'vbp':impfFLOBRaw[1::2], 'fn':fniflob})

# flobProgs.txt is consumed as three-line records: header line, the -ing
# form, then the accompanying form of "be".
with open(r'tregex-results\flobProgs.txt') as prog_file:
    progFLOBRaw = [line.rstrip('\n') for line in prog_file]
fnpflob = stripFN(progFLOBRaw[::3])
progsflob = pd.DataFrame(data={'vbg':progFLOBRaw[1::3], 'beform':progFLOBRaw[2::3], 'fn':fnpflob})

# Lemmatize

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused by every call to lmt().
wnl = WordNetLemmatizer()


def lmt(word):
    """Return the lemma of ``word`` as given by ``wnl.lemmatize`` with pos "v".

    Parameters
    ----------
    word : str
        The word form to lemmatize.

    Returns
    -------
    The value returned by ``WordNetLemmatizer.lemmatize(word, pos="v")``.
    """
    verb_lemma = wnl.lemmatize(word, pos="v")
    return verb_lemma

In [14]:
# Add a "lemma" column to both LOB frames by running lmt over the
# extracted verb forms.
impfs["lemma"] = impfs["vbp"].map(lmt)
progs["lemma"] = progs["vbg"].map(lmt)

Unnamed: 0,vbg,beform,fn,lemma
0,insisting,(VBZ is),CLOBTH_A.TXT,insist
1,boycotting,(VBZ is),CLOBTH_A.TXT,boycott
2,seeking,(VBP are),CLOBTH_A.TXT,seek
3,studying,(VBP are),CLOBTH_A.TXT,study
4,having,(VBZ is),CLOBTH_A.TXT,have


In [79]:
# FLOB parse: add a "lemma" column to both FLOB frames by running lmt over
# the extracted verb forms.
impfsflob["lemma"] = impfsflob["vbp"].map(lmt)
progsflob["lemma"] = progsflob["vbg"].map(lmt)

Unnamed: 0,vbp,fn,lemma
0,pledges,C-A01-TAG,pledge
1,have,C-A01-TAG,have
2,is,C-A01-TAG,be
3,win,C-A01-TAG,win
4,are,C-A01-TAG,be


In [21]:
# Rewrite clitic forms left in the LOB lemma column: 's / 're / 'm become
# "be" and 've becomes "have". Only exact matches are touched.
be_clitics = ["'s", "'re", "'m"]
impfs.loc[impfs["lemma"].isin(be_clitics), "lemma"] = "be"
impfs.loc[impfs["lemma"].eq("'ve"), "lemma"] = "have"

Unnamed: 0,vbp,fn,lemma
53,parliament's,CLOBTH_A.TXT,parliament's
120,Kaunda's,CLOBTH_A.TXT,Kaunda's
148,Tito's,CLOBTH_A.TXT,Tito's
305,Avon's,CLOBTH_A.TXT,Avon's
316,Godber's,CLOBTH_A.TXT,Godber's
759,corporation's,CLOBTH_A.TXT,corporation's
836,party's,CLOBTH_A.TXT,party's
906,l'Isle's,CLOBTH_A.TXT,l'Isle's
955,corporation's,CLOBTH_A.TXT,corporation's
1020,board's,CLOBTH_A.TXT,board's


In [80]:
# Rewrite clitic forms left in the FLOB lemma column: 's / 're / 'm become
# "be" and 've becomes "have". Only exact matches are touched.
be_clitics_flob = ["'s", "'re", "'m"]
impfsflob.loc[impfsflob["lemma"].isin(be_clitics_flob), "lemma"] = "be"
impfsflob.loc[impfsflob["lemma"].eq("'ve"), "lemma"] = "have"

In [51]:
# Collect the LOB lemmas that need a manual check: lemma values that occur
# in rows where the lemma equals the original form, minus any lemma value
# that also occurs in a row where the lemma differs from the form.
impf_same = set(impfs.loc[impfs.lemma == impfs.vbp, "lemma"])
impf_changed = set(impfs.loc[impfs.lemma != impfs.vbp, "lemma"])
checkIs = impf_same - impf_changed

prog_same = set(progs.loc[progs.lemma == progs.vbg, "lemma"])
prog_changed = set(progs.loc[progs.lemma != progs.vbg, "lemma"])
checkPs = prog_same - prog_changed

# Export both candidate lists, mirroring the FLOB cell (checkIs was computed
# but never written out). Values are sorted because set iteration order is
# not stable between kernel sessions, which would reshuffle the CSV rows.
pd.Series(sorted(checkIs), name="impflem").to_csv("checkIs.csv")
pd.Series(sorted(checkPs), name="proglem").to_csv("checkPs.csv")


In [86]:
# Collect the FLOB lemmas that need a manual check: lemma values that occur
# in rows where the lemma equals the original form, minus any lemma value
# that also occurs in a row where the lemma differs from the form.
impf_same_flob = set(impfsflob.loc[impfsflob.lemma == impfsflob.vbp, "lemma"])
impf_changed_flob = set(impfsflob.loc[impfsflob.lemma != impfsflob.vbp, "lemma"])
checkIsFLOB = impf_same_flob - impf_changed_flob

prog_same_flob = set(progsflob.loc[progsflob.lemma == progsflob.vbg, "lemma"])
prog_changed_flob = set(progsflob.loc[progsflob.lemma != progsflob.vbg, "lemma"])
checkPsFLOB = prog_same_flob - prog_changed_flob

# Values are sorted before export because set iteration order is not stable
# between kernel sessions, which would reshuffle the CSV rows on every run.
pd.Series(sorted(checkIsFLOB), name="impflem").to_csv("checkIsFLOB.csv")
pd.Series(sorted(checkPsFLOB), name="proglem").to_csv("checkPsFLOB.csv")

In [58]:
# Read the reviewed LOB lists. Only the second and third CSV columns are
# loaded, and they are labelled "orig" and "lemma".
checkPd = pd.read_csv("checkPs-done.csv", names=["orig", "lemma"], usecols=[1, 2])
checkId = pd.read_csv("checkIs-done.csv", names=["orig", "lemma"], usecols=[1, 2])

# Turn each two-column frame into an {orig: lemma} lookup; when an "orig"
# value repeats, the last row wins.
checkPDict = dict(checkPd.values.tolist())
checkIDict = dict(checkId.values.tolist())

In [65]:
# Apply the reviewed LOB corrections to the LOB frames. The lookups must be
# the LOB dictionaries (checkIDict / checkPDict, built from checkIs-done.csv
# and checkPs-done.csv); the FLOB dictionaries belong to the FLOB frames and
# are not defined until a later cell.
# Lemmas without an entry map to NaN, so fillna restores the existing lemma.
impfs["lemma"] = impfs["lemma"].map(checkIDict).fillna(impfs['lemma'])
progs["lemma"] = progs["lemma"].map(checkPDict).fillna(progs['lemma'])

In [89]:
# Read the reviewed FLOB lists. Only the second and third CSV columns are
# loaded, and they are labelled "orig" and "lemma".
checkPdFLOB = pd.read_csv("checkPsFLOB-done.csv", names=["orig", "lemma"], usecols=[1, 2])
checkIdFLOB = pd.read_csv("checkIsFLOB-done.csv", names=["orig", "lemma"], usecols=[1, 2])

# Turn each two-column frame into an {orig: lemma} lookup; when an "orig"
# value repeats, the last row wins.
checkPFLOBDict = dict(checkPdFLOB.values.tolist())
checkIFLOBDict = dict(checkIdFLOB.values.tolist())

# Replace lemmas that have a reviewed entry; lemmas without one map to NaN
# and fall back to their current value.
impf_reviewed_flob = impfsflob["lemma"].map(checkIFLOBDict)
impfsflob["lemma"] = impf_reviewed_flob.fillna(impfsflob['lemma'])
prog_reviewed_flob = progsflob["lemma"].map(checkPFLOBDict)
progsflob["lemma"] = prog_reviewed_flob.fillna(progsflob['lemma'])

Unnamed: 0,vbp,fn,lemma
0,pledges,C-A01-TAG,pledge
1,have,C-A01-TAG,have
2,is,C-A01-TAG,be
3,win,C-A01-TAG,win
4,are,C-A01-TAG,be
5,have,C-A01-TAG,have
6,plans,C-A01-TAG,plan
7,are,C-A01-TAG,be
8,are,C-A01-TAG,be
9,underlines,C-A01-TAG,underline


In [67]:
# Write the two LOB frames to CSV files in the working directory.
for lob_frame, lob_target in ((impfs, "lobImpf.csv"), (progs, "lobProg.csv")):
    lob_frame.to_csv(lob_target)

In [94]:
# Write the two FLOB frames to CSV files in the working directory.
for flob_frame, flob_target in ((impfsflob, "flobImpf.csv"), (progsflob, "flobProg.csv")):
    flob_frame.to_csv(flob_target)

In [124]:
# Ratio of FLOB progressive rows to FLOB `impfsflob` rows.
# Numerator: progsflob rows whose lemma is not "x" and whose be-form does
# not match "(VBD". Denominator: impfsflob rows whose lemma is not "x".
# NOTE(review): "x" looks like a marker set during the manual lemma check -
# confirm against the *-done.csv files.
# The pattern is a raw string so that "\(" reaches the regex engine as an
# escaped parenthesis instead of relying on an invalid string escape.
flob_progs_kept = progsflob.loc[
    (progsflob.lemma != "x")
    & (progsflob.beform.str.contains(r"\(VBD") == False)
]
flob_impfs_kept = impfsflob.loc[impfsflob.lemma != "x"]
len(flob_progs_kept) / len(flob_impfs_kept)

# LOB counterpart of the ratio above:
#   len(progs.loc[(progs.lemma != "x") & (progs.beform.str.contains(r"\(VBD") == False)]) / len(impfs.loc[impfs.lemma != "x"])
# LOB ratio without the "(VBD" filter:
#   len(progs.loc[progs.lemma != "x"]) / len(impfs.loc[impfs.lemma != "x"])

0.04841666457396851

In [121]:
# Count the FLOB progressive rows whose be-form matches "(VBD".
# The mask previously began with a dangling `&`, which is a SyntaxError.
# NOTE(review): a left-hand condition (possibly `progsflob.lemma != "x"`)
# may have been deleted - confirm against the recorded output.
# na=False keeps the meaning of the former `== True` comparison; the raw
# string passes "\(" to the regex engine as an escaped parenthesis.
len(progsflob.loc[progsflob.beform.str.contains(r"\(VBD", na=False)])

1962