In [1]:
import csv

In [2]:
from os import listdir
from os.path import isfile, join
import xlsxwriter
from collections import Counter


In [3]:
# Directory holding one summary file per article; its file names define the page list.
mypath = './WikiHow Printer/WikiHowSummary/'
# Keep only regular ``.txt`` files: the ``[:-4]`` slice assumes exactly that suffix, and
# the page name is later used to reopen ``<name>.txt``. Anything else (hidden files,
# editor backups) would otherwise yield a mangled page name.
pages = [
    f[:-4]
    for f in listdir(mypath)
    if f.endswith('.txt') and isfile(join(mypath, f))
]

In [4]:
import pickle
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles that were produced locally / come from a trusted source.
# The context manager closes the handle that the bare open() call used to leak.
with open('./html_links_printer.pkl', 'rb') as links_file:
    links = pickle.load(links_file)

# Map the 4th '/'-separated component of each link to the full link.
# (presumably the article slug of an ``https://host/<slug>`` URL -- confirm against the pickle)
name_to_link = {link.split('/')[3]: link for link in links}

In [8]:
from flashtext import KeywordProcessor
import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, sent_tokenize
# Shared lemmatizer instance; ents_to_dict() uses it to normalise entity names.
lemmatizer = WordNetLemmatizer() 

In [9]:
# English stop words, as a set for O(1) membership tests in ents_to_dict().
stop_words = set(stopwords.words('english'))

# Known verbs come from a pre-built pickle. Alternative kept for reference:
# verbs = {word for word, pos in brown.tagged_words() if pos.startswith('V')}
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('./verb_entities.pkl', 'rb') as verbs_file:  # closes the handle the bare open() leaked
    verbs = pickle.load(verbs_file)

In [11]:
# Count, over every full article, how often each token is POS-tagged as a verb
# (any tag starting with 'V'). ents_to_dict() later rejects entity names whose
# count here exceeds 2.
verbs2 = Counter()
for page1 in pages:
    # ``with`` closes each article file; the original leaked one handle per page.
    with open('./WikiHow Printer/WikiHow/' + page1 + '.txt', encoding="utf8") as article:
        text = article.read()
    # Tag sentence by sentence, exactly as before.
    for sent in sent_tokenize(text):
        verbs2.update(
            word
            for word, pos in nltk.pos_tag(word_tokenize(sent))
            if pos.startswith('V')
        )

In [12]:
def text_variations(txt):
    """Return the spelling variants of ``txt`` obtained by swapping separators.

    The stripped input is passed through a fixed sequence of rewrites between
    '-', '_' and ' ', and every intermediate spelling is collected.
    For example ``'V-cube_6'`` yields
    ``['V-cube_6', 'V-cube 6', 'V_cube 6', 'V_cube_6', 'V cube 6']``.

    Parameters
    ----------
    txt : str
        Entity name; surrounding whitespace is ignored.

    Returns
    -------
    list of str
        Distinct variants in first-seen order. A name without any separator
        yields a single-element list.
    """
    txt = txt.strip()
    var = []
    if '-' in txt:  # V-cube_6
        if '_' in txt:
            var.append(txt)
            txt = txt.replace('_', ' ')  # V-cube 6
        var.append(txt)
        txt = txt.replace('-', '_')  # V_cube 6
    if ' ' in txt:
        var.append(txt)
        txt = txt.replace(' ', '_')  # V_cube_6
    if '_' in txt:
        var.append(txt)
        txt = txt.replace('_', ' ')  # V cube 6
    var.append(txt)
    # The space -> underscore -> space round trip re-adds the starting spelling
    # (e.g. 'a b' produced ['a b', 'a_b', 'a b']); drop repeats, keep order.
    return list(dict.fromkeys(var))

def ents_to_dict(ents_list):
    """Build a ``{entity: [spelling variants]}`` dict from raw entity names.

    Each name is stripped, cut at its first '(' (dropping the parenthetical and
    any whitespace before it) and lemmatized with the module-level
    ``lemmatizer``. A name is skipped when its lemmatized form

    * is shorter than 4 characters,
    * is in ``stop_words`` or ``verbs``, or
    * has a count greater than 2 in ``verbs2``.

    Parameters
    ----------
    ents_list : iterable of str
        Raw entity names.

    Returns
    -------
    dict
        Lemmatized name -> ``text_variations(name)``; this is the dict passed
        to ``KeywordProcessor.add_keywords_from_dict``. When several raw names
        share a lemmatized form, the last one processed sets the entry.
    """
    ent_dict = {}
    for ent in ents_list:
        ent = ent.strip()
        if '(' in ent:
            # rstrip() instead of the former [:-1]: that slice assumed exactly one
            # space before '(' and ate a real character otherwise
            # ("printer(x)" -> "printe").
            ent = ent.split('(')[0].rstrip()
        ent = lemmatizer.lemmatize(ent)
        if len(ent) < 4 or ent in stop_words or ent in verbs or verbs2[ent] > 2:
            continue
        ent_dict[ent] = text_variations(ent)
    return ent_dict

In [13]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw entity name lists, one per source.
te_ents = _load_pickle('./tecknowbase_entities.pkl')
te_n_ents = _load_pickle('./tecknowbase_noisy_entities.pkl')
db_ents = _load_pickle('./dbpedia_entities.pkl')

# Filtered {entity: variants} dictionaries built from those lists.
te_dict = ents_to_dict(te_ents)
te_n_dict = ents_to_dict(te_n_ents)
db_dict = ents_to_dict(db_ents)

In [14]:
def _make_processor(keyword_dict):
    """Return a fresh KeywordProcessor loaded with ``keyword_dict``."""
    processor = KeywordProcessor()
    processor.add_keywords_from_dict(keyword_dict)
    return processor


# One keyword matcher per entity source.
te_proc = _make_processor(te_dict)
te_n_proc = _make_processor(te_n_dict)
db_proc = _make_processor(db_dict)

In [2]:
def find_entities(proc, page, freq=3):
    """Return the keywords that ``proc`` finds at least ``freq`` times in ``page``.

    Parameters
    ----------
    proc : object
        Anything exposing ``extract_keywords(text)`` that returns an iterable
        of hashable keywords.
    page : str
        Text to scan.
    freq : int, optional
        Minimum number of occurrences for a keyword to be kept (default 3).

    Returns
    -------
    set
        Keywords whose occurrence count is ``>= freq``.
    """
    tally = Counter(proc.extract_keywords(page))
    return {keyword for keyword, hits in tally.items() if hits >= freq}

def find_linewise_entities(proc, page, freq=2):
    """Return the keywords that occur in at least ``freq`` distinct steps of ``page``.

    ``page`` is split on blank lines (``'\\n\\n'``) and the last two chunks are
    discarded; each remaining chunk counts a keyword at most once, however
    often it appears inside that chunk.

    Parameters
    ----------
    proc : object
        Anything exposing ``extract_keywords(text)``.
    page : str
        Full text to split into steps.
    freq : int, optional
        Minimum number of steps a keyword must appear in (default 2).

    Returns
    -------
    set
        Keywords found in ``>= freq`` steps. Input with fewer than three
        chunks always gives an empty set, because no step is left to scan.
    """
    steps_containing = Counter()
    for chunk in page.split('\n\n')[:-2]:
        # set(): one vote per step, regardless of repeats within the step
        steps_containing.update(set(proc.extract_keywords(chunk)))
    return {name for name, n_steps in steps_containing.items() if n_steps >= freq}

def merge(list1, list2):
    """Combine two entity collections, preferring the longer of overlapping names.

    Starting from a copy of ``list1``, each entity of ``list2`` is compared
    with every entity of ``list1`` by substring test:

    * where the ``list1`` entity is contained in the ``list2`` entity, that
      slot is overwritten with the ``list2`` entity (a later ``list2`` entity
      may overwrite the same slot again);
    * the ``list2`` entity is appended unless it is contained in some
      ``list1`` entity that does not itself occur inside it.

    Parameters
    ----------
    list1, list2 : iterable of str
        Entity names. ``list1`` is iterated more than once, so it must be a
        re-iterable collection.

    Returns
    -------
    set of str
        The merged entities.
    """
    merged = list(list1)
    for candidate in list2:
        covered = False
        for slot, existing in enumerate(list1):
            if existing in candidate:
                merged[slot] = candidate
            elif candidate in existing:
                covered = True
        if not covered:
            merged.append(candidate)
    return set(merged)

In [16]:
def page_to_steps(page):
    """Split ``page`` on blank lines and return every chunk except the last two.

    Parameters
    ----------
    page : str
        Full article text with steps separated by ``'\\n\\n'``.

    Returns
    -------
    list of str
        The step chunks; empty when ``page`` has fewer than three chunks.
    """
    chunks = page.split('\n\n')
    return chunks[:-2]

In [20]:
# Output workbook and its single sheet; rows are written by the export loop and
# the workbook is closed in the notebook's final cell via workbook.close().
workbook = xlsxwriter.Workbook('WikiHow_printer_lines_entities.xlsx')
worksheet = workbook.add_worksheet()

In [23]:
# Column headers for sheet row 0. The per-step ``row`` tuple built in the loop
# below must list its values in exactly this order.
Head = [
    'Task Number', 'Step Number', 'Task Name', 'URL', 'Step Description',
    'Step-Ent-TeKnowBase', 'Step-Ent-TeKnowBase-Noisy', 'Step-Ent-DBPedia',
    'Step-Ent-Merged', 'Step-Ent-Stepwise',
    'Summary', 'Description',
    'Page-Entities-TeKnowBase', 'Page-Entities-TeKnowBase-Noisy',
    'Page-Entities-DBPedia', 'Page-Entities-merged', 'Page-Entities-Stepwise',
]
for n, h in enumerate(Head):
    worksheet.write(0, n, h)

n = 0  # step rows written so far; the next row goes to sheet row n + 1
for m, page in enumerate(pages):
    print(page)
    # ``with`` closes both files; the original left two handles open per page.
    with open('./WikiHow Printer/WikiHow/' + page + '.txt', 'r', encoding='utf8') as g:
        text = g.read()
    with open('./WikiHow Printer/WikiHowSummary/' + page + '.txt', encoding='utf8') as g1:
        summary = g1.read()

    # Page-level entities (default frequency thresholds). Each extraction runs
    # once and is reused for both its own column and the merged column.
    page_te = find_entities(te_proc, text)
    page_db = find_entities(db_proc, text)
    page_te_ent = ", ".join(page_te)
    page_te_n_ent = ", ".join(find_entities(te_n_proc, text))
    page_db_ent = ", ".join(page_db)
    page_merged_ent = ", ".join(merge(page_te, page_db))
    page_merged_linewise = ", ".join(
        merge(find_linewise_entities(te_proc, text), find_linewise_entities(db_proc, text))
    )
    steps = page_to_steps(text)

    for i, step in enumerate(steps):
        # Step-level entities: threshold 0 keeps every keyword found in the step.
        step_te = find_entities(te_proc, step, 0)
        step_db = find_entities(db_proc, step, 0)
        step_te_ent = ", ".join(step_te)
        step_te_n_ent = ", ".join(find_entities(te_n_proc, step, 0))
        step_db_ent = ", ".join(step_db)
        step_merged_ent = ", ".join(merge(step_te, step_db))
        # NOTE(review): find_linewise_entities drops the last two '\n\n'-separated
        # chunks of its input, and a single step contains no '\n\n', so this value
        # is always the empty string. Behaviour kept as-is -- confirm the intent
        # of the 'Step-Ent-Stepwise' column.
        step_merged_linewise = ", ".join(
            merge(find_linewise_entities(te_proc, step, 0), find_linewise_entities(db_proc, step, 0))
        )

        row = (
            m + 1, i + 1, page, name_to_link[page], step,
            step_te_ent, step_te_n_ent, step_db_ent, step_merged_ent, step_merged_linewise,
            summary, text,
            page_te_ent, page_te_n_ent, page_db_ent, page_merged_ent, page_merged_linewise,
        )
        for col, value in enumerate(row):
            worksheet.write(n + 1, col, value)
        n += 1

    # Progress marker: running row total after every 20th page.
    if m % 20 == 0:
        print(n)

3D-Print-an-Object
17
3D-Print-with-CURA-on-Creality%27s-Ender-3
Activate-Airprint
Add-a-Clone-Printer-for-Black-Only-Draft-Print-Jobs
Add-a-Laser-Printer-to-a-Home-Network
Add-a-Laser-Printer-to-an-Office-Network
Add-a-Local-Printer-in-Linux
Add-a-Network-Printer-in-Windows-XP
Add-a-Printer-on-HP-SureSupply-on-iOS
Add-a-Printer-to-Time-Capsule
Add-a-Printer
Add-an-HP-Printer-to-a-Wireless-Network
Adjust-the-Print-Quality-of-a-Laser-Printer
Align-Your-HP-Printer
Buy-a-Printer
Buy-Cheap-Toner
Buy-Generic-Toner
Buy-Recycled-Toner-Cartridges
Cancel-a-Print-Job-on-PC-or-Mac
Change-Black-Ink-to-Color-on-a-Dell-940-Printer
Check-How-Much-Ink-is-Left-in-an-Inkjet-Printer
277
Check-if-Your-Printer-Has-Run-out-of-Ink
Check-Printer-Ink-Levels-in-Windows
Choose-a-Printer
Choose-an-All-in-one-Printer-for-a-Home-Office
Choose-an-Inkjet-Printer-with-the-Highest-Quality-Photo-Printing
Choose-Multifunction-Printers
Clean-a-Brother-Printer
Clean-a-Laser-Printer
Clean-a-Printer
Clean-an-Inkjet-Printer%2

In [24]:
# Close the workbook created earlier in the notebook once all rows have been written.
workbook.close()