In [1]:
import pandas as pd
import bz2
import csv

import os
import sys
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [7]:
# Parameters
# NOTE(review): the defaults below are absolute paths on one machine; override
# them when running the notebook elsewhere.

# Folder holding the input graph files; passed to configure_kgtk as input_graph_path.
input_path = "/Volumes/saggu-ssd/wikidata-dwd-v3"
# Folder for outputs; passed to configure_kgtk as output_path.
output_path = "/Volumes/saggu-ssd/wikidata-dwd-v3"
# Local KGTK checkout; passed to ConfigureKGTK as kgtk_path.
kgtk_path = "/Users/amandeep/Github/kgtk"

# Graph cache to reuse. When this stays None the notebook loads the input files
# into the cache itself (see the `ck.load_files_into_cache()` cell).
graph_cache_path = None


# Project name handed to configure_kgtk.
project_name = "dbpedia-abstracts"
# Comma-separated names of the graph files to use; split into a list in the next cell.
files = 'sitelinks'
# NOTE(review): `debug` is not referenced by any cell visible in this notebook.
debug=True

In [8]:
# Turn the comma-separated `files` parameter into a list of graph names.
# The isinstance guard keeps this cell idempotent: re-running it once `files`
# is already a list used to raise AttributeError (lists have no .split()).
# Surrounding whitespace is stripped so "a, b" yields ["a", "b"].
if isinstance(files, str):
    files = [name.strip() for name in files.split(",")]

In [9]:
# Create the KGTK notebook configuration for the requested graph files, then
# register the input folder, output folder, project name and (optional) graph
# cache with it.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name,
                 graph_cache_path=graph_cache_path)

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk-browser
KGTK dir: /Users/amandeep/Github/kgtk
Use-cases dir: /Users/amandeep/Github/kgtk/use-cases


In [10]:
# Show the variables set up by the configuration step; the shell cells further
# down reference $STORE and $OUT.
ck.print_env_variables()

KGTK_GRAPH_CACHE: /Volumes/saggu-ssd/wikidata-dwd-v3/dbpedia-abstracts/temp.dbpedia-abstracts/wikidata.sqlite3.db
USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases
STORE: /Volumes/saggu-ssd/wikidata-dwd-v3/dbpedia-abstracts/temp.dbpedia-abstracts/wikidata.sqlite3.db
KGTK_LABEL_FILE: /Volumes/saggu-ssd/wikidata-dwd-v3/labels.en.tsv.gz
kgtk: kgtk
TEMP: /Volumes/saggu-ssd/wikidata-dwd-v3/dbpedia-abstracts/temp.dbpedia-abstracts
EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples
kypher: kgtk query --graph-cache /Volumes/saggu-ssd/wikidata-dwd-v3/dbpedia-abstracts/temp.dbpedia-abstracts/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
OUT: /Volumes/saggu-ssd/wikidata-dwd-v3/dbpedia-abstracts
GRAPH: /Volumes/saggu-ssd/wikidata-dwd-v3
sitelinks: /Volumes/saggu-ssd/wikidata-dwd-v3/sitelinks.tsv.gz


In [11]:
# No pre-built graph cache was supplied, so load the configured input files
# into the cache now.
if graph_cache_path is None:
    ck.load_files_into_cache()

kgtk query --graph-cache /Volumes/saggu-ssd/wikidata-dwd-v3/dbpedia-abstracts/temp.dbpedia-abstracts/wikidata.sqlite3.db -i "/Volumes/saggu-ssd/wikidata-dwd-v3/sitelinks.tsv.gz" --as sitelinks  --limit 3
id	node1	label	node2	lang	rank	node2;wikidatatype
Q1-addl_wikipedia_sitelink-19e42a-0	Q1	addl_wikipedia_sitelink	http://enwikiquote.org/wiki/Universe	en		
Q1-addl_wikipedia_sitelink-1a91c7-0	Q1	addl_wikipedia_sitelink	http://itwikibooks.org/wiki/Universo	it		
Q1-addl_wikipedia_sitelink-1c8e7b-0	Q1	addl_wikipedia_sitelink	http://ptwikiquote.org/wiki/Universo	pt		


### Convert English (`en`) DBpedia resource URLs to English (`en`) Wikipedia URLs

In [14]:
def convert_db_to_wiki_url(db_url: str, kgtk_format=True):
    """Map a DBpedia resource URL onto the matching English Wikipedia URL.

    The leading 28 characters of ``db_url`` (the length of
    ``http://dbpedia.org/resource/``) are dropped and the remainder is appended
    to the Wikipedia article prefix. Slicing, rather than splitting on ``/``,
    keeps titles that themselves contain a slash intact.

    Args:
        db_url: DBpedia resource URL. No check is made that it really starts
            with the expected prefix.
        kgtk_format: when truthy (default) the result uses the ``http`` scheme,
            otherwise ``https``. (Presumably ``http`` matches the URL form used
            in the sitelinks graph; confirm.)

    Returns:
        The Wikipedia URL as a string.
    """
    title = db_url[len("http://dbpedia.org/resource/"):]
    if not kgtk_format:
        return f"https://en.wikipedia.org/wiki/{title}"
    return f"http://en.wikipedia.org/wiki/{title}"

### Parse a DBpedia abstracts ttl statement (triple) into a JSON-like dict

Sample ttl statement (subject, predicate, object literal)
```
<http://dbpedia.org/resource/!!!> <http://dbpedia.org/ontology/abstract> "!!! (/tʃ(ɪ)k.tʃ(ɪ)k.tʃ(ɪ)k/ ch(i)k-ch(i)k-ch(i)k), also known as Chk Chk Chk, is an American rock band from Sacramento, California, formed in 1996 by lead singer Nic Offer. Members of !!! came from other local bands such as the Yah Mos, Black Liquorice and Pope Smashers. They are currently based in New York City. The band's eighth album, Wallop, was released in August 2019."@en .
```

In [16]:
def parse_dbpedia_abstract_ttl(db_string_s):
    """Parse one line of a DBpedia abstracts ``.ttl`` dump into an edge dict.

    The line is expected to have the shape
    ``<subject-uri> <predicate-uri> "abstract text"@en .``. The first
    ``<...>`` token is taken as the DBpedia resource and the quoted literal,
    when it is the third token, as the abstract.

    Backslash escape sequences inside the literal are copied through verbatim.
    An escaped quote does not end the literal and an escaped backslash is kept
    rather than dropped, so no characters of the abstract are lost.

    Abstracts shorter than five characters are printed as a diagnostic.

    Args:
        db_string_s: one raw line of the dump.

    Returns:
        ``{'node1': <wikipedia url>, 'label': 'abstract',
        'node2': '"<abstract>"@en'}``, or ``None`` when the line contains no
        subject URI (blank or comment lines, for instance).
    """
    db_uri = ''
    abstract = ''
    token_counter = 1      # 1 = subject, 2 = predicate, 3 = object literal
    chunk = []             # characters of the token currently being read
    in_uri = False
    in_literal = False
    escaped = False        # True right after a backslash inside the literal

    for c in db_string_s:
        if in_literal:
            if escaped:
                # Whatever follows a backslash belongs to the escape sequence,
                # including a double quote or a second backslash.
                chunk.append(c)
                escaped = False
            elif c == '\\':
                chunk.append(c)
                escaped = True
            elif c == '"':
                in_literal = False
                if token_counter == 3:
                    abstract = ''.join(chunk)
                chunk = []
                token_counter += 1
            else:
                chunk.append(c)
        elif in_uri:
            if c == '>':
                in_uri = False
                if token_counter == 1:
                    db_uri = ''.join(chunk)
                chunk = []
                token_counter += 1
            else:
                # Quotes inside <...> are part of the URI, not a literal start.
                chunk.append(c)
        elif c == '<':
            in_uri = True
        elif c == '"':
            in_literal = True

    if db_uri == '':
        return None

    wiki_url = convert_db_to_wiki_url(db_uri)
    if len(abstract) < 5:
        print(db_uri, abstract, wiki_url)

    return {'node1': wiki_url, 'label': 'abstract',
            'node2': f'"{abstract}"@en'}


In [17]:
def process(i_file, o_file):
    """Convert a bz2-compressed DBpedia abstracts dump into a KGTK edge file.

    Every line of ``i_file`` is run through ``parse_dbpedia_abstract_ttl``.
    Lines the parser rejects (it returns ``None``) are skipped instead of
    being placed in the table. The surviving edges are written to ``o_file``
    as tab-separated ``node1``/``label``/``node2`` rows without CSV quoting.
    pandas picks the compression from the file suffix, so a ``.gz`` name
    yields a gzipped file.

    A running line count is printed every 100000 lines, and the total number
    of lines read is printed at the end.

    Args:
        i_file: path of the ``.bz2`` input dump (read as UTF-8 text).
        o_file: path of the TSV file to write.
    """
    rows = []
    total = 0
    # The context manager closes the bz2 handle even if parsing raises, and the
    # explicit encoding avoids depending on the platform default.
    with bz2.open(i_file, mode='rt', encoding='utf-8') as f:
        for total, line in enumerate(f, start=1):
            if total % 100000 == 0:
                print(total)
            edge = parse_dbpedia_abstract_ttl(line)
            if edge is not None:
                rows.append(edge)
    print(f'Total lines: {total}')
    # Naming the columns keeps the header present even when no edge was parsed.
    df = pd.DataFrame(rows, columns=['node1', 'label', 'node2'])
    df.to_csv(o_file, sep='\t', index=False, quoting=csv.QUOTE_NONE)

### Download the DBpedia long abstracts file from https://databus.dbpedia.org/dbpedia/text/long-abstracts/

In [18]:
# bz2-compressed DBpedia dumps that `process` reads (paths are relative to the
# notebook's working directory).
input_dbpedia_ttl_file = 'dbpedia_data/long-abstracts_lang=en.ttl.bz2'
input_dbpedia_short_abstracts = 'dbpedia_data/short-abstracts_lang=en.ttl.bz2'
# Gzipped TSV edge files that `process` writes for the two dumps above.
output_file = 'dbpedia_data/long_abstracts.tsv.gz'
output_short_abstracts = 'dbpedia_data/short_abstracts.tsv.gz'

In [21]:
# Convert the long-abstracts dump into a gzipped TSV edge file.
process(input_dbpedia_ttl_file, output_file)

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
http://dbpedia.org/resource/Cher_Special_Gigs . http://en.wikipedia.org/wiki/Cher_Special_Gigs
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
http://dbpedia.org/resource/Jonathon_W._G._Wills | http://en.wikipedia.org/wiki/Jonathon_W._G._Wills
2900000
3000000
3100000
3200000
3300000
http://dbpedia.org/resource/List_of_World_War_II_firearms_of_Germany . http://en.wikipedia.org/wiki/List_of_World_War_II_firearms_of_Germany
http://dbpedia.org/resource/List_of_educational_institutions_in_Palakkad_district . http://en.wikipedia.org/wiki/List_of_educational_institutions_in_Palakkad_district
http://dbpedia.org/resource/List_of_royal_consorts_of_Haiti ] http://en.wikipedia.org/wiki/List_of_royal_consorts_of_Haiti
http://dbpedia.org/resource/List_of_rulers_of_the_Ngoni_Dynasty_of_Jere_(Qeko)  http://en.wikipedia.org/wiki/List_of_r

In [22]:
# Preview the first rows of the generated file. `gzip -dc` is used rather than
# `gzcat`, which is not installed on most Linux systems.
!gzip -dc dbpedia_data/long_abstracts.tsv.gz | head -10

node1	label	node2
http://en.wikipedia.org/wiki/!!!	abstract	"!!! (/tʃ(ɪ)k.tʃ(ɪ)k.tʃ(ɪ)k/ ch(i)k-ch(i)k-ch(i)k), also known as Chk Chk Chk, is an American rock band from Sacramento, California, formed in 1996 by lead singer Nic Offer. Members of !!! came from other local bands such as the Yah Mos, Black Liquorice and Pope Smashers. They are currently based in New York City. The band's eighth album, Wallop, was released in August 2019."@en
http://en.wikipedia.org/wiki/!!!_(album)	abstract	"!!! is the eponymous debut studio album by the dance-punk band of the same name. It was released in 2000 on Gold Standard Laboratories on vinyl, and saw wide release on CD on 19 June 2001."@en
http://en.wikipedia.org/wiki/!Action_Pact!	abstract	"!Action Pact! was a London-based punk rock band, formed in 1981 by guitarist Wild Planet, bassist , , and drummer . They would later break up in 1986."@en
http://en.wikipedia.org/wiki/!Arriba!_La_Pachanga	abstract	"!Arriba! La Pachanga is an album by Mongo Sant

In [19]:
# Convert the short-abstracts dump into a gzipped TSV edge file.
process(input_dbpedia_short_abstracts, output_short_abstracts)

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
http://dbpedia.org/resource/Cher_Special_Gigs . http://en.wikipedia.org/wiki/Cher_Special_Gigs
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
http://dbpedia.org/resource/Jonathon_W._G._Wills | http://en.wikipedia.org/wiki/Jonathon_W._G._Wills
2900000
3000000
3100000
3200000
3300000
http://dbpedia.org/resource/List_of_World_War_II_firearms_of_Germany . http://en.wikipedia.org/wiki/List_of_World_War_II_firearms_of_Germany
http://dbpedia.org/resource/List_of_educational_institutions_in_Palakkad_district . http://en.wikipedia.org/wiki/List_of_educational_institutions_in_Palakkad_district
http://dbpedia.org/resource/List_of_royal_consorts_of_Haiti ] http://en.wikipedia.org/wiki/List_of_royal_consorts_of_Haiti
http://dbpedia.org/resource/List_of_rulers_of_the_Ngoni_Dynasty_of_Jere_(Qeko)  http://en.wikipedia.org/wiki/List_of_r

In [20]:
# Preview the first rows of the generated file. `gzip -dc` is used rather than
# `gzcat`, which is not installed on most Linux systems.
!gzip -dc dbpedia_data/short_abstracts.tsv.gz | head -10

node1	label	node2
http://en.wikipedia.org/wiki/!!!	abstract	"!!! (/tʃ(ɪ)k.tʃ(ɪ)k.tʃ(ɪ)k/ ch(i)k-ch(i)k-ch(i)k), also known as Chk Chk Chk, is an American rock band from Sacramento, California, formed in 1996 by lead singer Nic Offer. Members of !!! came from other local bands such as the Yah Mos, Black Liquorice and Pope Smashers. They are currently based in New York City. The band's eighth album, Wallop, was released in August 2019."@en
http://en.wikipedia.org/wiki/!!!_(album)	abstract	"!!! is the eponymous debut studio album by the dance-punk band of the same name. It was released in 2000 on Gold Standard Laboratories on vinyl, and saw wide release on CD on 19 June 2001."@en
http://en.wikipedia.org/wiki/!Action_Pact!	abstract	"!Action Pact! was a London-based punk rock band, formed in 1981 by guitarist Wild Planet, bassist , , and drummer . They would later break up in 1986."@en
http://en.wikipedia.org/wiki/!Arriba!_La_Pachanga	abstract	"!Arriba! La Pachanga is an album by Mongo Sant

In [23]:
# Run `kgtk add-id` (wikidata id style) on the long-abstracts edge file and
# write the result to long_abstracts_ids.tsv.gz.
!kgtk add-id --id-style wikidata -i dbpedia_data/long_abstracts.tsv.gz -o dbpedia_data/long_abstracts_ids.tsv.gz

In [21]:
# Run `kgtk add-id` (wikidata id style) on the short-abstracts edge file and
# write the result to short_abstracts_ids.tsv.gz.
!kgtk add-id --id-style wikidata -i dbpedia_data/short_abstracts.tsv.gz -o dbpedia_data/short_abstracts_ids.tsv.gz

In [31]:
# Join the sitelinks graph with the long-abstract edges: for each
# `wikipedia_sitelink` edge n1 -> n2, follow the edges leaving n2 in the
# abstracts graph and emit (n1, "Plong_abstract", abstract).
# The result is written to $OUT/downloaded.wikipedia.long_abstracts.tsv.gz.
# NOTE(review): the `abstracts:` graph handle presumably resolves to the second
# input file by name; confirm against the Kypher documentation.
!kgtk query --gc $STORE \
    -i sitelinks -i dbpedia_data/long_abstracts_ids.tsv.gz \
    --match 'sitelinks: (n1)-[l:wikipedia_sitelink]->(n2), abstracts: (n2)-[]->(abstract)' \
    --return 'n1 as node1, "Plong_abstract" as label, abstract as node2' \
    -o $OUT/downloaded.wikipedia.long_abstracts.tsv.gz

In [22]:
# Join the sitelinks graph with the short-abstract edges: for each
# `wikipedia_sitelink` edge n1 -> n2, follow the edges leaving n2 in the
# abstracts graph and emit (n1, "Pshort_abstract", abstract).
# The result is written to $OUT/downloaded.wikipedia.short_abstracts.tsv.gz.
# NOTE(review): the `abstracts:` graph handle presumably resolves to the second
# input file by name; confirm against the Kypher documentation.
!kgtk query --gc $STORE \
    -i sitelinks -i dbpedia_data/short_abstracts_ids.tsv.gz \
    --match 'sitelinks: (n1)-[l:wikipedia_sitelink]->(n2), abstracts: (n2)-[]->(abstract)' \
    --return 'n1 as node1, "Pshort_abstract" as label, abstract as node2' \
    -o $OUT/downloaded.wikipedia.short_abstracts.tsv.gz

In [23]:
def analyze(abstract_file):
    """Print every node1 that carries more than one node2 in a gzipped TSV file.

    The file is read as tab-separated text. Column 0 is used as the key and
    column 2 as the value. The header row is treated like any other row; it
    only shows up in the report if its first-column value occurs again.
    Rows with fewer than three columns are skipped. After the report a final
    ``done`` line is printed.

    Args:
        abstract_file: path of the gzip-compressed, tab-separated edge file.
    """
    values_by_node = {}
    # `with` closes the gzip handle; the explicit encoding avoids depending on
    # the platform default.
    with gzip.open(abstract_file, 'rt', encoding='utf-8') as f:
        for line in f:
            # Only the line terminator is removed, so a trailing empty column
            # is kept instead of shifting the indices.
            vals = line.rstrip('\r\n').split('\t')
            if len(vals) < 3:
                continue
            values_by_node.setdefault(vals[0], []).append(vals[2])
    for qnode, node2_values in values_by_node.items():
        if len(node2_values) > 1:
            print(qnode, node2_values)
    print('done')

In [27]:
import gzip

In [28]:
# List the nodes that ended up with more than one short abstract in the
# query output written under $OUT.
analyze(f"{os.environ['OUT']}/downloaded.wikipedia.short_abstracts.tsv.gz")

Q7948577 ['"WDSE may refer to: \\n* WDSE (TV), a PBS member station in Duluth, Minnesota, United States \\n* WDSE-FM, an adult album alternative radio station in Duluth, Minnesota, United States This disambiguation page lists articles about radio and/or television stations with the same/similar call signs or branding.If an internal link led you here, you may wish to change the link to point directly to the intended article."@en', '"WDSE, virtual and VHF digital channel 8, is a Public Broadcasting Service (PBS) member television station licensed to Duluth, Minnesota, United States, serving northeastern Minnesota, northwestern Wisconsin, and the far western portion of the Upper Peninsula of Michigan. Owned by the Duluth–Superior Area Educational Television Corporation, it is sister to adult album alternative radio station WDSE-FM (103.3). The two outlets share studios on rented space at the University of Minnesota Duluth; the television station\'s transmitter is located west of downtown 