## Geração de Datasets RDF para treinamento de sistemas conversacionais

In [1]:
!pip install rdflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdflib
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 5.2 MB/s 
[?25hCollecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 330 kB/s 
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.2.0


### Load Dataset


In [7]:
from rdflib import Graph

# Create an empty graph and populate it from the Turtle file.
# The parse call stays the last expression of the cell so the notebook keeps
# echoing the resulting Graph object.
g = Graph()
g.parse(
    'sample_data/mondial_europe_dataset.ttl',
    format='ttl',
    encoding='utf-8',
)


<Graph identifier=N8613d5a32799404086f3cd5f7adba1c3 (<class 'rdflib.graph.Graph'>)>

### Quantidade de Triplas

In [8]:
# Size of the loaded graph (number of triples).
total_triples = len(g)
print(total_triples)

72450


### Ver as 50 primeiras triplas

In [6]:
from itertools import islice

# How many triples to print; a named constant instead of a bare countdown value.
PREVIEW_LIMIT = 50

# Iterating the graph yields (subject, predicate, object) tuples. islice stops
# after PREVIEW_LIMIT items (or earlier if the graph is smaller), replacing the
# manual decrement-and-break counter.
for s, p, o in islice(g, PREVIEW_LIMIT):
    print(s, p, o)


http://www.semwebtech.org/mondial/10/sources/Kamchatka/ http://www.semwebtech.org/mondial/10/meta#locatedIn http://www.semwebtech.org/mondial/10/countries/R/provinces/Kamchatka/
http://www.semwebtech.org/mondial/10/countries/BG/ http://www.semwebtech.org/mondial/10/meta#isMember http://www.semwebtech.org/mondial/10/organizations/OSCE/
http://www.semwebtech.org/mondial/10/countries/TR/ http://www.semwebtech.org/mondial/10/meta#hadPopulation nf705a3f950614a4ea8cee3a6011eb976b754
nf705a3f950614a4ea8cee3a6011eb976b5466 http://www.semwebtech.org/mondial/10/meta#year 1989
nf705a3f950614a4ea8cee3a6011eb976b9137 http://www.semwebtech.org/mondial/10/meta#type member
http://www.semwebtech.org/mondial/10/countries/IRL/ http://www.semwebtech.org/mondial/10/meta#isMember http://www.semwebtech.org/mondial/10/organizations/ICJ/
nf705a3f950614a4ea8cee3a6011eb976b9622 http://www.semwebtech.org/mondial/10/meta#ofMember http://www.semwebtech.org/mondial/10/countries/GR/
nf705a3f950614a4ea8cee3a6011eb976b

### Importação dos prefixos e criação de um novo prefixo para ser utilizado na consulta SPARQL

In [17]:
from rdflib.namespace import RDF, RDFS, XSD, OWL
from rdflib import Namespace

# Namespace object for the Mondial meta vocabulary; bound below as "mon".
site = Namespace("http://www.semwebtech.org/mondial/10/meta#")

# Register each prefix on the graph so SPARQL queries can use the short forms.
for prefix_name, prefix_ns in (
    ("owl", OWL),
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("mon", site),
):
    g.bind(prefix_name, prefix_ns)


### Testando uma consulta SPARQL

In [132]:
# Trial query: for every resource typed mon:Country, select its name (?Y) and
# the year (?W) attached to each of its mon:hadPopulation entries.
sparqlQueries = {
    "x": """
     select ?Y ?W   where {?X rdf:type mon:Country; mon:name ?Y;
      mon:hadPopulation ?Z . ?Z mon:year ?W .

       }
    """,
}
qres = g.query(sparqlQueries["x"])

# Print one result row per line.
for result_row in qres:
    print(result_row)

(rdflib.term.Literal('Albania'), rdflib.term.Literal('1950', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('1960', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('1970', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('1980', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('1990', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('1997', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('2000', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear')))
(rdflib.term.Literal('Albania'), rdflib.term.Literal('2001', datatype

### Aplicação

#### Definição: Ordem de geração, pronomes e domínio especificados

In [118]:
# Order of utterances inside a paragraph: position 1 is a self-explanatory
# ("SE") utterance, position 2 a not-self-explanatory ("NSE") one.
orderToGenerate = [
    (1, "SE"),
    (2, "NSE"),
]

# Candidate replacements for the [PRO] placeholder.
pronoms = (
    "his/her",
    "your",
)

# Top-level keys of the template dictionary that a paragraph can be drawn from.
domains = (
    "Country",
    "Organization",
    "City",
)

#### Definição: Template dos enunciados

In [119]:
# Every utterance template, organised as domain -> property -> kind.
#   "SE"  : self-explanatory wording (uses the [SUJ] placeholder)
#   "NSE" : not-self-explanatory wording (typically uses [PRO] instead)
# Placeholders [SUJ], [OBJ] and [PRO] are substituted during lexicalization.
#
# Fixes applied to the templates:
#   * infantMortality/NSE asked about the car code (copy-paste of the carCode
#     entry); it now asks about the infant mortality rate.
#   * isCompost/NSE used "[PRO]s", which expands to "yours"/"his/hers"; the
#     stray "s" is removed.
dictionaryTemplateFromData = {
    "Country": {
        "government": {
            "SE": "What is the government of [SUJ]?",
            "NSE": "What is [PRO] government?",
        },
        "area": {
            "SE": "What is the area of [SUJ]?",
            "NSE": "What is [PRO] area?",
        },
        "capital": {
            "SE": "What is the capital of [SUJ]?",
            "NSE": "What is [PRO] capital?",
        },
        "carCode": {
            "SE": "What is car code of [SUJ]?",
            "NSE": "What is [PRO] car code?",
        },
        "infantMortality": {
            "SE": "What is infant mortality rate of [SUJ]?",
            "NSE": "What is [PRO] infant mortality rate?",
        },
        "hasCity": {
            "SE": "[SUJ] has city of [OBJ]?",
            "NSE": "[OBJ] is [PRO] city?",
        },
        "hadPopulation": {
            "SE": "What was the total population in [OBJ] of [SUJ]?",
            "NSE": "What was [PRO] total population in [OBJ]?",
        },
        "isMember": {
            "SE": "[SUJ] is a member of which organization?",
            "NSE": "What is [PRO] organization?",
        },
    },
    "City": {
        "locatedIn": {
            "SE": "What is location of [SUJ]?",
            "NSE": "What is [PRO] location?",
        },
        "isCapital": {
            "SE": "[SUJ] is capital?",
            "NSE": "Is a capital?",
        },
        "checkCapital": {
            "SE": "[SUJ] is capital of [OBJ]?",
            "NSE": "Is It capital of [OBJ]?",
        },
    },
    "Organization": {
        "abbrev": {
            "SE": "What is the abbreviation of [SUJ]?",
            "NSE": "What is [PRO] abbreviation?",
        },
        "isCompost": {
            "SE": "What are countries of [SUJ]?",
            "NSE": "What are [PRO] countries?",
        },
        "checkOrganization": {
            "SE": "[SUJ] has [OBJ]?",
            "NSE": "Does this organization have [OBJ]?",
        },
    },
}

#### Definição: Todas as consultas SPARQL possíveis baseadas nos templates

In [133]:
# Content selection
from rdflib.namespace import RDF, RDFS, XSD, OWL
from rdflib import Namespace

site = Namespace("http://www.semwebtech.org/mondial/10/meta#")

# Bind the prefixes used by the queries below.
for prefix_name, prefix_ns in (
    ("owl", OWL),
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("mon", site),
):
    g.bind(prefix_name, prefix_ns)

# Each name below holds the object returned by g.query for one SELECT query.
# Single-column queries return a name; two-column queries return the values
# that fill [SUJ] (first column) and [OBJ] (second column) during lexicalization.
getCountry = g.query(
    """select ?Y  where {?X rdf:type mon:Country; mon:name ?Y }"""
)
getCountryAndYear = g.query(
    """ select ?Y ?W  where {?X rdf:type mon:Country; mon:name ?Y; 
  mon:hadPopulation ?Z . ?Z mon:year ?W 
 } """
)
getCountryAndCity = g.query(
    """ select ?W ?A  where {?X rdf:type mon:Country; mon:hasCity ?Z; mon:name ?W .  ?Z rdf:type mon:City ; mon:name ?A }"""
)
getCity = g.query(
    """select ?Y  where {?X rdf:type mon:City; mon:name ?Y }"""
)
getCityAndCountry = g.query(
    """ select ?A ?W  where {?X rdf:type mon:Country; mon:hasCity ?Z; mon:name ?W .  ?Z rdf:type mon:City ; mon:name ?A }"""
)
getOrganization = g.query(
    """select ?Y  where {?X rdf:type mon:Organization; mon:name ?Y }"""
)
getOrganizationAndCountry = g.query(
    """select ?O ?W  where {?X rdf:type mon:Country; mon:isMember ?Z; mon:name ?W . ?Z rdf:type mon:Organization ; mon:name ?O }"""
)

# Map every template property to the query result that supplies its values.
# The leading Country properties only need a country name, so they share getCountry.
sparqlQueries = dict.fromkeys(
    ("government", "area", "capital", "carCode", "infantMortality"),
    getCountry,
)
sparqlQueries.update({
    "hasCity": getCountryAndCity,
    "hadPopulation": getCountryAndYear,
    "isMember": getCountry,
    "locatedIn": getCity,
    "isCapital": getCity,
    "checkCapital": getCityAndCountry,
    "abbrev": getOrganization,
    "isCompost": getOrganization,
    "checkOrganization": getOrganizationAndCountry,
})

#### Seleção de templates

In [134]:
# Template selection
import random

totalExamples = 4
utterances = []
for paragraph_number in range(1, totalExamples + 1):
    # One randomly chosen domain per paragraph.
    domain = random.choice(domains)
    remaining_properties = list(dictionaryTemplateFromData[domain])
    for sentence_number, sentence_kind in orderToGenerate:
        chosen_property = random.choice(remaining_properties)
        template = dictionaryTemplateFromData[domain][chosen_property][sentence_kind]
        # Entry layout: ["<paragraph>.<sentence>", property, template text].
        utterances.append(
            [f"{paragraph_number}.{sentence_number}", chosen_property, template]
        )
        # Remove the property so it is not drawn again within this paragraph.
        remaining_properties.remove(chosen_property)

print(utterances)



[['1.1', 'hadPopulation', 'What was the total population in [OBJ] of [SUJ]?'], ['1.2', 'area', 'What is [PRO] area?'], ['2.1', 'checkCapital', '[SUJ] is capital of [OBJ]?'], ['2.2', 'locatedIn', 'What is [PRO] location?'], ['3.1', 'isCapital', '[SUJ] is capital?'], ['3.2', 'locatedIn', 'What is [PRO] location?'], ['4.1', 'locatedIn', 'What is location of [SUJ]?'], ['4.2', 'isCapital', 'Is a capital?']]


#### Seleção de Conteúdo

In [135]:


# For each selected template, draw one random row from the query result mapped
# to its property. The three lists are index-aligned: row, template text, code.
resultSparql, finalUtterances, codes = [], [], []
for code, property_selected, utterance in utterances:
    candidate_rows = [binding for binding in sparqlQueries[property_selected]]
    resultSparql.append(random.choice(candidate_rows))
    finalUtterances.append(utterance)
    codes.append(code)

print(resultSparql)

[(rdflib.term.Literal('Finland'), rdflib.term.Literal('2017', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#gYear'))), (rdflib.term.Literal('Serbia'),), (rdflib.term.Literal('Nancy'), rdflib.term.Literal('France')), (rdflib.term.Literal('Akisar'),), (rdflib.term.Literal('Rochdale'),), (rdflib.term.Literal('Amersfoort'),), (rdflib.term.Literal('Konya'),), (rdflib.term.Literal('Bielsko-BiaÅ‚a'),)]


#### Lexicalização e Geração de Expressão

In [137]:
# Lexicalization and referring-expression generation: substitute the
# placeholders of each selected template with values from its SPARQL row and
# attach a label to the resulting utterance.
result = []
for position, (template, row_values, code) in enumerate(
    zip(finalUtterances, resultSparql, codes)
):
    text = template
    label = "PREVIOUS TOPIC"  # default when no placeholder below changes it

    if "[SUJ]" in text:
        # The subject is named explicitly, so the utterance stands on its own.
        label = "SELF EXPLANATORY"
        text = text.replace("[SUJ]", str(row_values[0]))

    if "[OBJ]" in text:
        text = text.replace("[OBJ]", str(row_values[1]))

    if "[PRO]" in text:
        # NOTE(review): position counts across all paragraphs, so only the
        # second utterance overall is labelled FIRST TOPIC; confirm whether the
        # check should be per paragraph instead.
        if position == 1:
            label = "FIRST TOPIC"
        text = text.replace("[PRO]", random.choice(pronoms))

    result.append((code, text, label))

print(result)

[('1.1', 'What was the total population in 2017 of Finland?', 'SELF EXPLANATORY'), ('1.2', 'What is your area?', 'FIRST TOPIC'), ('2.1', 'Nancy is capital of France?', 'SELF EXPLANATORY'), ('2.2', 'What is your location?', 'PREVIOUS TOPIC'), ('3.1', 'Rochdale is capital?', 'SELF EXPLANATORY'), ('3.2', 'What is his/her location?', 'PREVIOUS TOPIC'), ('4.1', 'What is location of Konya?', 'SELF EXPLANATORY'), ('4.2', 'Is a capital?', 'PREVIOUS TOPIC')]


### Testes

In [40]:
# TESTS: scratch checks of dict keys, placeholder replacement and random choice.
import random

d = {'Name': 'Zabra', 'Age': 7, 'Year': 1237, "Surname":"JR"}
row = [("Albania",)]
print(d.keys())

txt = "What is the government of [SUJ]?"
placeholder = "[SUJ]"
print(placeholder in txt)
print(txt.replace(placeholder, row[0][0]))

# The first draw (a random key of d) is overwritten right away by the pronoun
# draw; both calls are kept so the random generator is consumed the same way.
data = random.choice(list(d))
pronoms = ("his/her","your")
data = random.choice(pronoms)
print(data)

dict_keys(['Name', 'Age', 'Year', 'Surname'])
True
What is the government of Albania?
his/her


In [49]:
# TESTS: dry run of the lexicalization logic on hand-written utterances/rows.
# Imported here so the cell does not depend on another cell having run first.
import random

pronoms = ("his/her","your")
utterances = ["What is the government of [SUJ]?", "What was [PRO] total population in [OBJ]?", "What was the total population in [OBJ] of [SUJ]?"]
rows = [("Albania",), ("Albania", "1991"), ("Alemanha", "1950")]
result = []
# Iterate over the utterances paired with their rows. The loop previously ran
# over range(len(txt)), where txt is a string defined in a different cell, so a
# fresh run indexed past the end of the list (or hit an undefined name).
for i, (t, r) in enumerate(zip(utterances, rows)):
    # Reset the label on every iteration so an utterance without [SUJ]/[PRO]
    # cannot inherit the previous iteration's label (or hit an unset name).
    label = "PREVIOUS TOPIC"

    if "[SUJ]" in t:
        label = "SELF EXPLANATORY"
        t = t.replace("[SUJ]", str(r[0]))

    if "[OBJ]" in t:
        t = t.replace("[OBJ]", str(r[1]))

    if "[PRO]" in t:
        # Only the second utterance (index 1) is labelled FIRST TOPIC.
        label = "FIRST TOPIC" if i == 1 else "PREVIOUS TOPIC"
        t = t.replace("[PRO]", random.choice(pronoms))

    result.append((t, label))

print(result)


[('What is the government of Albania?', 'SELF EXPLANATORY'), ('What was your total population in 1991?', 'FIRST TOPIC'), ('What was the total population in 1950 of Alemanha?', 'SELF EXPLANATORY')]
