In [1]:
import ast
import json
import os
import pickle
import sqlite3

import pandas as pd

import wikidata_utils as wdutils

# Setting up data access

In [2]:
# The SQLite connection is opened in the cell that runs the per-theme queries,
# so the connection setup below is left disabled here.
#db = sqlite3.connect('../wikidata_claims_refs_parsed.db')
#cursor = db.cursor()
# Column names assigned, in order, to the rows fetched from the `claims` table.
claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue']

In [3]:
# Wikidata API client from the project's wikidata_utils module; used below to
# resolve property IDs to labels via get_label.
# NOTE(review): presumably caches lookups across calls — confirm in wikidata_utils.
Wd_API = wdutils.CachedWikidataAPI()

In [4]:
# Load the per-theme entity lists. The loaded object is later indexed by root
# entity ID, and each entry is iterated as a collection of entity ID strings.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
with open('data/final_entity_list.p','rb') as f:
    final_entity_list = pickle.load(f)

In [5]:
# Themes covered by this notebook: root Wikidata entity ID -> theme label.
# The keys index final_entity_list and the per-theme dictionaries built below;
# in the cells below the labels are used for progress messages.
# Insertion order determines the order in which themes are processed.
root_entity_labels = {
    'Q1248784':'Airport',
    'Q11631':'Astronaut',
    'Q41176':'Building',
    'Q515':'City',
    'Q1114461':'ComicsCharacter',
    'Q2095':'Food',
    'Q4989906':'Monument',
    'Q12973014':'SportsTeam',
    'Q3918':'University',
    'Q47461344':'WrittenWork',
    'Q2066131':'Athlete',
    'Q483501':'Artist',
    'Q6999':'CelestialBody',
    'Q334166':'MeanOfTransportation',
    'Q82955':'Politician',
    'Q16521':'Taxon',
    'Q79007':'Street',
    'Q3305213':'Painting',
    'Q11173':'ChemicalCompound',
    'Q8502':'Mountain'
}

# Getting a dataframe of suitable claims for each theme

Suitable claims are those that:
1. Are not deprecated
2. Do not link to external IDs
3. Do not link to URLs
4. Do not use the P31 and P279 predicates (or other predicates later found to be unsuited for verbalisation)
5. Are not too niche (very few uses in this theme)

## Dealing with points 1 to 4

In [37]:
# Properties whose claims are dropped before verbalisation.
#   'general'  : property IDs removed for every theme.
#   'specific' : root entity ID -> property IDs removed only for that theme.
# Each entry's trailing comment gives the property's label.
properties_to_remove = {
    'general':[
        'P31', # instance of
        'P279',# subclass of
        'P373',# Commons category
        'P910',# topic's main category
        'P7561',# category for the interior of the item
        'P5008',# on focus list of Wikimedia project
        'P2670',# has parts of the class
        'P1740',# category for films shot at this location
        'P1612',# Commons Institution page
        'P8989',# category for the view of the item
        'P2959',# permanent duplicated item
        'P7867',# category for maps
        'P935' ,# Commons gallery
        'P1472',# Commons Creator page
        'P8596',# category for the exterior of the item
        'P5105',# Deutsche Bahn station category
        'P8933',# category for the view from the item
        'P642',# of
        'P3876',# category for alumni of educational institution
        'P1791',# category of people buried here
        'P7084',# related category
        'P1465',# category for people who died here
        'P1687',# Wikidata property
        'P6104',# maintained by WikiProject
        'P4195',# category for employees of the organization
        'P1792',# category of associated people
        'P5869',# model item
        'P1659',# see also
        'P1464',# category for people born here
        'P2354',# has list
        'P1424',# topic's main template
        'P7782',# category for ship name
        'P179',# part of the series
        'P7888',# merged into
        'P6365',# member category
        'P8464',# content partnership category
        'P360',# is a list of
        'P805',# statement is subject of
        'P8703',# entry in abbreviations table
        'P1456',# list of monuments
        'P1012',# including
        'P1151',# topic's main Wikimedia portal
        'P2490',# page at OSTIS Belarus Wiki
        'P593',# HomoloGene ID
        'P8744',# economy of topic
        'P2614',# World Heritage criteria
        'P2184',# history of topic
        'P9241',# demographics of topic
        'P487',# Unicode character
        'P1754',# category related to list
        'P2559',# Wikidata usage instructions
        'P2517',# category for recipients of this award
        'P971',# category combines topics
        'P6112',# category for members of a team
        'P4224',# category contains
        'P301',# category's main topic
        'P1753',# list related to category
        'P1423',# template has topic
        'P1204',# Wikimedia portal's main topic
        'P3921',# Wikidata SPARQL query equivalent
        'P1963',# properties for this type
        'P5125',# Wikimedia outline
        
    ],
    'specific': {}
}

# Start every theme with an empty theme-specific exclusion list so that later
# lookups by root entity ID never fail; individual themes are overridden below.
for theme in root_entity_labels.keys():
    properties_to_remove['specific'][theme] = []
    
# AIRPORT (currently disabled: no airport-specific exclusions are applied)
#properties_to_remove['specific']['Q1248784'] = [
#    'P585',# point in time
#    'P1545',# series ordinal
#]

# ASTRONAUT
# NOTE(review): the commented-out IDs below carry fictional-character labels
# (media franchise, sidekick of, ...), so they look copied from another theme —
# confirm before re-enabling any of them for astronauts.
properties_to_remove['specific']['Q11631'] = [
    #'P8345',# media franchise
    #'P2563',# superhuman feature or ability
    #'P2546',# sidekick of
    #'P144',# based on
    #'P7047',# enemy of
    #'P577',# publication date
    #'P941',# inspired by
    #'P4584',# first appearance
    #'P5800',# narrative role
    #'P1080',# from narrative universe
    #'P175',# performer
    #'P170',# creator
    #'P767',# contributor to the creative work or subject
    'P598',# commander of (DEPRECATED)
]



In [39]:
# For every theme, load all claims of the theme's entities from the parsed-claims
# SQLite database and drop the claims that are unsuitable for verbalisation:
# deprecated claims, non-verbalisable datatypes and blacklisted properties.
# Result: theme_dfs maps root entity ID -> filtered claims DataFrame.

# Datatypes removed from every theme (media files, external IDs, coordinates, URLs).
EXCLUDED_DATATYPES = ['commonsMedia','external-id','globe-coordinate','url']


def _sql_string_literal(value):
    """Return `value` as a single-quoted SQL string literal, doubling embedded quotes.

    Single quotes are the standard SQL string delimiter; double-quoted tokens are
    parsed by SQLite as identifiers first and only fall back to strings as a
    legacy quirk that can be disabled at build time.
    """
    return "'" + value.replace("'", "''") + "'"


db = sqlite3.connect('../wikidata_claims_refs_parsed.db')
cursor = db.cursor()

theme_dfs = {}

try:
    for theme, theme_label in root_entity_labels.items():

        print('Processing',theme_label)

        # One statement per theme keeps this a single pass over the claims table.
        entity_literals = ','.join(_sql_string_literal(e) for e in final_entity_list[theme])
        sql_query = 'select * from claims where entity_id in (' + entity_literals + ');'

        cursor.execute(sql_query)
        # Passing the column names to the constructor also works when the query
        # returns no rows; assigning .columns afterwards raises on an empty frame.
        theme_df = pd.DataFrame(cursor.fetchall(), columns=claims_columns)

        print('-    Removing deprecated')

        # Remove deprecated
        theme_df = theme_df[theme_df['rank'] != 'deprecated'].reset_index(drop=True)

        print('-    Removing bad datatypes')

        # Remove external_ids, commonsMedia (e.g. photos), globe-coordinates, urls
        theme_df = theme_df[~theme_df['datatype'].isin(EXCLUDED_DATATYPES)].reset_index(drop=True)

        print('-    Removing bad properties')

        # Remove specific properties such as P31 and P279, plus any theme-specific ones
        excluded_properties = (
            set(properties_to_remove['general'])
            | set(properties_to_remove['specific'][theme])
        )
        theme_df = theme_df[~theme_df['property_id'].isin(excluded_properties)].reset_index(drop=True)

        theme_dfs[theme] = theme_df
finally:
    # Always release the database handle, even if a theme fails part-way through.
    db.close()

Processing Airport
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing Astronaut
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing Building
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing City
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing ComicsCharacter
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing Food
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing Monument
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing SportsTeam
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing University
-    Removing deprecated
-    Removing bad datatypes
-    Removing bad properties
Processing WrittenWork
-    Removing deprecated
-    Removing bad datatypes
-  

In [46]:
# For every theme, build a table of how often each property occurs among the
# theme's remaining claims. Result: predicate_theme_dfs maps root entity ID ->
# DataFrame with columns property_id, frequency_count, property_label and
# frequency_percentage, ordered from most to least frequent property.
predicate_theme_dfs = {}

for theme, theme_df in theme_dfs.items():

    # Total number of claims in the theme; the denominator of the percentages.
    n_claims = theme_df.shape[0]

    # Count only the entity_id column rather than counting every column and
    # discarding all but one afterwards.
    predicate_theme_df = (
        theme_df.groupby('property_id')[['entity_id']].count()
        .sort_values('entity_id', ascending=False)
        .reset_index()
    )

    predicate_theme_df.columns = ['property_id','frequency_count']
    # Resolve each property ID to its label through the Wikidata API client.
    predicate_theme_df['property_label'] = predicate_theme_df['property_id'].apply(Wd_API.get_label)
    # Share of the theme's claims that use each property, in percent (vectorised).
    predicate_theme_df['frequency_percentage'] = predicate_theme_df['frequency_count'] / n_claims * 100

    predicate_theme_dfs[theme] = predicate_theme_df

In [48]:
# Persist, per theme, the filtered claims table and its property-frequency table
# as CSV files so that later stages can start from disk.
THEME_DFS_DIR = './data/theme_dfs/'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before writing (a no-op when it is already there).
os.makedirs(THEME_DFS_DIR, exist_ok=True)

for theme, theme_df in theme_dfs.items():
    theme_df.to_csv(THEME_DFS_DIR + theme + '_claim_theme_df.csv', index=False)
    predicate_theme_dfs[theme].to_csv(THEME_DFS_DIR + theme + '_predicate_theme_df.csv', index=False)

## Dealing with point 5 (excluding rare predicates)

**Computation can be initiated directly from here**