In [28]:
import pandas as pd
import numpy as np
import os
# Let long text columns (questions, SPARQL queries) print in full instead of being cut off.
pd.options.display.max_colwidth = 1000

In [2]:
def analyze(filename):
    """Print per-template prediction statistics for a results CSV and return the data.

    The file is read as a header-less CSV whose two columns are named
    ``pred`` and ``label``. For every distinct value in ``label`` (most
    frequent first) one summary line is printed with:

    * ``count``: number of rows carrying that label,
    * ``correct``: rows of that label where ``pred == label``,
    * ``accuracy``: ``correct / count`` (i.e. computed per label),
    * ``false_positives``: rows predicted as this template whose label differs,
    * ``false_positive_rate``: ``false_positives`` divided by the number of
      rows predicted as this template (0 when it was never predicted).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with columns ``['pred', 'label']``.
    """
    # header=None means "the file has no header row". The former spelling,
    # header=-1, is rejected with a ValueError by recent pandas versions.
    df = pd.read_csv(filename, header=None)
    df.columns = ['pred', 'label']

    # value_counts() orders the labels from most to least frequent.
    for template_id in df['label'].value_counts().index:
        subset = df[df['label'] == template_id]
        correct = (subset['pred'] == subset['label']).sum()
        accuracy = correct / len(subset)

        predicted_as_template = df[df['pred'] == template_id]
        false_positives = (predicted_as_template['pred'] != predicted_as_template['label']).sum()
        # max(..., 1) avoids dividing by zero for templates that were never predicted.
        false_positive_rate = false_positives / max(len(predicted_as_template), 1)

        print("=" * 50)
        print("Template #{}: count={}, correct={}, false_positives={}, accuracy={}, false_positive_rate={}"
              .format(template_id, len(subset), correct, false_positives, accuracy, false_positive_rate))
        print("=" * 50)
    return df

In [23]:
# Prediction dump to inspect; the run's settings are encoded in the file name.
predictions_csv = "analysis/input_dim=85,mem_dim=150,epochs=15,current_epoch=6,test_acc=0.754.csv"
df = analyze(predictions_csv)

Template #1: count=261, correct=205, false_positives=35, accuracy=0.7854406130268199, false_positive_rate=0.14583333333333334
Template #5: count=205, correct=155, false_positives=68, accuracy=0.7560975609756098, false_positive_rate=0.30493273542600896
Template #16: count=104, correct=85, false_positives=8, accuracy=0.8173076923076923, false_positive_rate=0.08602150537634409
Template #8: count=79, correct=66, false_positives=11, accuracy=0.8354430379746836, false_positive_rate=0.14285714285714285
Template #151: count=73, correct=70, false_positives=7, accuracy=0.958904109589041, false_positive_rate=0.09090909090909091
Template #3: count=69, correct=30, false_positives=18, accuracy=0.43478260869565216, false_positive_rate=0.375
Template #105: count=47, correct=37, false_positives=13, accuracy=0.7872340425531915, false_positive_rate=0.26
Template #15: count=40, correct=25, false_positives=8, accuracy=0.625, false_positive_rate=0.24242424242424243
Template #7: count=26, correct=24, false_p

In [27]:
template = 5
# Rows predicted as `template` whose label is different, tallied by that label.
wrongly_predicted = df['pred'].eq(template) & df['label'].ne(template)
df.loc[wrongly_predicted, 'label'].value_counts()

1      33
3      24
106     3
16      2
111     1
105     1
102     1
101     1
151     1
8       1
Name: label, dtype: int64

In [16]:
# header=None: the file has no header row (header=-1 raises ValueError in recent pandas).
df = pd.read_csv("analysis/pos * rels/input_dim=300,mem_dim=150,epochs=15,current_epoch=15,train_acc=0.90625.csv", header=None)

In [17]:
df.columns = ['pred', 'label'] # name the two columns of the loaded CSV

In [19]:
# Relative frequency of every value in the `label` column, most common first.
label_column = df['label']
label_column.value_counts(normalize=True)

2      0.14925
305    0.11550
16     0.10225
308    0.06675
301    0.06225
3      0.05200
5      0.03950
15     0.03775
152    0.03625
151    0.03500
306    0.03475
105    0.03450
1      0.03225
303    0.02500
405    0.01925
6      0.01925
311    0.01750
401    0.01625
101    0.01550
111    0.01550
406    0.01325
307    0.01325
7      0.01250
8      0.00775
106    0.00450
102    0.00400
11     0.00325
403    0.00275
108    0.00275
103    0.00250
315    0.00200
402    0.00175
107    0.00125
316    0.00125
605    0.00050
906    0.00025
9      0.00025
Name: label, dtype: float64

In [4]:
# Read both LC-QuAD splits, then stack them into one frame with a fresh 0..n-1 index.
df_train, df_test = (
    pd.read_json(json_path)
    for json_path in ("data/lc-quad/train-data.json", "data/lc-quad/test-data.json")
)
df = pd.concat([df_train, df_test], ignore_index=True)

In [7]:
# Share of rows per SPARQL template id, most frequent first.
columns = df['sparql_template_id'].value_counts(normalize=True)

# Count how many of the most frequent templates are needed to reach 80% of the rows.
# The accumulator is intentionally not named `sum`: rebinding that name at notebook
# level would hide the builtin sum() from every cell executed afterwards.
cumulative_share = 0
index = 0
for percent in columns:
    cumulative_share += percent
    index += 1
    if cumulative_share >= 0.8:
        break

# Keep only the rows belonging to the 7 most frequent templates.
# NOTE(review): the cut-off is the fixed 7, not the `index` computed above -
# confirm which of the two is intended.
df2 = df[df['sparql_template_id'].isin(columns.index[:7].tolist())]

In [8]:
# Relative frequency of each template id within the filtered frame.
subset_template_ids = df2['sparql_template_id']
subset_template_ids.value_counts(normalize=True)

2      0.253302
305    0.190992
16     0.177108
308    0.113105
301    0.104639
3      0.088723
5      0.072130
Name: sparql_template_id, dtype: float64

In [44]:
# First 30 questions whose template id is 5 (single .loc lookup instead of chained indexing).
df2.loc[df2['sparql_template_id'] == 5, 'corrected_question'].head(n=30)

8                                                                 Starwood operates in which places?
23                                          Which company owns the manufacturer of the Edsel Ranger?
51                            In which teams did anyone who played as a defencemen, play previously?
54                                     Who authored the works published in the Jump Square Magazine?
101                                                    What awards have been given to Screenwriters?
191                                                     Where do employees of Deroit Red Wings live?
228                       For which other teams do members of tanzanian national football team play?
275                                                  UEM group have made bridges crossing over what?
277                                                                  Where do bachelor of arts live?
337                     What does the famous relative of Levon Ashotovich Grigorian do for 

In [41]:
# First 30 questions whose template id is 305 (single .loc lookup instead of chained indexing).
df2.loc[df2['sparql_template_id'] == 305, 'corrected_question'].head(n=30)

7                                      What layout can be found in cars similar to the Subaru Outback?
14                                     List the things for which people of New Orleans are famous for?
15                                            What sports are played in schools in the capital region?
18                          which awards have been given to people who fought in the Battle of France?
20                          Who built the stadium which was rented for the 2013 Copa Centroamericana ?
31                                                 Which genre of books are published by Random House?
35                                                     What are some Louisiana based models known for?
40     What is the religion of the member of parliament who is a relative of Sarathchandra Rajakaruna?
46                                   What are some other products of the banks which makes Postbanken?
48                                                        Who produces th

In [38]:
# First 20 SPARQL queries whose template id is 305.
df2.loc[df2['sparql_template_id'] == 305, 'sparql_query'].head(n=20)

7                           SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/related> <http://dbpedia.org/resource/Subaru_Outback> . ?x <http://dbpedia.org/ontology/layout> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Automobile>}
14                          SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/placeOfBirth> <http://dbpedia.org/resource/New_Orleans> . ?x <http://dbpedia.org/property/knownFor> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Person>}
15                                SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/region> <http://dbpedia.org/resource/Capital_region> . ?x <http://dbpedia.org/ontology/sport> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/School>}
18                              SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/ontology/battle> <http://dbpedia.org/resource/Battle

In [39]:
# First 20 SPARQL queries whose template id is 5.
df2.loc[df2['sparql_template_id'] == 5, 'sparql_query'].head(n=20)

8                                 SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/operator> <http://dbpedia.org/resource/Starwood_Hotels_and_Resorts_Worldwide> . ?x <http://dbpedia.org/property/address> ?uri  . }
23                                                          SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/products> <http://dbpedia.org/resource/Edsel_Ranger> . ?x <http://dbpedia.org/property/parent> ?uri  . }
51                                                        SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/position> <http://dbpedia.org/resource/Defenceman> . ?x <http://dbpedia.org/ontology/formerTeam> ?uri  . }
54                                                           SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/magazine> <http://dbpedia.org/resource/Jump_Square> . ?x <http://dbpedia.org/property/author> ?uri  . }
101                                                       SELECT DISTINCT ?uri WHERE { ?x <http:

In [31]:
# Build the dataset directory relative to the working directory.
data_dir = os.path.join('./', 'data')
lc_quad_dir = os.path.join(data_dir, 'lc-quad')

# Read the train and test splits, then stack them with a fresh 0..n-1 index.
df_train, df_test = (
    pd.read_json(os.path.join(lc_quad_dir, split_file))
    for split_file in ("train-data.json", "test-data.json")
)
df = pd.concat([df_train, df_test], ignore_index=True)

In [34]:
# Row counts for a hand-picked set of template ids.
in_selected_templates = df['sparql_template_id'].isin([306, 6, 305, 5, 303, 3, 311, 11])
df.loc[in_selected_templates, 'sparql_template_id'].value_counts()

305    564
3      262
5      213
306    175
303    115
6       94
311     76
11      20
Name: sparql_template_id, dtype: int64