In [51]:
import pandas as pd
import numpy as np
import os
import nltk
# Let pandas render up to 1000 characters per column value so long strings are
# not abbreviated when a frame or series is displayed.
pd.set_option('display.max_colwidth', 1000)

In [2]:
def analyze(filename):
    """Print per-template statistics for a predictions CSV and return the data.

    The file is read as a header-less CSV and its two columns are named
    ``pred`` and ``label``. For every distinct ``label`` value, most frequent
    first, one report line is printed containing:

    * ``count`` - rows whose label is that template
    * ``correct`` - of those rows, how many have ``pred == label``
    * ``accuracy`` - ``correct / count``
    * ``false_positives`` - rows predicted as that template whose label differs
    * ``false_positive_rate`` - ``false_positives`` divided by the number of
      rows predicted as that template (0 when it is never predicted)

    Parameters
    ----------
    filename : str
        Path of a CSV file with no header row and exactly two columns.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with columns ``['pred', 'label']``.
    """
    # header=None is the supported way to say "no header row"; the former
    # header=-1 spelling is rejected by current pandas releases.
    df = pd.read_csv(filename, header=None)
    df.columns = ['pred', 'label']

    separator = "=" * 50

    # `template_id` instead of `id`, so the built-in id() is not shadowed.
    for template_id in df['label'].value_counts().index:
        subset = df[df['label'] == template_id]
        correct = (subset['pred'] == subset['label']).sum()
        # subset is never empty: template_id comes from the label column itself.
        accuracy = correct / len(subset)

        false_positive_subset = df[df['pred'] == template_id]
        false_positives = (false_positive_subset['pred'] != false_positive_subset['label']).sum()
        # max(..., 1) guards the division when the template is never predicted.
        false_positive_rate = false_positives / max(len(false_positive_subset), 1)

        print(separator)
        print("Template #{}: count={}, correct={}, false_positives={}, accuracy={}, false_positive_rate={}"
              .format(template_id, len(subset), correct, false_positives, accuracy, false_positive_rate))
        print(separator)
    return df

In [56]:
# Print the per-template report for one saved predictions file and keep the
# returned frame in `df` for the follow-up cells.
df = analyze("analysis/input_dim=85,mem_dim=150,epochs=15,current_epoch=1,test_acc=0.658.csv")

Template #1: count=261, correct=189, false_positives=43, accuracy=0.7241379310344828, false_positive_rate=0.1853448275862069
Template #3: count=223, correct=181, false_positives=126, accuracy=0.8116591928251121, false_positive_rate=0.41042345276872966
Template #16: count=104, correct=77, false_positives=46, accuracy=0.7403846153846154, false_positive_rate=0.37398373983739835
Template #8: count=79, correct=48, false_positives=12, accuracy=0.6075949367088608, false_positive_rate=0.2
Template #151: count=73, correct=66, false_positives=12, accuracy=0.9041095890410958, false_positive_rate=0.15384615384615385
Template #105: count=70, correct=59, false_positives=44, accuracy=0.8428571428571429, false_positive_rate=0.42718446601941745
Template #6: count=51, correct=4, false_positives=5, accuracy=0.0784313725490196, false_positive_rate=0.5555555555555556
Template #15: count=40, correct=22, false_positives=34, accuracy=0.55, false_positive_rate=0.6071428571428571
Template #7: count=26, correct=

In [57]:
# For rows predicted as `template` but labelled otherwise, count the true labels.
template = 3
df.loc[df['pred'].eq(template) & df['label'].ne(template), 'label'].value_counts()

1      48
6      36
11     11
16      9
105     6
111     3
102     3
101     3
151     3
8       2
103     1
7       1
Name: label, dtype: int64

In [16]:
# Load a header-less predictions CSV. header=None is the supported spelling for
# "no header row"; header=-1 is rejected by current pandas releases.
df = pd.read_csv("analysis/pos * rels/input_dim=300,mem_dim=150,epochs=15,current_epoch=15,train_acc=0.90625.csv", header=None)

In [17]:
# Name the two columns of the header-less frame, in file order: 'pred', then 'label'.
df.columns = ['pred', 'label']

In [19]:
# Share of rows per label value, most frequent first (normalize=True gives fractions).
df['label'].value_counts(normalize=True)

2      0.14925
305    0.11550
16     0.10225
308    0.06675
301    0.06225
3      0.05200
5      0.03950
15     0.03775
152    0.03625
151    0.03500
306    0.03475
105    0.03450
1      0.03225
303    0.02500
405    0.01925
6      0.01925
311    0.01750
401    0.01625
101    0.01550
111    0.01550
406    0.01325
307    0.01325
7      0.01250
8      0.00775
106    0.00450
102    0.00400
11     0.00325
403    0.00275
108    0.00275
103    0.00250
315    0.00200
402    0.00175
107    0.00125
316    0.00125
605    0.00050
906    0.00025
9      0.00025
Name: label, dtype: float64

In [77]:
# Read the train and test JSON files and stack them into one frame with a fresh index.
df_train, df_test = map(
    pd.read_json,
    ("data/lc-quad/train-data.json", "data/lc-quad/test-data.json"),
)
df = pd.concat((df_train, df_test), ignore_index=True)

In [7]:
# Count how many of the most frequent templates are needed to reach 80% of the
# rows. The running total lives in `cumulative_share` so that the built-in
# sum() is not shadowed for the rest of the kernel session.
cumulative_share = 0
index = 0
columns = df['sparql_template_id'].value_counts(normalize=True)
for percent in columns:
    cumulative_share += percent
    index += 1
    if cumulative_share >= 0.8:
        break

# NOTE(review): the selection keeps a fixed top 7 templates and ignores the
# `index` computed above -- confirm whether `columns.index[:index]` was intended.
df2 = df[df['sparql_template_id'].isin(columns.index[:7].tolist())]

In [8]:
# Share of rows per template id within the filtered frame `df2`.
df2['sparql_template_id'].value_counts(normalize=True)

2      0.253302
305    0.190992
16     0.177108
308    0.113105
301    0.104639
3      0.088723
5      0.072130
Name: sparql_template_id, dtype: float64

In [44]:
# First 30 questions whose template id is 5.
df2.loc[df2['sparql_template_id'].eq(5), 'corrected_question'].head(30)

8                                                                 Starwood operates in which places?
23                                          Which company owns the manufacturer of the Edsel Ranger?
51                            In which teams did anyone who played as a defencemen, play previously?
54                                     Who authored the works published in the Jump Square Magazine?
101                                                    What awards have been given to Screenwriters?
191                                                     Where do employees of Deroit Red Wings live?
228                       For which other teams do members of tanzanian national football team play?
275                                                  UEM group have made bridges crossing over what?
277                                                                  Where do bachelor of arts live?
337                     What does the famous relative of Levon Ashotovich Grigorian do for 

In [41]:
# First 30 questions whose template id is 305.
df2.loc[df2['sparql_template_id'].eq(305), 'corrected_question'].head(30)

7                                      What layout can be found in cars similar to the Subaru Outback?
14                                     List the things for which people of New Orleans are famous for?
15                                            What sports are played in schools in the capital region?
18                          which awards have been given to people who fought in the Battle of France?
20                          Who built the stadium which was rented for the 2013 Copa Centroamericana ?
31                                                 Which genre of books are published by Random House?
35                                                     What are some Louisiana based models known for?
40     What is the religion of the member of parliament who is a relative of Sarathchandra Rajakaruna?
46                                   What are some other products of the banks which makes Postbanken?
48                                                        Who produces th

In [38]:
# First 20 SPARQL queries whose template id is 305.
df2.loc[df2['sparql_template_id'].eq(305), 'sparql_query'].head(20)

7                           SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/related> <http://dbpedia.org/resource/Subaru_Outback> . ?x <http://dbpedia.org/ontology/layout> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Automobile>}
14                          SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/placeOfBirth> <http://dbpedia.org/resource/New_Orleans> . ?x <http://dbpedia.org/property/knownFor> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Person>}
15                                SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/region> <http://dbpedia.org/resource/Capital_region> . ?x <http://dbpedia.org/ontology/sport> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/School>}
18                              SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/ontology/battle> <http://dbpedia.org/resource/Battle

In [39]:
# First 20 SPARQL queries whose template id is 5.
df2.loc[df2['sparql_template_id'].eq(5), 'sparql_query'].head(20)

8                                 SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/operator> <http://dbpedia.org/resource/Starwood_Hotels_and_Resorts_Worldwide> . ?x <http://dbpedia.org/property/address> ?uri  . }
23                                                          SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/products> <http://dbpedia.org/resource/Edsel_Ranger> . ?x <http://dbpedia.org/property/parent> ?uri  . }
51                                                        SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/position> <http://dbpedia.org/resource/Defenceman> . ?x <http://dbpedia.org/ontology/formerTeam> ?uri  . }
54                                                           SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/magazine> <http://dbpedia.org/resource/Jump_Square> . ?x <http://dbpedia.org/property/author> ?uri  . }
101                                                       SELECT DISTINCT ?uri WHERE { ?x <http:

In [31]:
# Build the dataset directory paths, read both JSON splits, and stack them
# into one frame with a fresh index.
data_dir = os.path.join('./', 'data')
lc_quad_dir = os.path.join(data_dir, 'lc-quad')
df_train, df_test = (
    pd.read_json(os.path.join(lc_quad_dir, split_file))
    for split_file in ("train-data.json", "test-data.json")
)
df = pd.concat((df_train, df_test), ignore_index=True)

In [34]:
# Row counts for a hand-picked set of template ids.
df.loc[
    df['sparql_template_id'].isin([306, 6, 305, 5, 303, 3, 311, 11]),
    'sparql_template_id',
].value_counts()

305    564
3      262
5      213
306    175
303    115
6       94
311     76
11      20
Name: sparql_template_id, dtype: int64

In [83]:
# Number of whitespace-separated tokens in each question. The vectorised
# string accessors replace a row-wise apply with a Python lambda and leave a
# missing question as NaN instead of raising.
df['length'] = df['corrected_question'].str.split().str.len()

In [88]:
# Mean of the `length` column for each `sparql_template_id`.
df.groupby(['sparql_template_id'])['length'].mean()

sparql_template_id
1       8.345912
2       8.433155
3      11.893130
5      10.431925
6      10.808511
7      10.435484
8      12.606061
9      13.000000
11     14.900000
15     12.823232
16     15.258126
101     8.164179
102     8.730769
103    12.428571
105    11.707602
106    13.590909
107    11.600000
108    15.285714
111    16.342105
151     8.344444
152     8.765957
301     8.912621
303    14.495652
305    11.553191
306    11.645714
307    11.637681
308    14.766467
311    15.644737
315    14.600000
316    14.000000
401     8.883117
402     8.555556
403    12.941176
405    12.388889
406    12.885714
601     7.000000
605    10.500000
906    14.000000
Name: length, dtype: float64

In [32]:
# Rebind `df` to the CSV export of the dataset; the first row is used as the header (pandas default).
df = pd.read_csv('data/lc-quad/dataset.csv')

In [44]:
# Lower-cased, whitespace-split token counts over the questions of one template,
# divided by the number of those questions (so a value can exceed 1).
template = 1
subset = df.loc[df['sparql_template_id'] == template, 'corrected_question']
subset.str.lower().str.split(expand=True).stack().value_counts().div(len(subset))

the                0.452830
is                 0.371069
which              0.314465
of                 0.308176
who                0.301887
what               0.289308
are                0.232704
in                 0.176101
some               0.138365
was                0.119497
whose              0.113208
by                 0.088050
to                 0.081761
all                0.075472
a                  0.069182
have               0.069182
football           0.056604
name               0.056604
under              0.056604
as                 0.050314
for                0.050314
been               0.044025
has                0.044025
did                0.037736
famous             0.037736
people             0.037736
team?              0.037736
?                  0.037736
city               0.031447
rivers             0.031447
                     ...   
character's        0.006289
cementary?         0.006289
novel              0.006289
spanish            0.006289
shrine             0

In [45]:
# Lower-cased, whitespace-split token counts over the questions of one template,
# divided by the number of those questions (so a value can exceed 1).
template = 3
subset = df.loc[df['sparql_template_id'] == template, 'corrected_question']
subset.str.lower().str.split(expand=True).stack().value_counts().div(len(subset))

the            1.713740
of             1.251908
is             0.416031
what           0.385496
which          0.366412
?              0.347328
who            0.179389
where          0.175573
in             0.164122
are            0.148855
by             0.133588
list           0.118321
awards         0.118321
to             0.110687
did            0.099237
won            0.083969
was            0.076336
successor      0.072519
city           0.072519
does           0.061069
religion       0.049618
country        0.049618
relatives      0.045802
place          0.041985
a              0.041985
producer       0.041985
located        0.041985
artist         0.038168
alma           0.034351
mater          0.030534
                 ...   
study,         0.003817
jon            0.003817
given?         0.003817
know           0.003817
nebo           0.003817
ann            0.003817
wood?          0.003817
cosmos?        0.003817
coached        0.003817
harry          0.003817
letter         0

In [48]:
# Read the whole file and flatten it onto one line by turning every newline into a space.
with open('data/lc-quad/train/input.pos', 'r') as myfile:
    data = ' '.join(myfile.read().split('\n'))

In [52]:
# Tally the whitespace-separated tokens of `data` with nltk's frequency distribution.
freq = nltk.FreqDist(data.split())

In [55]:
# Display the token tally as the cell output.
freq

FreqDist({"''": 9,
          ',': 265,
          '.': 3464,
          ':': 35,
          'CC': 668,
          'CD': 308,
          'DT': 4006,
          'EX': 50,
          'FW': 23,
          'IN': 4573,
          'JJ': 1713,
          'JJR': 7,
          'JJS': 39,
          'LS': 1,
          'MD': 54,
          'NN': 6122,
          'NNP': 7703,
          'NNPS': 223,
          'NNS': 2200,
          'PDT': 64,
          'POS': 206,
          'PRP': 121,
          'PRP$': 80,
          'RB': 331,
          'RBR': 1,
          'RBS': 2,
          'RP': 24,
          'SYM': 1,
          'TO': 264,
          'UH': 4,
          'VB': 982,
          'VBD': 1027,
          'VBG': 130,
          'VBN': 1354,
          'VBP': 1012,
          'VBZ': 2174,
          'WDT': 1341,
          'WP': 1237,
          'WP$': 438,
          'WRB': 622,
          '``': 6})