In [141]:
import gensim
import json
import pandas
import numpy as np
import sklearn
import sklearn.neighbors
from sklearn import cluster, covariance, manifold
import lxml
import lxml.objectify
from lxml import etree
import nltk
import nltk.corpus
from nltk.corpus import stopwords
STOPS = set(stopwords.words('english'))

In [None]:
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [142]:
claims_file = 'data/claim_conclusion.json'
with open(claims_file) as fh:
    claims_df = pandas.read_json(fh)

In [143]:
claims_df.tail()

Unnamed: 0,claim,conclusion,url
669,Trade union sign up for those aged between 25 ...,This is not quite correct. 13% of the UK popul...,economy/union-membership-among-young-people
670,£1 in every £10 that goes to the NHS is to pay...,This isn’t correct according to the latest cos...,health/what-nhs-paying-private-finance-initiat...
671,The Treasury loses £40 billion each year due t...,"That’s a long way off from HMRC’s estimates, ...",economy/does-treasury-lose-40-billion-each-yea...
672,A man who assaulted his wife was spared jail b...,"We don’t know for sure, but it seems unlikely ...",law/domestic-violence-vulnerable-mustafa-bashir
673,Putting a doctor through medical school “costs...,"That’s wrong. About £64,300 of that comes in s...",health/cost-training-doctor


In [144]:
def claim2vec(text):
    word_vecs = []
    for word in text.split():
        if word not in STOPS and word in model:
            word_vecs.append(model[word])
    
    if word_vecs:
        result = np.mean(word_vecs, axis=0)
        assert len(result) == 300
        return result
    else:
        return np.zeros(300, dtype=np.float32)

In [145]:
claims_df['vec'] = claims_df['claim'].apply(claim2vec)

In [146]:
claims_df.tail()

Unnamed: 0,claim,conclusion,url,vec
669,Trade union sign up for those aged between 25 ...,This is not quite correct. 13% of the UK popul...,economy/union-membership-among-young-people,"[-0.0107422, 0.00634766, -0.0569946, -0.090368..."
670,£1 in every £10 that goes to the NHS is to pay...,This isn’t correct according to the latest cos...,health/what-nhs-paying-private-finance-initiat...,"[-0.00469971, 0.0252609, 0.0643616, 0.0410461,..."
671,The Treasury loses £40 billion each year due t...,"That’s a long way off from HMRC’s estimates, ...",economy/does-treasury-lose-40-billion-each-yea...,"[0.141553, 0.127283, -0.0599688, 0.110437, -0...."
672,A man who assaulted his wife was spared jail b...,"We don’t know for sure, but it seems unlikely ...",law/domestic-violence-vulnerable-mustafa-bashir,"[0.000762939, 0.0124207, 0.0871887, -0.0160217..."
673,Putting a doctor through medical school “costs...,"That’s wrong. About £64,300 of that comes in s...",health/cost-training-doctor,"[-0.0274658, 0.0705261, 0.0146942, 0.0796051, ..."


In [147]:
X = np.array(claims_df['vec'].tolist())

In [148]:
for example in [200, 300, 400]:
    dists, neighbours = nn.kneighbors(X[example], n_neighbors=673)
    dists = dists[0]
    neighbours = neighbours[0]
    print('*** Original ***:\n', claims_df.iloc[neighbours[0], 0])
    print('\n*** Top 5 ***:\n', claims_df.iloc[neighbours[1:], 0].head(5).values)
    print('\n*** Bottom 5 ***:\n', claims_df.iloc[neighbours, 0].tail(5).values)

*** Original ***:
 The EU referendum outcome is "on a knife edge".

*** Top 5 ***:
 ['The EU referendum was “advisory” only.'
 'The government’s EU leaflet distributed before the referendum said that leaving the EU meant leaving the single market.'
 'Campaigners on both sides of the EU referendum made false claims.'
 'The Vote Leave campaign includes three completely untrue claims on its EU referendum leaflet: Turkey becoming a member, an EU army and the £350 million a week cost of membership.'
 'If we vote ‘remain’... The EU will continue to control… trade']

*** Bottom 5 ***:
 ['Free schools improve neighbouring schools. '
 'Low-paid workers are trapped in poverty.'
 '44.2% of Southern Mainline and Coast trains were not at terminus on time.'
 '95% of new workers are foreigners.'
 '58.8% of graduates are in non-graduate jobs. ']




----

In [None]:
xml_file = 'data/hansard/src/debates2017-01-09a.xml'

import os
base = 'data/hansard/src/'
for fname in os.listdir(base):
    xml_file = os.path.join(base, fname)
    
    try:
        with open(xml_file) as fh:
            xml = etree.parse(fh)
    except UnicodeDecodeError:
        continue

    for i, p in enumerate(xml.findall('.//p')):
        if not p.text or len(p.text) < 140:
            continue
        
        for sent in p.text.split('.'):
            vec = claim2vec(sent)
            dists, neighbours = nn.kneighbors([vec], n_neighbors=5)
            dists = dists[0]
            dists = dists[dists <= 0.15]
            if len(dists) == 0:
                continue
            neighbours = neighbours[0][:len(dists)].tolist()
            print('*** Politician said ***:\n', sent)
            print('\n*** Relevant claims ***:\n', claims_df.iloc[neighbours, 0].head(5).values)

            print('-'*50)

*** Politician said ***:
  The TUC estimates suggest that tax avoidance costs us £25 billion a year, while tax evasion costs us £70 billion a year

*** Relevant claims ***:
 ['Tax evasion and tax avoidance costs the government £34 billion a year.']
--------------------------------------------------
*** Politician said ***:
  The TUC estimates suggest that tax avoidance costs us £25 billion a year, while tax evasion costs us £70 billion a year

*** Relevant claims ***:
 ['Tax evasion and tax avoidance costs the government £34 billion a year.']
--------------------------------------------------
*** Politician said ***:
  In 2009, Greece's budget deficit was running at 13

*** Relevant claims ***:
 ['In 2010 the budget deficit was running at over 10% of the UK’s GDP.']
--------------------------------------------------
*** Politician said ***:
  At the same time the gulf between rich and poor has got wider, with the attainment gap between students in fee-paying schools and those in state 