# Document structure

In [0]:
!pip install beautifulsoup4
!pip install lxml



In [0]:
import tarfile
!wget https://gebakx.github.io/ihlt/s2/resources/trial.tgz
!ls
with tarfile.open('trial.tgz', "r:gz") as tar:
  tar.extractall()
!ls

--2019-09-26 00:05:38--  https://gebakx.github.io/ihlt/s2/resources/trial.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.111.153, 185.199.108.153, 185.199.110.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2003 (2.0K) [application/octet-stream]
Saving to: ‘trial.tgz’


2019-09-26 00:05:38 (579 MB/s) - ‘trial.tgz’ saved [2003/2003]

sample_data  trial.tgz
sample_data  trial  trial.tgz


## Beatiful Soup

In [0]:
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/'
with urllib.request.urlopen(url) as response:
   dt = response.read().decode('utf8')

soup = BeautifulSoup(dt, 'xml')
print(soup.get_text()[:50])




Beautiful Soup Documentation  Beautiful Soup 4.


## NLTK tokenization

In [0]:
import nltk
nltk.download() 

In [0]:
import nltk
# sentence splitting
source = 'Men want children. They get relaxed with kids.'
sentences = nltk.sent_tokenize(source)
sentences

['Men want children.', 'They get relaxed with kids.']

In [0]:
nltk.word_tokenize(source)

['Men', 'want', 'children', '.', 'They', 'get', 'relaxed', 'with', 'kids', '.']

## Distance metrics

In [0]:
from nltk.metrics import jaccard_distance, 
jaccard_distance(set(['The','eats','fish','.']),
                 set(['The','eats','blue','fish','.']))

0.2

In [0]:
!ls trial
!head -n 10 trial/00-readme.txt 
!wc -l trial/00-readme.txt 

00-readme.txt  STS.gs.txt  STS.input.txt  STS.ouput.txt

			 SEMEVAL-2012 TASK 17

				 STS
		     Semantic Textual Similarity:
 
		     A Unified Framework for the
	      Evaluation of Modular Semantic Components


153 trial/00-readme.txt


In [0]:
text = !cat trial/00-readme.txt 
text

In [0]:
!cat trial/STS.input.txt
!wc -l trial/STS.input.txt

!cat trial/STS.gs.txt
!wc -l trial/STS.gs.txt

!cat trial/STS.ouput.txt
!wc -l trial/STS.ouput.txt

id1	The bird is bathing in the sink.	Birdie is washing itself in the water basin.
id2	In May 2010, the troops attempted to invade Kabul.	The US army invaded Kabul on May 7th last year, 2010.
id3	John said he is considered a witness but not a suspect.	"He is not a suspect anymore." John said.
id4	They flew out of the nest in groups.	They flew into the nest together.
id5	The woman is playing the violin.	The young lady enjoys listening to the guitar.
id6	John went horse back riding at dawn with a whole group of friends.	Sunrise at dawn is a magnificent view to take in if you wake up early enough for it.
6 trial/STS.input.txt
id1	0
id2	1
id3	2
id4	3
id5	4
id6	5
6 trial/STS.gs.txt
id1	1	99
id2	0	75
id3	4	13
id4	2	91
id5	3	48
id6	3	38
6 trial/STS.ouput.txt


## Lab Session 2

## Input data

In [0]:
import pandas as pd

input_data = pd.read_csv('trial/STS.input.txt', sep='\t', header=None)
input_data = input_data.astype(str)                
input_data

Unnamed: 0,0,1,2
0,id1,The bird is bathing in the sink.,Birdie is washing itself in the water basin.
1,id2,"In May 2010, the troops attempted to invade Ka...",The US army invaded Kabul on May 7th last year...
2,id3,John said he is considered a witness but not a...,He is not a suspect anymore. John said.
3,id4,They flew out of the nest in groups.,They flew into the nest together.
4,id5,The woman is playing the violin.,The young lady enjoys listening to the guitar.
5,id6,John went horse back riding at dawn with a who...,Sunrise at dawn is a magnificent view to take ...


In [0]:
# import pandas as pd

input_data = []
with open('trial/STS.input.txt') as f:
  # input_data = 
  input_data = [row.rstrip().split('\t') for row in f.readlines()]

input_data

[['id1',
  'The bird is bathing in the sink.',
  'Birdie is washing itself in the water basin.'],
 ['id2',
  'In May 2010, the troops attempted to invade Kabul.',
  'The US army invaded Kabul on May 7th last year, 2010.'],
 ['id3',
  'John said he is considered a witness but not a suspect.',
  '"He is not a suspect anymore." John said.'],
 ['id4',
  'They flew out of the nest in groups.',
  'They flew into the nest together.'],
 ['id5',
  'The woman is playing the violin.',
  'The young lady enjoys listening to the guitar.'],
 ['id6',
  'John went horse back riding at dawn with a whole group of friends.',
  'Sunrise at dawn is a magnificent view to take in if you wake up early enough for it.']]

## 2. Jacard distance

In [0]:
import numpy as np
from nltk.metrics import jaccard_distance

result = []
for in_data in input_data:
  result.append(jaccard_distance(set(nltk.word_tokenize((in_data[1]))),
                             set(nltk.word_tokenize((in_data[2])))))

result = 1 - np.array(result)
result

array([0.30769231, 0.26315789, 0.46666667, 0.45454545, 0.23076923,
       0.13793103])

## 3. Pearson correlation

In [0]:
from scipy.stats import pearsonr

gs = pd.read_csv('trial/STS.gs.txt', sep='\t', header=None)
refs = gs[1].values
print(list(reversed(refs)))
tsts = result
pearsonr(list(reversed(refs)), tsts)

[5, 4, 3, 2, 1, 0]


(0.3962389776119232, 0.43674734878224375)

## Discussions

Web: https://www.cs.york.ac.uk/semeval-2012/task6/index.html

Test with the training set https://www.cs.york.ac.uk/semeval-2012/task6/index.php%3Fid=data.html