# Morphology


## Requirements

Run this cell only if the NLTK data and the trial data archive are not already available on the current machine.

In [0]:
import tarfile
import nltk
nltk.download() #1. d, 2. book, 3. q

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')



!wget https://gebakx.github.io/ihlt/s2/resources/trial.tgz

with tarfile.open('trial.tgz', "r:gz") as tar:
  tar.extractall()



In [3]:
# List the working directory to check that the archive was downloaded and extracted.
!ls

sample_data  trial  trial.tgz


## Part of Speech


In [4]:
from nltk import pos_tag

# Small word list used to illustrate POS tagging; `pairs` is reused by the
# lemmatizer demo further down.
words = ['Women','want','children', 'men', 'cars']
# pos_tag yields one (word, tag) tuple per input word.
pairs = pos_tag(words)
pairs

[('Women', 'NNP'),
 ('want', 'VBP'),
 ('children', 'NNS'),
 ('men', 'NNS'),
 ('cars', 'NNS')]

## WordNet lemmatizer

In [0]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize(p):
    """Return the lemma of a single ``(word, tag)`` pair.

    Words whose tag starts with ``'N'`` (noun) or ``'V'`` (verb) are
    lower-cased and passed to the WordNet lemmatizer, using the first letter
    of the tag as the part of speech. Every other word is returned unchanged,
    keeping its original casing.

    An empty tag is treated as "neither noun nor verb" instead of raising
    ``IndexError``.
    """
    word, tag = p[0], p[1]
    # startswith() with a tuple is safe on an empty tag, unlike tag[0].
    if tag.startswith(('N', 'V')):
        return wnl.lemmatize(word.lower(), pos=tag[0].lower())
    return word

In [19]:
[lemmatize(pair) for pair in pairs]

['woman', 'want', 'child', 'men', 'car']

# Lab Session 3

## Input data

In [6]:
import pandas as pd

# Tab-separated file without a header row: pair id, first sentence, second
# sentence. Every column is kept as a string.
input_data = pd.read_csv(
    'trial/STS.input.txt',
    sep='\t',
    names=['id', 'sent1', 'sent2'],
).astype(str)
input_data

Unnamed: 0,id,sent1,sent2
0,id1,The bird is bathing in the sink.,Birdie is washing itself in the water basin.
1,id2,"In May 2010, the troops attempted to invade Ka...",The US army invaded Kabul on May 7th last year...
2,id3,John said he is considered a witness but not a...,He is not a suspect anymore. John said.
3,id4,They flew out of the nest in groups.,They flew into the nest together.
4,id5,The woman is playing the violin.,The young lady enjoys listening to the guitar.
5,id6,John went horse back riding at dawn with a who...,Sunrise at dawn is a magnificent view to take ...


## Tagging and lemmatization

In [7]:
def tagging(df, col):
  """Print the POS-tagged tokens of the first sentence in column ``col`` of ``df``."""
  first_sentence = df[col].values[0]
  print(nltk.pos_tag(nltk.word_tokenize(first_sentence)))

# Show the tagger output for the first sentence of the 'sent1' column.
tagging(input_data, 'sent1')
# sentence_1 = input_data.head(1)[1].values[0]
# sentence_2 = input_data.head(1)[2].values[0]
# print(sentence_1)
# words = nltk.word_tokenize(sentence_1)
# pairs = pos_tag(words)
# pairs

[('The', 'DT'), ('bird', 'NN'), ('is', 'VBZ'), ('bathing', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('sink', 'NN'), ('.', '.')]


In [0]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize(pairs):
  """Lemmatize a POS-tagged sentence.

  ``pairs`` is an iterable of ``(word, tag)`` tuples. Words whose tag starts
  with ``'N'`` or ``'V'`` are lower-cased and lemmatized with WordNet, using
  the first letter of the tag as the part of speech; all other words are
  copied through unchanged (original casing kept). Returns a list with one
  entry per input pair.

  An empty tag is treated as "neither noun nor verb" instead of raising
  ``IndexError``.

  Note: this shares its name with the single-pair ``lemmatize`` defined in the
  Morphology section and shadows it once this cell has run.
  """
  return [
      wnl.lemmatize(pair[0].lower(), pos=pair[1][0].lower())
      # startswith() with a tuple is safe on an empty tag, unlike pair[1][0].
      if pair[1].startswith(('N', 'V'))
      else pair[0]
      for pair in pairs
  ]

In [9]:
# For each sentence column: tokenize, POS-tag, then lemmatize, storing the
# resulting token list in a new '<column>_processed' column.
for col in ['sent1', 'sent2']:
  input_data[col+'_processed'] = (
      input_data[col]
      .apply(nltk.word_tokenize)
      .apply(nltk.pos_tag)
      .apply(lemmatize)
  )

input_data

Unnamed: 0,id,sent1,sent2,sent1_processed,sent2_processed
0,id1,The bird is bathing in the sink.,Birdie is washing itself in the water basin.,"[The, bird, be, bath, in, the, sink, .]","[birdie, be, wash, itself, in, the, water, bas..."
1,id2,"In May 2010, the troops attempted to invade Ka...",The US army invaded Kabul on May 7th last year...,"[In, may, 2010, ,, the, troop, attempt, to, in...","[The, u, army, invade, kabul, on, may, 7th, la..."
2,id3,John said he is considered a witness but not a...,He is not a suspect anymore. John said.,"[john, say, he, be, consider, a, witness, but,...","[He, be, not, a, suspect, anymore, ., john, sa..."
3,id4,They flew out of the nest in groups.,They flew into the nest together.,"[They, fly, out, of, the, nest, in, group, .]","[They, fly, into, the, nest, together, .]"
4,id5,The woman is playing the violin.,The young lady enjoys listening to the guitar.,"[The, woman, be, play, the, violin, .]","[The, young, lady, enjoy, listen, to, the, gui..."
5,id6,John went horse back riding at dawn with a who...,Sunrise at dawn is a magnificent view to take ...,"[john, go, horse, back, rid, at, dawn, with, a...","[sunrise, at, dawn, be, a, magnificent, view, ..."


## Calculating the Jaccard distance

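It only measures how close or far apart the two sentences are in terms of the tokens they share, so it is not a very robust way to decide whether two sentences are equivalent. The code below converts the distance into a similarity (1 − distance).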

It is a weak model.

In [10]:
import numpy as np
import nltk

from nltk.metrics import jaccard_distance

# Jaccard distance between the token sets of each sentence pair.
result = np.array([
    jaccard_distance(set(tokens_1), set(tokens_2))
    for tokens_1, tokens_2 in zip(input_data['sent1_processed'],
                                  input_data['sent2_processed'])
])

# Turn the distance into a similarity: 1 means identical token sets.
result = 1 - result
result

array([0.30769231, 0.33333333, 0.53846154, 0.45454545, 0.23076923,
       0.13793103])

## Calculating the Pearson correlation

First we gather the gold standard results (the reference) and compare them with our previous results. The gold standard appears to be stored in reverse order, so it is reversed before the comparison. Furthermore, the previous results are multiplied by 5 to put them on the same scale (although the Pearson correlation is invariant to scaling).

For this example the score is 0.455, which is a rather low score (the maximum is 1.0), but it is better than using only the raw words. It means that both lists tend to move in the same direction when plotted, but they are not strongly correlated.



In [11]:
import pandas as pd

from scipy.stats import pearsonr

gs = pd.read_csv('trial/STS.gs.txt', sep='\t', header=None)
# Column 1 holds the scores; they are reversed before being compared with
# our results.
refs = list(gs[1].values[::-1])
# `result` holds similarities (1 - Jaccard distance); rescale them to the
# 0-5 range of the gold standard. The printed label still says "distance".
tsts = 5 * result
print(f'Gold standard: {refs}')
print(f'Jaccard distance: {tsts}')
print(f'Pearson correlation: {pearsonr(refs, tsts)[0]}')


Gold standard: [5, 4, 3, 2, 1, 0]
Jaccard distance: [1.53846154 1.66666667 2.69230769 2.27272727 1.15384615 0.68965517]
Pearson correlation: 0.45509668497522504


## Optional: Evaluate against training data set

### Download and read training data set

In [0]:
# Location of the SemEval-2012 Task 6 training archive.
url_train = 'https://www.cs.york.ac.uk/semeval-2012/task6/data/uploads/datasets/train.tgz'
# $url_train interpolates the Python variable into the shell command.
!wget $url_train

# Unpack the archive into the working directory; relies on the `tarfile`
# import from the Requirements cell.
with tarfile.open('train.tgz', "r:gz") as tar:
  tar.extractall()
# List the working directory to confirm the extraction.
!ls

In [20]:
import pandas as pd

# Tab-separated file without a header row: one sentence pair per line.
# NOTE(review): with the default quoting, read_csv treats '"' as a quote
# character, so lines containing unbalanced double quotes can be merged into
# a single row. If the row count is lower than the number of lines in the
# file, passing quoting=csv.QUOTE_NONE should fix it - confirm against the
# length of the gold standard.
train = pd.read_csv(
    'train/STS.input.MSRpar.txt',
    sep='\t',
    names=['sent1', 'sent2'],
).astype(str)
train.head(5)

Unnamed: 0,sent1,sent2
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....


## Pre-process the input data

In [21]:
# Same pipeline as for the trial data: tokenize, POS-tag and lemmatize each
# sentence column into a new '<column>_processed' column.
for col in ['sent1', 'sent2']:
  train[col+'_processed'] = (
      train[col]
      .apply(nltk.word_tokenize)
      .apply(nltk.pos_tag)
      .apply(lemmatize)
  )

train.head(5)

Unnamed: 0,sent1,sent2,sent1_processed,sent2_processed
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,"[But, other, source, close, to, the, sale, say...","[But, other, source, close, to, the, sale, say..."
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,"[micron, have, declare, its, first, quarterly,...","[micron, 's, number, also, mark, the, first, q..."
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...","[The, fine, be, part, of, failed, republican, ...","[perry, say, he, back, the, senate, 's, effort..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent...","[The, American, anglican, council, ,, which, r...","[The, American, anglican, council, ,, which, r..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,"[The, tech-loaded, nasdaq, composite, rise, 20...","[The, technology-laced, nasdaq, composite, ind..."


## Jaccard distance

In [23]:
# Jaccard distance between the token sets of each training sentence pair.
result = np.array([
    jaccard_distance(set(tokens_1), set(tokens_2))
    for tokens_1, tokens_2 in zip(train['sent1_processed'],
                                  train['sent2_processed'])
])

# Turn the distance into a similarity: 1 means identical token sets.
result = 1 - result
result[:10]

array([0.5483871 , 0.42105263, 0.33333333, 0.63333333, 0.23333333,
       0.22580645, 0.38888889, 0.57142857, 0.47826087, 0.42307692])

## Pearson correlation

In [61]:
gs = pd.read_csv('train/STS.gs.MSRpar.txt', names=['gs'])
gs['gs'] = gs['gs'].astype(float)
# Align the gold standard with the number of scored pairs instead of using a
# hard-coded row count, and report when the two lengths disagree: a mismatch
# means the rows of the two files may not correspond one-to-one.
if len(gs) != len(result):
  print(f'Warning: gold standard has {len(gs)} rows but {len(result)} '
        f'sentence pairs were scored; truncating, scores may be misaligned.')
refs = list(gs['gs'][:len(result)].values)
print(f'Gold standard: {refs[:10]}')
# `result` holds similarities (1 - Jaccard distance); rescale them to the
# 0-5 range of the gold standard.
tsts = result * 5
print(f'Jaccard distance: {tsts[:10]}')
print(f'Pearson correlation: {pearsonr(refs, tsts)[0]}')

Gold standard: [4.0, 3.75, 2.8, 3.4, 2.4, 1.3330000000000002, 4.6, 3.8, 4.2, 2.6]
Jaccard distance: [2.74193548 2.10526316 1.66666667 3.16666667 1.16666667 1.12903226
 1.94444444 2.85714286 2.39130435 2.11538462]
Pearson correlation: 0.18680517470005817


In [16]:
# List the files shipped in the extracted training archive.
!ls train

00-readme.txt	   STS.gs.MSRvid.txt	   STS.input.MSRvid.txt
correlation.pl	   STS.gs.SMTeuroparl.txt  STS.input.SMTeuroparl.txt
STS.gs.MSRpar.txt  STS.input.MSRpar.txt    STS.output.MSRpar.txt
