<a href="https://colab.research.google.com/github/firststef/APLN-WSD/blob/master/WSD_Romanian_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy sklearn rowordnet
!python -m spacy download ro_core_news_lg
!pip install bs4
!pip install sparknlp pyspark

In [None]:
import spacy
from sklearn.feature_extraction import DictVectorizer
import rowordnet as rwn
import sparknlp

nlp = spacy.load("ro_core_news_lg")
vec = DictVectorizer()
wn = rwn.RoWordNet()
spark = sparknlp.start()

In [None]:
# context - word embeddings
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# lemmatize
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
tokenizer = Tokenizer() \
.setInputCols("document") \
.setOutputCol("token")
lemmatizer = LemmatizerModel.pretrained("lemma", "ro") \
.setInputCols(["token"]) \
.setOutputCol("lemma")
nlp_pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text")))

# get_word_embeddings
documentAssembler2 = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer2 = Tokenizer() \
.setInputCols("document") \
.setOutputCol("token")
embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","ro") \
.setInputCols(["document", "token"]) \
.setOutputCol("embeddings")
pipeline2 = Pipeline(stages=[documentAssembler, tokenizer, embeddings])

In [None]:
def lemmatize(sentence):
  results = light_pipeline.annotate(sentence)
  return results['lemma']

def get_word_embeddings(sentence, word):
  nlp_sentence = nlp(sentence)
  try:
    word_lemma = lemmatize(word)[0]
  except Exception as e:
    print(word, sentence)
    raise e
  i = -1
  for idx, l in enumerate(lemmatize(sentence)):
    if l == word_lemma:
      i = idx
      break
  if i == -1:
    print(word_lemma, lemmatize(sentence))
    raise IndexError
  words = [w.text for w in nlp_sentence]
  start = 0 if i<=3 else i-3
  end = len(nlp_sentence)-1 if i>=len(nlp_sentence)-1 else i+4
  context = words[start:end]
  context = ' '.join(context)

  data = spark.createDataFrame([[sentence]]).toDF("text")
  result = pipeline2.fit(data).transform(data)
  # result.show()
  result = result.collect()[0].asDict()['embeddings']
  return [x.asDict()['embeddings'] for x in result]

#c = get_word_embeddings('In plus , dislocarea in Kosovo a circa 4.000 de militari americani ca parte a unei misiuni de mentinere a pacii ar costa anual 1,5 - 2 miliarde de dolari , de doua ori mai mult decit operatiunile similare americane din Bosnia .  Copyright 1996 - 2003 Evenimentul Zilei Online .  DRAMA REFUGIATILOR ALBANEZI DIN KOSOVO CAPATA accente DRAMATICE DRAMA REFUGIATILOR ALBANEZI DIN KOSOVO CAPATA <head>ACCENTE</head> DRAMATICE Joi , 01 Aprilie 1999 � Expulzarea etnicilor albanezi din Kosovo de catre sirbi continua in paralel cu bombardamentele NATO In Kosovo , trupele sirbe au atacat marti sate albaneze situate in nord - vestul provinciei , trupele UCK ripostind cu deosebita violenta . La Pristina s - a facut auzita o serie de explozii , trei proiectile ale NATO cazind la 500 de metri de manastirea Gracanita , situata in sud - estul orasului .  Alte trei explozii au avut loc in satele Novi Badovac , Susica si Livadje .  ', 'accent')
#print(c)

In [None]:
# syntactic - features extraction
def extract_hypernymes(word):
    hyper = []
    synset_ids = wn.synsets(literal=word.lemma_, pos=rwn.synset.Synset.Pos.NOUN)
    if len(synset_ids) > 0:
        c = wn.synset_to_hypernym_root(synset_ids[0])
        for c1 in c:
            if len(wn(c1).literals):
              hyper.append(wn(c1).literals[0])
    return hyper


def extract_feat_for_word(word, rel):
    hyper = []
    if word.pos_ == "NOUN":
        hyper = extract_hypernymes(word)

    return {
        "text": word.text,
        "pos": word.pos_,
        "morpho": word.morph,
        "lemma": word.lemma_,
        "dep": word.dep_,
        "hyper": hyper,
        "position_rel": rel
    }

def extract_syntactic_features(raw_text):
  doc = nlp(raw_text)
  ret = []
  for word in doc:
      vect = []
      if word.head != word:
          vect.append(extract_feat_for_word(word.head, "head"))
          for siblings in word.head.children:
              vect.append(extract_feat_for_word(siblings, "0" if word != siblings else "main"))
      else:
          vect.append(extract_feat_for_word(word, "main"))
      for chil in word.children:
          vect.append(extract_feat_for_word(chil, "children"))
      ret.append(vec.fit_transform(vect).toarray())
  return ret
print(extract_syntactic_features("Ana are mere"))

In [None]:
# semantic - NER extraction
import spacy

vec = DictVectorizer()
nlp2 = spacy.load("ro_core_news_lg")
nlp2.add_pipe("merge_entities")

def replace_ner(text):
    doc = nlp2(text)
    out = ""
    for tok in doc:
        text = tok.text
        if tok.ent_type_:
            text = tok.ent_type_
        out += text + tok.whitespace_
    return out

In [45]:
from bs4 import BeautifulSoup

def parse_xml(path):
  with open(path, encoding='windows-1252') as fp:
    soup = BeautifulSoup(fp, 'html.parser')
  data = soup.find_all('instance')
  new_data = []
  for inst in data:
    sentence = inst.find('context').decode_contents().strip('\n')
    a, *b = sentence.split('<head>')
    b, *c = ''.join(b).split('</head>')
    sentence = a + b.lower() + ''.join(c)
    new_data.append((sentence, inst.attrs['id'], b.lower()))
  return new_data

train_se3 = parse_xml("/content/drive/MyDrive/Colab Notebooks/wsd/RomanianLS.train")
test_se3 = parse_xml("/content/drive/MyDrive/Colab Notebooks/wsd/RomanianLS.test")

In [None]:
import numpy as np   

def prepare_data(data):
  new_data = []
  for x in data:
    sense = x[1].split('.')[0]
    sentence = x[0]
    word = x[2]
    #sentence = replace_ner(x[0]) # semantic
    context = get_word_embeddings(sentence, word)
    syntactic = extract_syntactic_features(sentence)
    new_data.append(([*context, *syntactic], x[1]))
    if len(new_data) >= 25:
      return new_data
  return new_data

train_data = prepare_data(train_se3)
print(train_data[0])

In [41]:
def to_nltk(train_data):
  new_train_data = []
  for t in train_data:
    obj = {}
    f, label = t
    count = 0
    for x in f:
      key = str(count)
      obj[key] = type(f)
      count += 1

    new_train_data.append((obj, label))
  return new_train_data

new_train_data = to_nltk(train_data)
print(new_train_data)

[({'0': <class 'list'>, '1': <class 'list'>, '2': <class 'list'>, '3': <class 'list'>, '4': <class 'list'>, '5': <class 'list'>, '6': <class 'list'>, '7': <class 'list'>, '8': <class 'list'>, '9': <class 'list'>, '10': <class 'list'>, '11': <class 'list'>, '12': <class 'list'>, '13': <class 'list'>, '14': <class 'list'>, '15': <class 'list'>, '16': <class 'list'>, '17': <class 'list'>, '18': <class 'list'>, '19': <class 'list'>, '20': <class 'list'>, '21': <class 'list'>, '22': <class 'list'>, '23': <class 'list'>, '24': <class 'list'>, '25': <class 'list'>, '26': <class 'list'>, '27': <class 'list'>, '28': <class 'list'>, '29': <class 'list'>, '30': <class 'list'>, '31': <class 'list'>, '32': <class 'list'>, '33': <class 'list'>, '34': <class 'list'>, '35': <class 'list'>, '36': <class 'list'>, '37': <class 'list'>, '38': <class 'list'>, '39': <class 'list'>, '40': <class 'list'>, '41': <class 'list'>, '42': <class 'list'>, '43': <class 'list'>, '44': <class 'list'>, '45': <class 'lis

In [43]:
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.classify import MaxentClassifier
import random

from nltk.corpus import senseval

random.shuffle(new_train_data)

algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(new_train_data, algorithm, max_iter=100)

classifier.show_most_informative_features(10)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.21888        0.040
             2          -3.09151        0.160
             3          -2.97518        0.920
             4          -2.87433        0.920
             5          -2.78659        0.920
             6          -2.70923        0.920
             7          -2.64017        0.920
             8          -2.57784        0.920
             9          -2.52108        0.920
            10          -2.46899        0.920
            11          -2.42086        0.920
            12          -2.37613        0.920
            13          -2.33436        0.920
            14          -2.29519        0.920
            15          -2.25830        0.920
            16          -2.22345        0.920
            17          -2.19043        0.920
            18          -2.15906        0.920
            19          -2.12919        0.920
 

In [48]:
test_data = prepare_data(test_se3)
new_test_data = to_nltk(test_data)

In [51]:
no_labels = [x[0] for x in new_test_data]
print(no_labels)
classifier.classify_many(new_test_data)

[{'0': <class 'list'>, '1': <class 'list'>, '2': <class 'list'>, '3': <class 'list'>, '4': <class 'list'>, '5': <class 'list'>, '6': <class 'list'>, '7': <class 'list'>, '8': <class 'list'>, '9': <class 'list'>, '10': <class 'list'>, '11': <class 'list'>, '12': <class 'list'>, '13': <class 'list'>, '14': <class 'list'>, '15': <class 'list'>, '16': <class 'list'>, '17': <class 'list'>, '18': <class 'list'>, '19': <class 'list'>, '20': <class 'list'>, '21': <class 'list'>, '22': <class 'list'>, '23': <class 'list'>, '24': <class 'list'>, '25': <class 'list'>, '26': <class 'list'>, '27': <class 'list'>, '28': <class 'list'>, '29': <class 'list'>, '30': <class 'list'>, '31': <class 'list'>, '32': <class 'list'>, '33': <class 'list'>, '34': <class 'list'>, '35': <class 'list'>, '36': <class 'list'>, '37': <class 'list'>, '38': <class 'list'>, '39': <class 'list'>, '40': <class 'list'>, '41': <class 'list'>, '42': <class 'list'>, '43': <class 'list'>, '44': <class 'list'>, '45': <class 'list

AttributeError: ignored