<a href="https://colab.research.google.com/github/eltnpistolia/vlometadata/blob/main/VLO_metadata_descriptions_NER_V4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing and Importing libraries

In [None]:
# Upgrade pip, then install the third-party packages imported below:
# lxml (HTML/XML parsing), spacy_language_detection (language detector
# component for spaCy) and the DeepL client library.
!pip install --upgrade pip --user
!pip install lxml --user
!pip install spacy_language_detection
!pip install --upgrade deepl

In [None]:
import csv
import getpass
import os
import pprint
import random
import re
from collections import Counter
from io import StringIO
from urllib.parse import urljoin

import deepl
import pandas as pd
import requests
import spacy
from lxml import etree
from spacy import displacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

In [16]:
# Notebook-wide configuration: the data source, the names of the CSV files
# exchanged between stages, and the DeepL client.

# Index page that is scraped for links to the individual .xml records.
index_url = 'https://alpha-vlo.clarin.eu/data/clarin/results/cmdi/COllections_de_COrpus_Oraux_Numeriques_CoCoON_ex_CRDO_/'

# CSV files written and re-read by the stages below.
csv_file = "descriptions_with_lang final.csv"      # every description + detected language
fr_sample100 = "100_sample_fr_descriptions.csv"    # sample of French descriptions
transl_csv = "translated_file.csv"                 # original/translated description pairs
transl_with_NE = "NE.csv"                          # translations + extracted named entities

# The DeepL key is a secret and must never be stored in the notebook: it is
# read from the DEEPL_API_KEY environment variable, falling back to an
# interactive prompt (the typed value is not echoed or saved in the output).
# Any key that was ever committed to a repository should be revoked.
deepl_api_key = os.environ.get("DEEPL_API_KEY") or getpass.getpass("DeepL API key: ")
translator = deepl.Translator(deepl_api_key)

### Create a language detector and add it to the pipeline.

In [9]:
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """spaCy factory registered under the name "language_detector".

    Parameters
    ----------
    nlp, name
        Supplied by spaCy when the component is added with ``nlp.add_pipe``;
        they are not used here.

    Returns
    -------
    LanguageDetector
        A detector created with a fixed seed, so that the detected language
        and score for a given text are the same on every run.
    """
    return LanguageDetector(seed=42)

In [None]:
# Download the spaCy model that is loaded in the next cell.
!python -m spacy download en_core_web_lg

In [None]:
# Load the spaCy pipeline and append the "language_detector" component as its
# last step; the cells below read the detection result from `doc._.language`.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("language_detector", last=True)

### Get the descriptions

In [None]:
# XML namespaces of the records and the XPath selecting the text nodes of
# their OLAC-DcmiTerms description element.
CMD_NAMESPACES = {
    "cmd": "http://www.clarin.eu/cmd/1",
    "cmdp": "http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1288172614026",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
}
DESCRIPTION_XPATH = '//cmd:CMD/cmd:Components/cmdp:OLAC-DcmiTerms/cmdp:description//text()'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server blocks the cell forever


def clean_description(text):
    """Flatten a raw description string onto a single line.

    Applied in this order: a newline followed by two or more spaces is
    removed, a newline followed by tabs becomes one space, NUL characters
    are removed, and every remaining newline becomes one space.
    """
    text = re.sub(r"\n {2,}", '', text)
    text = re.sub("\n\t+", ' ', text)
    text = re.sub("\0+", '', text)
    return re.sub(r"\n", ' ', text)


# Step 1: collect the links ending in ".xml" from the index page.
response = requests.get(index_url, timeout=REQUEST_TIMEOUT)

if response.status_code == 200:
    html = response.content.decode('utf-8')
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser)
    xml_links = tree.xpath('//a[@href and substring(@href, string-length(@href) - 3)=".xml"]/@href')
else:
    print(f"Error: {response.status_code}")
    # Leave the loop below with nothing to do instead of failing with a
    # NameError (or silently reusing links left over from an earlier run).
    xml_links = []

# Step 2: fetch every record, take its first description text node, clean it
# and store (language, confidence score, parsed doc).
descriptions = []
xml_parser = etree.XMLParser()

for xml_link in xml_links:
    # urljoin also resolves absolute hrefs correctly, unlike concatenation.
    xml_url = urljoin(index_url, xml_link)
    xml_response = requests.get(xml_url, timeout=REQUEST_TIMEOUT)
    if xml_response.status_code != 200:
        continue  # unreachable records are skipped

    xml_tree = etree.fromstring(xml_response.content, xml_parser)
    matches = xml_tree.xpath(DESCRIPTION_XPATH, namespaces=CMD_NAMESPACES)
    if not matches:
        # Record without a description: skip it. Only this case is ignored;
        # any other error now surfaces instead of being swallowed.
        continue

    doc = nlp(clean_description(str(matches[0])))
    descriptions.append((doc._.language["language"], doc._.language["score"], doc))

In [None]:
# Show how many descriptions were collected and a short preview, rather than
# dumping every parsed document into the cell output.
len(descriptions), descriptions[:5]

### Write the descriptions in a .csv file

In [None]:
# Write one row per collected description. The language and confidence score
# stored in `descriptions` are written as they are: running the pipeline again
# for every row is slow and may yield values that differ from the stored ones.
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Language', 'Confidence Score', 'Description'])
    for language, confidence_score, description in descriptions:
        # str() turns the parsed doc into its plain text for the CSV cell.
        writer.writerow([language, confidence_score, str(description)])

### Apply NER to the list with the descriptions

In [None]:
# Each entry of `descriptions` is (language, confidence score, doc). The doc
# was already parsed when it was collected, so it is rendered directly instead
# of being sent through the pipeline a second time.
for _language, _confidence_score, doc in descriptions:
    displacy.render(doc, jupyter=True, style='ent')

## Applying MT to a sample of descriptions

### Creating the Sample CSV.

In [46]:
# Keep the descriptions detected as French with a confidence of at least 0.9.
df = pd.read_csv(csv_file, header=0)
df["Confidence Score"] = df["Confidence Score"].astype(float)
df = df[(df["Language"] == "fr") & (df["Confidence Score"] >= 0.9)]

# Draw up to 100 rows with a fixed seed. The cap matters because sample()
# raises a ValueError when asked for more rows than the frame contains.
sample = df.sample(n=min(100, len(df)), random_state=1)

# Written without a header row; the translation step reads it with
# header=None. The file name comes from the configuration cell.
sample.to_csv(fr_sample100, index=False, header=False)

### Translate 10 sampled descriptions and save them to a CSV file

In [59]:
# Column 2 of the headerless sample file holds the description text.
df = pd.read_csv(fr_sample100, header=None, usecols=[2])

# Pick up to 10 rows with a dedicated, seeded generator so the same texts are
# chosen on every run and the global `random` state is left untouched.
sample_rows = random.Random(1).sample(list(df.index), min(10, len(df)))
texts = df.loc[sample_rows, 2].tolist()

# Send all texts to DeepL in a single request; a list input returns a list of
# results in the same order. Nothing is sent when there is nothing to translate.
translations = translator.translate_text(texts, target_lang='EN-GB') if texts else []

# Build the frame once instead of growing it with pd.concat inside a loop.
translated_df = pd.DataFrame(
    {
        'Original Description': texts,
        'Translated Description': [translation.text for translation in translations],
    },
    columns=['Original Description', 'Translated Description'],
)

# The file name comes from the configuration cell.
translated_df.to_csv(transl_csv, index=False)

### DeepL Usage

In [61]:
def check_character_limit():
    """Print the DeepL character usage of the configured API key.

    Creates a client from `deepl_api_key`, requests the account usage and
    prints either a notice that the character limit has been reached or the
    current character usage. Returns None.
    """
    client = deepl.Translator(deepl_api_key)
    usage = client.get_usage()
    # `limit_reached` replaces the deprecated `limit_exceeded` property.
    if usage.character.limit_reached:
        print("Character limit exceeded.")
    else:
        print(f"Character usage: {usage.character}")

In [None]:
# Report the DeepL character usage of the configured key.
check_character_limit()

### Apply NER to .csv file.

In [11]:
def extract_entities(text):
    """Return the named entities found in `text`.

    Parameters
    ----------
    text
        The text to analyse. Values that are not non-blank strings (for
        example the NaN that pandas produces for an empty CSV cell, or None)
        yield an empty list instead of raising inside the pipeline.

    Returns
    -------
    list of (str, str)
        One ``(entity text, entity label)`` pair per entity, in document order.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def visualize_entities(text):
    """Render the named entities of `text` with displaCy in the notebook.

    Parameters
    ----------
    text
        The text to analyse. Non-string values (for example the NaN that
        pandas produces for an empty CSV cell, or None) are skipped.

    Returns
    -------
    Whatever ``displacy.render`` returns for the parsed text, or None when
    `text` is not a string.
    """
    if not isinstance(text, str):
        return None
    doc = nlp(text)
    return displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Load the translated descriptions written by the translation step and
# preview the first rows.
df = pd.read_csv(transl_csv)
df.head()

In [None]:
# Add a column holding the (text, label) entity pairs of each translation.
df["Named Entities"] = df["Translated Description"].apply(extract_entities)
df.head()

In [17]:
# Save the translations together with their named entities, without the index column.
df.to_csv(transl_with_NE, index=False)

In [None]:
# Show the highlighted entities of every translated description in turn.
for text in df["Translated Description"]:
    visualize_entities(text)