<a href="https://colab.research.google.com/github/eltnpistolia/vlometadata/blob/main/index_records_extractorV4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing and Importing libraries

In [None]:
!pip install --upgrade pip --user
!pip install lxml --user
!pip install spacy_language_detection
!pip install --upgrade deepl

In [21]:
import pandas as pd
import pprint
import re
import random
import csv
import deepl
from collections import Counter

import requests
from io import StringIO
from lxml import etree
import spacy
from spacy import displacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

In [60]:
#write all the variables
"""
Here we define ...
"""
index_url = 'https://alpha-vlo.clarin.eu/data/clarin/results/cmdi/COllections_de_COrpus_Oraux_Numeriques_CoCoON_ex_CRDO_/'
csv_file = "descriptions_with_lang final.csv"
fr_sample100 = "100_sample_fr_descriptions.csv"
transl_csv = "translated_file.csv"
deepl_api_key = "b557e936-e866-56e2-693d-4204185cb69b:fx"
translator = deepl.Translator(deepl_api_key)

### Create a language detector and add it the pipeline.

In [23]:
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
   return LanguageDetector()

ValueError: ignored

In [24]:
!python -m spacy download en_core_web_lg

2023-03-16 10:39:52.439213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 10:39:52.439337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-16 10:39:54.320902: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download

In [25]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("language_detector", last=True)

<spacy_language_detection.spacy_language_detector.LanguageDetector at 0x7fa5df316fd0>

### Get the descriptions

In [7]:


response = requests.get(index_url)

if response.status_code == 200:
    html = response.content.decode('utf-8')
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser)

    xml_links = tree.xpath('//a[@href and substring(@href, string-length(@href) - 3)=".xml"]/@href')

else:
    print(f"Error: {response.status_code}")

descriptions = []

for xml_link in xml_links:
    xml_url = index_url + xml_link
    xml_response = requests.get(xml_url)
    if xml_response.status_code == 200:
        xml_content = xml_response.content
        xml_parser = etree.XMLParser()
        xml_tree = etree.fromstring(xml_content, xml_parser)
        try:
            description = xml_tree.xpath('//cmd:CMD/cmd:Components/cmdp:OLAC-DcmiTerms/cmdp:description//text()', 
                                          namespaces={"cmd": "http://www.clarin.eu/cmd/1",
                                                      "cmdp": "http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1288172614026",
                                                      "xsi": "http://www.w3.org/2001/XMLSchema-instance"})[0]
            description = str(description)
            description = re.sub(r"\n {2,}", '', description)
            description = re.sub("\n\t+", ' ', description)
            description = re.sub("\0+", '', description)
            description = re.sub(r"\n", ' ', description)
            doc = nlp(description)
            language = doc._.language["language"]
            confidence_score = doc._.language["score"]
            descriptions.append((language, confidence_score, doc))
        except:
            IndexError
            continue

KeyboardInterrupt: ignored

In [None]:
descriptions

### Write the descriptions in a .csv file

In [None]:
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Language', 'Confidence Score', 'Description'])
    for language, confidence_score, description in descriptions:
        doc = nlp(description)
        language = doc._.language["language"]
        confidence_score = doc._.language["score"]
        writer.writerow([language, confidence_score, description])

### Apply NER to the list with the descriptions

In [None]:
for description in descriptions:
    doc = nlp(description[2])
    displacy.render(doc, jupyter=True, style='ent')

### Apply NER to the .csv file.

In [None]:
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def visualize_entities(text):
    doc = nlp(text)
    return displacy.render(doc, style="ent", jupyter=True)

In [None]:
df = pd.read_csv(csv_file)
df.head()

In [None]:
df["Named Entities"] = df["Description"].apply(lambda x: extract_entities(x))
df.head()

In [None]:
df.to_csv(csv_file, index=False)

In [None]:
df.head(20)["Description"].apply(lambda x: visualize_entities(x))

## Applying MT to a sample of descriptions

### Creating the Sample CSV.

In [46]:
# read the CSV file
# cols = ["Language", "Confidence Score", "Description"]

df = pd.read_csv(csv_file, header = 0)
df["Confidence Score"] = df["Confidence Score"].astype(float)

# df["Confidence Score"] = df["Confidence Score"].astype(float)
# filter the rows based on language code and confidence score
df = df[(df["Language"] == "fr") & (df["Confidence Score"] >= 0.9)]

# create a sample of 100 rows
sample = df.sample(n=100, random_state=1)

# save the sample to a new CSV file
sample.to_csv("100_sample_fr_descriptions.csv", index=False, header=None)

### Translate 10 and create a csv

In [59]:
# Read the .csv file into a pandas dataframe
df = pd.read_csv(fr_sample100, header=None, usecols=[2])

# Randomly select 10 rows
sample_rows = random.sample(list(df.index), 10)

# Create an empty dataframe to store the translations
translated_df = pd.DataFrame(columns=['Original Description', 'Translated Description'])

# Translate the Description column for the selected rows using DeepL
for row in sample_rows:
    text = df.loc[row, 2]
    translation = translator.translate_text(text, target_lang='EN-GB')
    # translated_df = translated_df.append({'Original Description': text, 'Translated Description': translation.text}, ignore_index=True)
    translated_df = pd.concat([translated_df, pd.DataFrame({'Original Description': [text], 'Translated Description': [translation.text]})], ignore_index=True)


# Save the translated dataframe to a new .csv file
translated_df.to_csv("translated_file.csv", index=False)


### DeepL Usage

In [61]:
def check_character_limit():
    translator = deepl.Translator(deepl_api_key)
    usage = translator.get_usage()
    if usage.character.limit_exceeded:
        print("Character limit exceeded.")
    else:
        print(f"Character usage: {usage.character}")

In [None]:
check_character_limit()

## Applying NER to the Translated text