### Installing and Importing libraries

In [None]:
!pip install --upgrade pip --user
!pip install lxml --user
!pip install spacy_language_detection

In [None]:
import csv
import pprint
import re
from collections import Counter
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
import spacy
from lxml import etree
from spacy import displacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

### Create a language detector and add it to the pipeline.

In [4]:
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory for the ``language_detector`` spaCy pipeline component.

    Args:
        nlp: The pipeline the component is being added to (unused, but
            required by the spaCy factory signature).
        name: The component's instance name in the pipeline (unused).

    Returns:
        A ``LanguageDetector`` instance.
    """
    # A fixed seed keeps the detected language and score reproducible across
    # kernel restarts (same value as used in the library's README example).
    return LanguageDetector(seed=42)

In [None]:
# Download the `en_core_web_lg` pipeline package through spaCy's CLI.
# NOTE(review): `!python` runs in a shell subprocess and may resolve to a
# different interpreter than the notebook kernel -- confirm they match.
!python -m spacy download en_core_web_lg

In [6]:
# Load the large English pipeline, then append the custom language detector
# as the last component so processed docs expose `doc._.language`.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("language_detector", last=True)

<spacy_language_detection.spacy_language_detector.LanguageDetector at 0x7fb765f9a490>

### Get the descriptions

In [None]:
index_url = 'https://alpha-vlo.clarin.eu/data/clarin/results/cmdi/COllections_de_COrpus_Oraux_Numeriques_CoCoON_ex_CRDO_/'

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# XPath (and the namespaces it relies on) selecting the text of the
# description element of a CMDI record.
DESCRIPTION_XPATH = '//cmd:CMD/cmd:Components/cmdp:OLAC-DcmiTerms/cmdp:description//text()'
CMD_NAMESPACES = {
    "cmd": "http://www.clarin.eu/cmd/1",
    "cmdp": "http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1288172614026",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
}


def clean_description(text):
    """Flatten a raw description text node into a single line.

    Removes a newline followed by two or more spaces, replaces a newline
    followed by tabs with a space, strips NUL characters and finally turns
    every remaining newline into a space.

    Args:
        text: The text node returned by the XPath query.

    Returns:
        The cleaned description as a plain ``str``.
    """
    text = str(text)
    text = re.sub(r"\n {2,}", '', text)
    text = re.sub("\n\t+", ' ', text)
    text = re.sub("\0+", '', text)
    return re.sub(r"\n", ' ', text)


# Start from an empty list so the loop below is simply skipped (instead of
# raising NameError) when the index page cannot be retrieved.
xml_links = []

response = requests.get(index_url, timeout=REQUEST_TIMEOUT)

if response.status_code == 200:
    html = response.content.decode('utf-8')
    tree = etree.fromstring(html, etree.HTMLParser())
    # Keep only the links whose href ends in ".xml".
    xml_links = tree.xpath('//a[@href and substring(@href, string-length(@href) - 3)=".xml"]/@href')
else:
    print(f"Error: {response.status_code}")

# One (language, confidence score, spaCy Doc) tuple per record with a description.
descriptions = []

for xml_link in xml_links:
    # urljoin resolves relative as well as absolute hrefs against the index URL.
    xml_url = urljoin(index_url, xml_link)
    try:
        xml_response = requests.get(xml_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Skipping {xml_url}: {error}")
        continue
    if xml_response.status_code != 200:
        print(f"Skipping {xml_url}: HTTP {xml_response.status_code}")
        continue

    try:
        xml_tree = etree.fromstring(xml_response.content, etree.XMLParser())
    except etree.XMLSyntaxError as error:
        # A single malformed record should not abort the whole harvest.
        print(f"Skipping {xml_url}: {error}")
        continue

    text_nodes = xml_tree.xpath(DESCRIPTION_XPATH, namespaces=CMD_NAMESPACES)
    if not text_nodes:
        # The record has no description text; nothing to analyse.
        continue

    # Only the first matching text node is used.
    doc = nlp(clean_description(text_nodes[0]))
    detected = doc._.language
    descriptions.append((detected["language"], detected["score"], doc))

In [None]:
# Preview only the first few entries; dumping the full list would flood the
# notebook with output.
descriptions[:5]

### Write the descriptions in a .csv file

In [None]:
# Persist the collected descriptions. Each entry of `descriptions` already
# holds the detected language, its confidence score and the processed Doc, so
# the stored values are written as-is instead of running the pipeline again.
with open('your_csv_file', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Language', 'Confidence Score', 'Description'])
    for language, confidence_score, doc in descriptions:
        # `doc.text` is the description string the Doc was built from.
        writer.writerow([language, confidence_score, doc.text])

### Apply NER to the list with the descriptions

In [None]:
# Each entry is (language, confidence score, Doc). The Doc was already
# processed by `nlp` when it was collected, so it is rendered directly rather
# than being pushed through the pipeline a second time.
for _language, _confidence_score, doc in descriptions:
    displacy.render(doc, jupyter=True, style='ent')

### Apply NER to the .csv file.

In [15]:
def extract_entities(text):
    """Return the named entities found in ``text``.

    Args:
        text: The text to analyse. Missing values (``None`` or NaN, which is
            what ``pd.read_csv`` yields for an empty cell) are accepted.

    Returns:
        A list of ``(entity text, entity label)`` tuples; an empty list when
        ``text`` is a missing value.
    """
    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def visualize_entities(text):
    """Render the named entities of ``text`` with displaCy inside the notebook.

    Args:
        text: The text to analyse. Missing values (``None`` or NaN, which is
            what ``pd.read_csv`` yields for an empty cell) are skipped.

    Returns:
        Whatever ``displacy.render`` returns, or ``None`` when ``text`` is a
        missing value and nothing is rendered.
    """
    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return None
    doc = nlp(text)
    return displacy.render(doc, style="ent", jupyter=True)

In [7]:
# Read the CSV file back into a DataFrame and preview its first rows.
df = pd.read_csv("your_csv_file")
df.head()

In [None]:
# Run entity extraction on every description and store the resulting
# (text, label) pairs in a new column, then preview the frame.
named_entities = df["Description"].apply(extract_entities)
df["Named Entities"] = named_entities
df.head()

In [None]:
# Write the DataFrame (now including the "Named Entities" column) back to the
# same path it was read from, without the index column.
# NOTE: this overwrites the file, so re-running the read cell afterwards loads
# the extended file.
df.to_csv("your_csv_file", index=False)

In [None]:
# Render the entities of the first 20 descriptions. A plain loop is used
# because the call is made purely for its display side effect; `.apply` would
# additionally print a Series of return values as the cell output.
for description_text in df["Description"].head(20):
    visualize_entities(description_text)