In [26]:
import spacy
from collections import Counter
import gensim
from gensim import corpora
from gensim.models import Phrases

nlp = spacy.load("en_core_web_sm")

import nltk
from nltk.corpus import reuters #import reuters' documents from the nltk library

### Preprocessing

In [2]:
def preprocess_text(text):
    # lowercasing
    doc = nlp(text.lower())
    
    # Initialize an empty list to hold the preprocessed tokens
    preprocessed_tokens = []
    
    # calculate the token frequency in the text
    token_counts = Counter([token.text for token in doc if token.is_alpha])

    # Preprocessing
    for token in doc: #for each token check 
        if (token.is_alpha and # to remove non-alphabetic caracters (punctuations, numbers ...),
            not token.is_stop and # to remove stopwords,
            token_counts[token.text] > 2 and # to remove rare words,
            token_counts[token.text] < 10):  # to remove frequent words,
            
            # Apply Lemmatization
            preprocessed_tokens.append(token.lemma_)
    
    return preprocessed_tokens

In [43]:
result = []
docs = []
fileids = reuters.fileids() #Get the documents ids
for i in range(100):
    docs = reuters.raw(fileids[i])
    result.append(preprocess_text(docs)) # Get the text of the first document

In [44]:
result

[['exporter',
  'businessman',
  'import',
  'product',
  'exporter',
  'short',
  'term',
  'dlrs',
  'tariff',
  'import',
  'japanese',
  'electronic',
  'semiconductor',
  'japanese',
  'tariff',
  'billion',
  'dlrs',
  'electronic',
  'export',
  'product',
  'japanese',
  'electronics',
  'tariff',
  'export',
  'tariff',
  'taiwan',
  'businessman',
  'taiwan',
  'surplus',
  'billion',
  'dlrs',
  'year',
  'surplus',
  'taiwan',
  'billion',
  'dlrs',
  'tariff',
  'import',
  'product',
  'exporter',
  'south',
  'korea',
  'south',
  'korea',
  'export',
  'year',
  'south',
  'korea',
  'surplus',
  'billion',
  'dlrs',
  'billion',
  'dlrs',
  'businessman',
  'semiconductor',
  'hong',
  'kong',
  'semiconductor',
  'electronic',
  'businessman',
  'short',
  'term',
  'import',
  'short',
  'term',
  'hong',
  'kong',
  'industry',
  'import',
  'hong',
  'kong',
  'year',
  'hong',
  'kong',
  'export',
  'industry',
  'minister',
  'export',
  'export',
  'japanese',


### Bag of Words

In [37]:
dictionary = corpora.Dictionary(result)
print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in result]
print(corpus)   

{'billion': 0, 'businessman': 1, 'dlrs': 2, 'electronic': 3, 'electronics': 4, 'export': 5, 'exporter': 6, 'hong': 7, 'import': 8, 'industry': 9, 'japanese': 10, 'kong': 11, 'korea': 12, 'minister': 13, 'product': 14, 'semiconductor': 15, 'short': 16, 'south': 17, 'surplus': 18, 'taiwan': 19, 'tariff': 20, 'term': 21, 'year': 22, 'china': 23, 'pct': 24, 'say': 25, 'demand': 26, 'energy': 27, 'miti': 28, 'natural': 29, 'supply': 30, 'baht': 31, 'quarter': 32, 'cpo': 33, 'harahap': 34, 'indonesia': 35, 'oil': 36, 'palm': 37}
[[(0, 5), (1, 4), (2, 6), (3, 3), (4, 1), (5, 6), (6, 3), (7, 4), (8, 5), (9, 3), (10, 4), (11, 4), (12, 3), (13, 3), (14, 3), (15, 3), (16, 3), (17, 3), (18, 3), (19, 3), (20, 5), (21, 3), (22, 4)], [(23, 4), (24, 4), (25, 3)], [(24, 3), (25, 4), (26, 5), (27, 8), (28, 4), (29, 3), (30, 3)], [(0, 6), (25, 3), (31, 3), (32, 4)], [(25, 3), (33, 4), (34, 3), (35, 4), (36, 4), (37, 4)]]


### TD-IDF

In [38]:
#TF-IDF
from gensim import models
tfidf = models.TfidfModel(corpus)
for document in tfidf[corpus]:
    print(document)

[(0, 0.15935418401749438), (1, 0.2239207765318571), (2, 0.3358811647977857), (3, 0.16794058239889284), (4, 0.055980194132964275), (5, 0.3358811647977857), (6, 0.16794058239889284), (7, 0.2239207765318571), (8, 0.2799009706648214), (9, 0.16794058239889284), (10, 0.2239207765318571), (11, 0.2239207765318571), (12, 0.16794058239889284), (13, 0.16794058239889284), (14, 0.16794058239889284), (15, 0.16794058239889284), (16, 0.16794058239889284), (17, 0.16794058239889284), (18, 0.16794058239889284), (19, 0.16794058239889284), (20, 0.2799009706648214), (21, 0.16794058239889284), (22, 0.2239207765318571)]
[(23, 0.8655034152183875), (24, 0.4927513833513657), (25, 0.08999951361325781)]
[(24, 0.1520225789906905), (25, 0.04936258325861676), (26, 0.4450387008956973), (27, 0.7120619214331158), (28, 0.3560309607165579), (29, 0.2670232205374184), (30, 0.2670232205374184)]
[(0, 0.5627826994238668), (25, 0.06852701102344375), (31, 0.4942556884004231), (32, 0.6590075845338975)]
[(25, 0.04862458611152949),

### Bigrams and Trigrams

In [45]:
# Create bigram and trigram models
bigram = gensim.models.Phrases(result, min_count=2, threshold=5)  # Detect bigrams
trigram = gensim.models.Phrases(bigram[result], min_count=2, threshold=5)  # Detect trigrams based on bigrams

# Apply the bigram and trigram models to the corpus
bigram_result = [bigram[doc] for doc in result]  # Get bigrams
trigram_result = [trigram[bigram[doc]] for doc in result]  # Get trigrams

# Print out the bigram and trigram results
print("Bigrams:", bigram_result)
print("Trigrams:", trigram_result)

Bigrams: [['exporter', 'businessman', 'import_product', 'exporter', 'short_term', 'dlrs', 'tariff', 'import', 'japanese', 'electronic', 'semiconductor', 'japanese', 'tariff', 'billion_dlrs', 'electronic', 'export', 'product', 'japanese', 'electronics', 'tariff', 'export', 'tariff', 'taiwan', 'businessman', 'taiwan', 'surplus', 'billion_dlrs', 'year', 'surplus', 'taiwan', 'billion_dlrs', 'tariff', 'import_product', 'exporter', 'south_korea', 'south_korea', 'export', 'year', 'south_korea', 'surplus', 'billion_dlrs', 'billion_dlrs', 'businessman', 'semiconductor', 'hong_kong', 'semiconductor', 'electronic', 'businessman', 'short_term', 'import', 'short_term', 'hong_kong', 'industry', 'import', 'hong_kong', 'year', 'hong_kong', 'export', 'industry', 'minister', 'export', 'export', 'japanese', 'year', 'minister', 'minister', 'industry'], ['china', 'pct', 'pct', 'china', 'china', 'say', 'say', 'pct', 'china', 'pct', 'say'], ['energy', 'demand', 'miti_energy', 'supply', 'demand', 'energy', 'd

In [10]:
dictionary = corpora.Dictionary(trigram_result)
print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in trigram_result]
print(corpus)

{'billion': 0, 'businessman': 1, 'dlrs': 2, 'electronic': 3, 'electronics': 4, 'export': 5, 'exporter': 6, 'hong': 7, 'import': 8, 'industry': 9, 'japanese': 10, 'kong': 11, 'korea': 12, 'minister': 13, 'product': 14, 'semiconductor': 15, 'short': 16, 'south': 17, 'surplus': 18, 'taiwan': 19, 'tariff': 20, 'term': 21, 'year': 22, 'china': 23, 'pct': 24, 'say': 25, 'demand': 26, 'energy': 27, 'miti': 28, 'natural': 29, 'supply': 30, 'baht': 31, 'quarter': 32, 'cpo': 33, 'harahap': 34, 'indonesia': 35, 'oil': 36, 'palm': 37}
[[(0, 5), (1, 4), (2, 6), (3, 3), (4, 1), (5, 6), (6, 3), (7, 4), (8, 5), (9, 3), (10, 4), (11, 4), (12, 3), (13, 3), (14, 3), (15, 3), (16, 3), (17, 3), (18, 3), (19, 3), (20, 5), (21, 3), (22, 4)], [(23, 4), (24, 4), (25, 3)], [(24, 3), (25, 4), (26, 5), (27, 8), (28, 4), (29, 3), (30, 3)], [(0, 6), (25, 3), (31, 3), (32, 4)], [(25, 3), (33, 4), (34, 3), (35, 4), (36, 4), (37, 4)]]


In [41]:
# TF-IDF
from gensim import models
tfidf = models.TfidfModel(corpus)
for document in tfidf[corpus]:
    print(document)

[(0, 0.30599503068105227), (1, 0.24479602454484184), (2, 0.06119900613621046), (3, 0.1835970184086314), (4, 0.06119900613621046), (5, 0.3671940368172628), (6, 0.1835970184086314), (7, 0.24479602454484184), (8, 0.30599503068105227), (9, 0.1835970184086314), (10, 0.24479602454484184), (11, 0.1835970184086314), (12, 0.1835970184086314), (13, 0.1835970184086314), (14, 0.1835970184086314), (15, 0.1835970184086314), (16, 0.1835970184086314), (17, 0.1835970184086314), (18, 0.30599503068105227), (19, 0.24479602454484184)]
[(20, 0.8655034152183875), (21, 0.4927513833513657), (22, 0.08999951361325781)]
[(21, 0.21480126240633127), (22, 0.06974717354477868), (23, 0.6288202410901308), (24, 0.5030561928721047), (25, 0.12576404821802617), (26, 0.37729214465407857), (27, 0.37729214465407857)]
[(22, 0.09499183049472043), (28, 0.4567566557321679), (29, 0.4567566557321679), (30, 0.22837832786608395), (31, 0.22837832786608395), (32, 0.6851349835982519)]
[(22, 0.06915746471498917), (33, 0.6650705042238236)

## Taggers

In [3]:
import plac
import random
from pathlib import Path
import spacy
from spacy.training import Example

In [250]:
TAG_MAP = {
    'N': 'NOUN',
    'V': 'VERB',
    'J':'ADJ',
    'NNS' : 'NOUN'
}

TRAIN_DATA = [
    ("I like green eggs", {'tags': ['N', 'V', 'J', 'NNS']}), 
    ("Eat blue ham", {'tags': ['V', 'J', 'N']}), 
]

In [253]:
def main(lang='en', output_dir=None, n_iter=3):

    nlp = spacy.blank(lang) #Créer un modèle spacy vide pour l'anglais
    tagger = nlp.add_pipe("tagger") #Créer un modèle (tagger) qui attribue des tags aux mots d'un texte 

    for item in TAG_MAP:
        tagger.add_label(item) #Ajouter des tags aux tagger

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)

    test_text = "I love cats"
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])


In [254]:
main()

{'tagger': 5.233839750289917}
{'tagger': 4.370360851287842}
{'tagger': 2.9379818439483643}
Tags [('I', 'N', ''), ('love', 'V', ''), ('cats', 'NNS', '')]


## Ner Taggin

In [1]:
import plac
import random
from pathlib import Path
import spacy
# training data
TRAIN_DATA = [
('Who is Shaka Khan?', {
    'entities': [(7, 17, 'PERSON')]
}),
('I like London and Berlin.', {
    'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
}),
 ('I am learning Python', {
    'entities': [(14, 20, 'TECH')]
})
]

In [7]:
def main(model=None, output_dir=None, n_iter=100):
    
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        
        if 'ner' not in nlp.pipe_names:
            ner = nlp.add_pipe('ner', last=True)
    # otherwise, get it so we can add labels
        else:
            ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    
    # get names of other pipes to disable them during training 
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update(
                    [example],
                    drop=0.5, # dropout-make it harder to memorise data
                    sgd=optimizer, # callable to update weights
                    losses=losses)
            print(losses)
            
    # test the trained model
    doc = nlp('I love Python')
    print()
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [8]:
main()

Created blank 'en' model
{'ner': 13.707800328731537}
{'ner': 12.919663846492767}
{'ner': 11.507942080497742}
{'ner': 10.41474586725235}
{'ner': 9.270761519670486}
{'ner': 7.802250295877457}
{'ner': 6.526705324649811}
{'ner': 5.892342917621136}
{'ner': 6.269278216175735}
{'ner': 5.874239942058921}
{'ner': 6.915091509465128}
{'ner': 5.222462212201208}
{'ner': 4.845556299958844}
{'ner': 5.384911714587361}
{'ner': 4.623987916682381}
{'ner': 7.581715064850869}
{'ner': 3.501235793111846}
{'ner': 3.5995443491265178}
{'ner': 4.172266533860238}
{'ner': 3.0489110057242215}
{'ner': 3.4836665840266505}
{'ner': 3.081052483757958}
{'ner': 3.0997456966488244}
{'ner': 3.073208049734376}
{'ner': 1.6782227643139862}
{'ner': 4.133288230419396}
{'ner': 2.541362160071003}
{'ner': 0.8770357887357232}
{'ner': 1.436662475178501}
{'ner': 1.1867294459195326}
{'ner': 1.8348952525510072}
{'ner': 0.9213925234892738}
{'ner': 1.2702341690707881}
{'ner': 2.1776027171745396}
{'ner': 0.4078910874332304}
{'ner': 0.68641

## tp4

In [38]:
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example

# Load a blank spaCy model
nlp = spacy.blank("en")

# Add the "Quantity" label to the parser
if "parser" not in nlp.pipe_names:
    parser = nlp.add_pipe("parser")
else:
    parser = nlp.get_pipe("parser")
parser.add_label("Quantity")

# training data 
training_data = [
    ("Few people attended the meeting", {"heads": [1, 2, 2, 2, 2], "deps": ["Quantity", "nsubj", "ROOT", "det", "dobj"]}),
    ("We need many volunteers", {"heads": [1, 1, 1, 1], "deps": ["nsubj", "ROOT", "Quantity", "dobj"]}),
    ("Some books are more interesting than others", {"heads": [1, 2, 2, 3, 2, 5, 4], "deps": ["Quantity", "nsubj", "ROOT", "advmod", "acomp", "prep", "pobj"]}),
    ("She knows all her neighbors", {"heads": [1, 1, 4, 4, 1], "deps": ["nsubj", "ROOT", "Quantity", "poss", "dobj"]}),
    ("He read half the book", {"heads": [1, 1, 4, 4, 1], "deps": ["nsubj", "ROOT", "Quantity", "det", "dobj"]}),
    ("She read the whole book in one day", {"heads": [1, 1, 4, 4, 1, 4, 5, 1], "deps": ["nsubj", "ROOT", "det", "Quantity", "dobj", "prep", "nummod", "pobj"]}),
    ("I have enough time", {"heads": [1, 1, 1, 1], "deps": ["nsubj", "ROOT", "Quantity", "dobj"]}),
    ("There were numerous complaints about the noise", {"heads": [1, 1, 3, 1, 5, 3, 5], "deps": ["expl", "ROOT", "Quantity", "attr", "prep", "det", "pobj"]})
]

# Train the model
optimizer = nlp.begin_training()
for i in range(19):  
    for text, annotations in training_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, {"heads": annotations["heads"], "deps": annotations["deps"]})
        nlp.update([example], sgd=optimizer)

# Test the model
test_text = ["Few places are this beautiful",
            "We have many plants at home",
            "Some people prefer to work in the evening",
            "She gave all her energy",             
            "He drank half the cup of water",            
            "She met with the whole team",
            "We have enough time to finish the task",
            "There are numerous doctors in the region"]
for text in test_text:
    doc = nlp(text)
    for token in doc:
        print(token.text, token.dep_, token.head.text)

    print()

Few Quantity places
places dep are
are ROOT are
this dep beautiful
beautiful dep are

We dep have
have ROOT have
many Quantity have
plants dep have
at dep home
home dep have

Some Quantity people
people dep prefer
prefer ROOT prefer
to dep work
work dep prefer
in dep work
the dep in
evening dep prefer

She dep gave
gave ROOT gave
all Quantity energy
her dep energy
energy dep gave

He dep drank
drank ROOT drank
half Quantity cup
the dep cup
cup dep drank
of dep water
water ROOT water

She dep met
met ROOT met
with dep met
the dep with
whole Quantity team
team dep with

We dep have
have ROOT have
enough Quantity have
time dep have
to dep have
finish dep the
the dep to
task dep the

There dep are
are ROOT are
numerous Quantity doctors
doctors dep are
in dep doctors
the dep in
region dep in

