In [29]:
import spacy
from collections import Counter
import gensim
from gensim import corpora
from gensim.models import Phrases

# Load the spaCy pipeline named "en_core_web_sm" once at module level;
# preprocess_text() calls this shared `nlp` object for every document.
nlp = spacy.load("en_core_web_sm")

import nltk
from nltk.corpus import reuters #import reuters' documents from the nltk library

### Preprocessing

In [32]:
def preprocess_text(text, rare_threshold=2, frequent_threshold=10):
    """Lowercase, tokenize, filter and lemmatize a single document.

    The text is lowercased and then parsed with the module-level spaCy
    pipeline ``nlp``. A token is kept only when all of the following hold:

    * it is purely alphabetic (drops punctuation, numbers, ...),
    * it is not a stop word,
    * its surface form occurs strictly more than ``rare_threshold`` times
      and strictly fewer than ``frequent_threshold`` times among the
      alphabetic tokens of this same text.

    Parameters
    ----------
    text : str
        Raw document text.
    rare_threshold : int, optional
        Tokens occurring this many times or fewer are dropped as rare.
        Defaults to 2.
    frequent_threshold : int, optional
        Tokens occurring this many times or more are dropped as too
        frequent. Defaults to 10.

    Returns
    -------
    list of str
        Lemmas of the kept tokens, in document order, repetitions included.
    """
    # Lowercasing happens before parsing, so spaCy only ever sees lowercase text.
    doc = nlp(text.lower())

    # Frequencies are per document and keyed on the surface form (token.text),
    # not on the lemma: two inflections of one word are counted separately.
    token_counts = Counter(token.text for token in doc if token.is_alpha)

    return [
        token.lemma_
        for token in doc
        if token.is_alpha
        and not token.is_stop
        and rare_threshold < token_counts[token.text] < frequent_threshold
    ]

In [33]:
# Reuters document ids, in the order the NLTK corpus reader returns them.
fileids = reuters.fileids()

# `result` collects one preprocessed token list per document; `docs` holds the
# raw text of the document currently being processed.
result, docs = [], []
for i in range(5):  # only the first five documents are used
    docs = reuters.raw(fileids[i])
    result += [preprocess_text(docs)]

In [35]:
# Show the preprocessed corpus: a list holding one list of lemmas per document.
print(result)

[['exporter', 'businessman', 'import', 'product', 'exporter', 'short', 'term', 'dlrs', 'tariff', 'import', 'japanese', 'electronic', 'semiconductor', 'japanese', 'tariff', 'billion', 'dlrs', 'electronic', 'export', 'product', 'japanese', 'electronics', 'tariff', 'export', 'tariff', 'taiwan', 'businessman', 'taiwan', 'surplus', 'billion', 'dlrs', 'year', 'surplus', 'taiwan', 'billion', 'dlrs', 'tariff', 'import', 'product', 'exporter', 'south', 'korea', 'south', 'korea', 'export', 'year', 'south', 'korea', 'surplus', 'billion', 'dlrs', 'billion', 'dlrs', 'businessman', 'semiconductor', 'hong', 'kong', 'semiconductor', 'electronic', 'businessman', 'short', 'term', 'import', 'short', 'term', 'hong', 'kong', 'industry', 'import', 'hong', 'kong', 'year', 'hong', 'kong', 'export', 'industry', 'minister', 'export', 'export', 'japanese', 'year', 'minister', 'minister', 'industry'], ['china', 'pct', 'pct', 'china', 'china', 'say', 'say', 'pct', 'china', 'pct', 'say'], ['energy', 'demand', 'miti

### Bag of Words

In [37]:
# Map every distinct lemma in the preprocessed documents to an integer id.
dictionary = corpora.Dictionary(result)
print(dictionary.token2id)

# Bag-of-words encoding: each document becomes a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, result))
print(corpus)

{'billion': 0, 'businessman': 1, 'dlrs': 2, 'electronic': 3, 'electronics': 4, 'export': 5, 'exporter': 6, 'hong': 7, 'import': 8, 'industry': 9, 'japanese': 10, 'kong': 11, 'korea': 12, 'minister': 13, 'product': 14, 'semiconductor': 15, 'short': 16, 'south': 17, 'surplus': 18, 'taiwan': 19, 'tariff': 20, 'term': 21, 'year': 22, 'china': 23, 'pct': 24, 'say': 25, 'demand': 26, 'energy': 27, 'miti': 28, 'natural': 29, 'supply': 30, 'baht': 31, 'quarter': 32, 'cpo': 33, 'harahap': 34, 'indonesia': 35, 'oil': 36, 'palm': 37}
[[(0, 5), (1, 4), (2, 6), (3, 3), (4, 1), (5, 6), (6, 3), (7, 4), (8, 5), (9, 3), (10, 4), (11, 4), (12, 3), (13, 3), (14, 3), (15, 3), (16, 3), (17, 3), (18, 3), (19, 3), (20, 5), (21, 3), (22, 4)], [(23, 4), (24, 4), (25, 3)], [(24, 3), (25, 4), (26, 5), (27, 8), (28, 4), (29, 3), (30, 3)], [(0, 6), (25, 3), (31, 3), (32, 4)], [(25, 3), (33, 4), (34, 3), (35, 4), (36, 4), (37, 4)]]


### TF-IDF

In [38]:
# TF-IDF weighting of the bag-of-words corpus
from gensim import models

# Fit the model on the bag-of-words corpus, then re-weight that same corpus.
tfidf = models.TfidfModel(corpus)
weighted_docs = tfidf[corpus]

# One line per document: its list of (token_id, tf-idf weight) pairs.
for document in weighted_docs:
    print(document)

[(0, 0.15935418401749438), (1, 0.2239207765318571), (2, 0.3358811647977857), (3, 0.16794058239889284), (4, 0.055980194132964275), (5, 0.3358811647977857), (6, 0.16794058239889284), (7, 0.2239207765318571), (8, 0.2799009706648214), (9, 0.16794058239889284), (10, 0.2239207765318571), (11, 0.2239207765318571), (12, 0.16794058239889284), (13, 0.16794058239889284), (14, 0.16794058239889284), (15, 0.16794058239889284), (16, 0.16794058239889284), (17, 0.16794058239889284), (18, 0.16794058239889284), (19, 0.16794058239889284), (20, 0.2799009706648214), (21, 0.16794058239889284), (22, 0.2239207765318571)]
[(23, 0.8655034152183875), (24, 0.4927513833513657), (25, 0.08999951361325781)]
[(24, 0.1520225789906905), (25, 0.04936258325861676), (26, 0.4450387008956973), (27, 0.7120619214331158), (28, 0.3560309607165579), (29, 0.2670232205374184), (30, 0.2670232205374184)]
[(0, 0.5627826994238668), (25, 0.06852701102344375), (31, 0.4942556884004231), (32, 0.6590075845338975)]
[(25, 0.04862458611152949),

### Bigrams and Trigrams

In [39]:
# Phrase detectors: the first is trained on the plain token lists, the second
# on the bigram-merged lists so that it can join a bigram with a neighbouring
# token (see the gensim Phrases docs for min_count / threshold).
bigram = Phrases(result, min_count=2, threshold=5)
trigram = Phrases(bigram[result], min_count=2, threshold=5)

# Rewrite every document with detected phrases joined by "_".
bigram_result = [bigram[tokens] for tokens in result]
trigram_result = [trigram[merged] for merged in bigram_result]

# Show both phrase-merged versions of the corpus.
print("Bigrams:", bigram_result)
print("Trigrams:", trigram_result)

Bigrams: [['exporter', 'businessman', 'import', 'product', 'exporter', 'short_term', 'dlrs', 'tariff', 'import', 'japanese', 'electronic', 'semiconductor', 'japanese', 'tariff', 'billion_dlrs', 'electronic', 'export', 'product', 'japanese', 'electronics', 'tariff', 'export', 'tariff', 'taiwan', 'businessman', 'taiwan', 'surplus', 'billion_dlrs', 'year', 'surplus', 'taiwan', 'billion_dlrs', 'tariff', 'import', 'product', 'exporter', 'south_korea', 'south_korea', 'export', 'year', 'south_korea', 'surplus', 'billion_dlrs', 'billion_dlrs', 'businessman', 'semiconductor', 'hong_kong', 'semiconductor', 'electronic', 'businessman', 'short_term', 'import', 'short_term', 'hong_kong', 'industry', 'import', 'hong_kong', 'year', 'hong_kong', 'export', 'industry', 'minister', 'export', 'export', 'japanese', 'year', 'minister', 'minister', 'industry'], ['china', 'pct', 'pct', 'china', 'china', 'say', 'say', 'pct', 'china', 'pct', 'say'], ['energy', 'demand', 'miti_energy', 'supply', 'demand', 'energ

In [40]:
# Rebuild the vocabulary on the phrase-merged documents.
# NOTE: this rebinds `dictionary` and `corpus`, replacing the unigram versions
# created in the earlier bag-of-words cell.
dictionary = corpora.Dictionary(trigram_result)
print(dictionary.token2id)

# Bag-of-words over unigrams and merged phrases: (token_id, count) pairs per document.
corpus = list(map(dictionary.doc2bow, trigram_result))
print(corpus)

{'billion_dlrs': 0, 'businessman': 1, 'dlrs': 2, 'electronic': 3, 'electronics': 4, 'export': 5, 'exporter': 6, 'hong_kong': 7, 'import': 8, 'industry': 9, 'japanese': 10, 'minister': 11, 'product': 12, 'semiconductor': 13, 'short_term': 14, 'south_korea': 15, 'surplus': 16, 'taiwan': 17, 'tariff': 18, 'year': 19, 'china': 20, 'pct': 21, 'say': 22, 'demand': 23, 'energy': 24, 'miti_energy': 25, 'miti_energy_supply': 26, 'natural': 27, 'baht': 28, 'billion': 29, 'billion_baht': 30, 'quarter': 31, 'quarter_billion': 32, 'cpo': 33, 'harahap': 34, 'indonesia': 35, 'indonesia_palm_oil': 36, 'palm_oil': 37}
[[(0, 5), (1, 4), (2, 1), (3, 3), (4, 1), (5, 6), (6, 3), (7, 4), (8, 5), (9, 3), (10, 4), (11, 3), (12, 3), (13, 3), (14, 3), (15, 3), (16, 3), (17, 3), (18, 5), (19, 4)], [(20, 4), (21, 4), (22, 3)], [(21, 3), (22, 4), (23, 5), (24, 4), (25, 1), (26, 3), (27, 3)], [(22, 3), (28, 2), (29, 2), (30, 1), (31, 1), (32, 3)], [(22, 3), (33, 4), (34, 3), (35, 1), (36, 3), (37, 1)]]


In [41]:
# TF-IDF weighting of the phrase-merged bag-of-words corpus
from gensim import models  # repeats the import from the earlier TF-IDF cell

# Refit on the current `corpus` (built from trigram_result); rebinds `tfidf`.
tfidf = models.TfidfModel(corpus)
weighted_ngram_docs = tfidf[corpus]

# One line per document: its list of (token_id, tf-idf weight) pairs.
for document in weighted_ngram_docs:
    print(document)

[(0, 0.30599503068105227), (1, 0.24479602454484184), (2, 0.06119900613621046), (3, 0.1835970184086314), (4, 0.06119900613621046), (5, 0.3671940368172628), (6, 0.1835970184086314), (7, 0.24479602454484184), (8, 0.30599503068105227), (9, 0.1835970184086314), (10, 0.24479602454484184), (11, 0.1835970184086314), (12, 0.1835970184086314), (13, 0.1835970184086314), (14, 0.1835970184086314), (15, 0.1835970184086314), (16, 0.1835970184086314), (17, 0.1835970184086314), (18, 0.30599503068105227), (19, 0.24479602454484184)]
[(20, 0.8655034152183875), (21, 0.4927513833513657), (22, 0.08999951361325781)]
[(21, 0.21480126240633127), (22, 0.06974717354477868), (23, 0.6288202410901308), (24, 0.5030561928721047), (25, 0.12576404821802617), (26, 0.37729214465407857), (27, 0.37729214465407857)]
[(22, 0.09499183049472043), (28, 0.4567566557321679), (29, 0.4567566557321679), (30, 0.22837832786608395), (31, 0.22837832786608395), (32, 0.6851349835982519)]
[(22, 0.06915746471498917), (33, 0.6650705042238236)