# Lab 5

In [1]:
import wikipediaapi
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')  
nltk.download('stopwords') 
nltk.download('wordnet') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
wiki_wiki = wikipediaapi.Wikipedia(language = "en", user_agent = "python-requests/x")
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [4]:
def extract_document(article_title):
    page = wiki_wiki.page(article_title)
    if page.exists():
        article_content = page.text
        return article_content
    else:
        raise Exception(f"The article '{article_title}' does not exist.")

In [5]:
geography_articles = ["Romania", "Continental climate", "Central Asia", "Desert", "Polar regions of Earth"] 
history_articles = ["World War I", "Ottoman Empire", "Central Powers", "Carol I of Romania", "Alexandru Ioan Cuza"]
music_articles = ["Octave", "Solfège", "Ludwig van Beethoven", "Kapellmeister", "Wolfgang Amadeus Mozart"]

#### 1. Use text preprocessing techniques (stemming/lematization, stop words removal) and create the bag-of-words and TF-IDF vectorizations

In [6]:
def preprocess_text(text, use_stemming=True, use_lemmatization=True):
    # tokenization 
    words = word_tokenize(text)

    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    
    # stemming
    if use_stemming:
        words = [stemmer.stem(word) for word in words]
    # lemmatization
    if use_lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]

    cleaned_text = ' '.join(words)
    return cleaned_text

In [7]:
# extract and preprocess
all_documents = []
for article_title in geography_articles + history_articles + music_articles:
    article_content = extract_document(article_title)
    if article_content:
        preprocessed_text = preprocess_text(article_content)
        all_documents.append(preprocessed_text)

In [8]:
# bag-of-words and TF-IDF vectorizations
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(all_documents)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_documents)

#### 2.Use Latent Semantic Analysis with SVD for a) the bag-of-words encoding and b) the TF-IDF encoding

a) bag-of-words encoding

In [9]:
lsa_bow = TruncatedSVD(n_components=3)  
lsa_bow_result = lsa_bow.fit_transform(bag_of_words)

In [11]:
print(lsa_bow_result)

[[ 1.65013861e+02 -4.32573724e+01  6.90101908e+01]
 [ 1.32832114e+01 -5.90949998e+00  1.72863779e+01]
 [ 1.35207719e+02 -2.48120980e+01  1.67922332e+02]
 [ 8.00844425e+01 -3.33574907e+01  1.85052850e+02]
 [ 6.18165012e+00 -1.18365002e+00  8.37231661e+00]
 [ 4.25490919e+02 -2.45776777e+02 -1.19155890e+02]
 [ 4.16310925e+02  3.02458854e+02 -3.57257638e+01]
 [ 1.21040912e+02 -3.42785364e+01 -3.73052332e+01]
 [ 5.77522035e+01 -1.68771669e+01  7.73838422e+00]
 [ 1.92717898e+01 -2.25029270e+00  5.23922018e+00]
 [ 8.63692660e+00 -2.09371484e-01  8.69551550e+00]
 [ 2.36154534e+01  4.92771005e-01  1.74422869e+01]
 [ 8.62066262e+01 -2.77221727e+01  1.92753693e+02]
 [ 1.29414525e+01 -3.34067495e+00  1.29122291e+01]
 [ 4.38779007e+01 -1.14100649e+01  7.01113022e+01]]


b) TF-IDF encoding

In [12]:
lsa_tfidf = TruncatedSVD(n_components=3) 
lsa_tfidf_result = lsa_tfidf.fit_transform(tfidf_matrix)
print(lsa_tfidf_result)

[[ 0.6515496  -0.04927958 -0.21448861]
 [ 0.1125157   0.03260077  0.36035918]
 [ 0.3918041   0.00259362  0.39408612]
 [ 0.1970676   0.09197144  0.45561148]
 [ 0.13892616  0.04860623  0.47196891]
 [ 0.74652064 -0.16752997  0.114366  ]
 [ 0.61196719 -0.10304951  0.11263035]
 [ 0.71082036 -0.24616751  0.15371572]
 [ 0.59348185 -0.08079813 -0.43573194]
 [ 0.43790626 -0.02798495 -0.44519833]
 [ 0.08140569  0.32784776  0.08004793]
 [ 0.16098901  0.38550554  0.08723378]
 [ 0.22118518  0.5973567  -0.06482254]
 [ 0.15435439  0.51884501 -0.08225064]
 [ 0.19623109  0.63648751 -0.0718288 ]]


#### 3. Use Non-negative matrix factorization

In [13]:
# bag-of-words encoding
nmf_bow = NMF(n_components=3, init='random', random_state=0)
nmf_bow_result = nmf_bow.fit_transform(bag_of_words)

print(nmf_bow_result)

[[6.20029225e-01 3.59033932e-01 9.36174822e-01]
 [9.41094653e-03 1.51659810e-02 2.03588871e-01]
 [6.22427599e-02 0.00000000e+00 2.19035047e+00]
 [0.00000000e+00 0.00000000e+00 1.87116329e+00]
 [4.10204244e-03 8.95492026e-03 9.50354588e-02]
 [2.50700588e+00 8.95586010e-02 2.49244314e-02]
 [2.50706208e+00 0.00000000e+00 0.00000000e+00]
 [7.25547037e-01 0.00000000e+00 0.00000000e+00]
 [2.74208214e-01 1.57490014e-01 1.52218841e-01]
 [8.62321671e-02 8.47707389e-02 5.72963922e-02]
 [2.26338375e-02 1.75067518e-01 3.29584207e-02]
 [7.94465168e-02 3.36854637e-01 8.01072243e-02]
 [0.00000000e+00 5.65139906e+00 0.00000000e+00]
 [3.50916662e-02 3.65285622e-01 1.57341139e-02]
 [5.04796120e-02 1.89874917e+00 5.74468135e-02]]


In [14]:
# bag-of-words encoding
nmf_tfidf = NMF(n_components=3, init='random', random_state=0)  
nmf_tfidf_result = nmf_tfidf.fit_transform(tfidf_matrix)

print(nmf_tfidf_result)

[[0.02030037 0.15058672 0.00126084]
 [0.0128651  0.00041327 0.00212605]
 [0.04428451 0.00531392 0.00387212]
 [0.02048417 0.00124481 0.00996292]
 [0.01651073 0.         0.00337802]
 [0.08775876 0.00892126 0.0012241 ]
 [0.07322559 0.00112287 0.00332864]
 [0.09210914 0.         0.        ]
 [0.00701923 0.17285355 0.        ]
 [0.         0.15613746 0.        ]
 [0.         0.         0.04161704]
 [0.00251553 0.00025967 0.05164886]
 [0.00048959 0.00379173 0.07993362]
 [0.         0.         0.06833608]
 [0.         0.         0.08367664]]


#### 4. Use LDA (Latent Dirichlet Allocation)

In [15]:
lda_bow = LatentDirichletAllocation(n_components=3, random_state=0)
lda_bow_result = lda_bow.fit_transform(bag_of_words)
print(lda_bow_result)

[[6.73879484e-01 5.82113696e-05 3.26062305e-01]
 [2.14623296e-04 9.99567369e-01 2.18007273e-04]
 [6.51030766e-05 9.13368465e-01 8.65664322e-02]
 [6.32030556e-05 9.99870219e-01 6.65775809e-05]
 [8.33838848e-04 9.98274697e-01 8.91463801e-04]
 [3.30588616e-05 3.32203818e-05 9.99933721e-01]
 [4.01336314e-05 3.94124284e-05 9.99920454e-01]
 [1.63962097e-04 1.66687662e-04 9.99669350e-01]
 [7.99167000e-02 1.69485297e-04 9.19913815e-01]
 [4.01181752e-04 3.69447324e-04 9.99229371e-01]
 [9.99059517e-01 4.72484613e-04 4.67998709e-04]
 [9.99541848e-01 2.26015600e-04 2.32135996e-04]
 [9.99870086e-01 6.33257629e-05 6.65886519e-05]
 [9.99389822e-01 2.93721121e-04 3.16456819e-04]
 [9.99786555e-01 1.04006415e-04 1.09438415e-04]]


In [16]:
lda_tfidf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tfidf_result = lda_tfidf.fit_transform(tfidf_matrix)
print(lda_tfidf_result)

[[0.01322952 0.0132161  0.97355438]
 [0.02702501 0.94501093 0.02796406]
 [0.01847919 0.01879002 0.96273079]
 [0.02132493 0.95581157 0.0228635 ]
 [0.0369273  0.92404078 0.03903192]
 [0.01367734 0.01357951 0.97274316]
 [0.01685997 0.01659923 0.9665408 ]
 [0.02169244 0.02164742 0.95666014]
 [0.02081244 0.0205949  0.95859266]
 [0.02199317 0.02180831 0.95619852]
 [0.90828468 0.0453578  0.04635753]
 [0.94365216 0.02751833 0.02882951]
 [0.95612485 0.02127617 0.02259898]
 [0.94361264 0.02750418 0.02888317]
 [0.95082869 0.02388471 0.0252866 ]]


#### Visualizations

In [34]:
n_top_words = 10 

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        print(f"Topic #{topic_idx}: {' '.join(top_words)}")

print_top_words(lda_bow, count_vectorizer.get_feature_names_out(), n_top_words)

Topic #0: beethoven mozart music romania work year romanian includ first compos
Topic #1: desert central asia region border water climat kazakhstan asian area
Topic #2: ottoman war empir german state power germani world 000 forc
