# Tasks for laboratory assignment 2

In [None]:
# imports section

import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from autocorrect import Speller
import nltk
from wordcloud import WordCloud

import gensim as gs
from gensim.test.utils import lee_corpus_list
from gensim.models import Word2Vec, TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
import gensim.downloader as api

import matplotlib.pyplot as plt

import json

# Download the NLTK data packages used by this notebook.
# NOTE(review): presumably 'punkt'/'punkt_tab' back word_tokenize, 'wordnet'
# backs WordNetLemmatizer and 'stopwords' backs stopwords.words — confirm
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')


## Extract data from the json file 

Write a script that extracts all the data from the JSON file and returns it as a list of texts for use in the following tasks.

In [None]:
def extract_texts_from_file(file_path):
    """
    Extract the text data from the json file

    The file must contain a JSON object with a top-level ``"texts"`` key;
    the value stored under that key is returned unchanged.

    Args:
        file_path (str): Relative location of the json file in the project.

    Returns:
        list: The list of texts as strings.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON object has no ``"texts"`` key.
    """
    # Read explicitly as UTF-8 so that non-ASCII texts decode the same way
    # on every platform instead of depending on the locale's default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['texts']

# Load the raw texts from the JSON resource file and print them for inspection.
texts = extract_texts_from_file('resources/data.json')
print(texts)

## Process the texts

Write a script that processes these texts. Processing should include:

- Lowercasing
- Removing Punctuation
- Removing Special Characters and Numbers
- Stemming or Lemmatization
- Handling Abbreviations
- Spelling Correction

In [None]:
def process_texts(texts):
    """
    Normalise raw texts into space-separated strings of lemmatized tokens.

    Every text is lowercased and tokenized. Tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) and English stop words
    are discarded, and the surviving tokens are lemmatized with WordNet.

    Args:
        texts (list): list of str texts to be processed.

    Returns:
        list: One processed string per input text, in the input order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    english_stop_words = set(stopwords.words('english'))

    def clean(raw_text):
        # str.isalpha() is False for any token containing a digit, so a
        # separate isdigit() test is not needed.
        kept_words = (
            word
            for word in word_tokenize(raw_text.lower())
            if word.isalpha() and word not in english_stop_words
        )
        return ' '.join(lemmatize(word) for word in kept_words)

    return [clean(raw_text) for raw_text in texts]

# Clean the loaded texts and print the processed strings for inspection.
texts_processed = process_texts(texts)
print(texts_processed)

## Visualize as cloud of words

Visualize the texts as a cloud of words.

In [None]:
def visualize_texts(texts_processed):
    """
    Show a single word cloud built from all processed texts.

    The texts are joined with spaces into one string, rendered as an
    800x400 word cloud and displayed in a 10x5 matplotlib figure with the
    axes hidden.

    Args:
        texts_processed (list): list of texts, processed in previous task.

    Returns:
        None: None.
    """
    cloud = WordCloud(width=800, height=400)
    cloud.generate(' '.join(texts_processed))

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Render the word cloud for the processed texts.
visualize_texts(texts_processed)

## Calculate cosine similarities using TFIDF

Calculate the TFIDF score and cosine similarity between the texts. You may use gensim, write your own script using numpy or use other module. Output the result as a matrix $n\times n$ of cosine similarity scores (where $n$ is the length of texts list).

**Extra credit**: use word2vec word embeddings in vector spaces to get better TFIDF scores by representing each word as a vector.

In [None]:
def cosine_similarity_matrix(matrix):
    """
    Calculate the pairwise cosine similarity between the rows of a matrix.

    Args:
        matrix: 2-D array-like with one vector per row.

    Returns:
        numpy.ndarray: Square matrix whose entry (i, j) is the cosine
        similarity of rows i and j. All-zero rows yield similarity 0.
    """
    row_lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Divide all-zero rows by 1 instead of 0 so they stay zero vectors.
    safe_lengths = np.where(row_lengths == 0, 1, row_lengths)
    unit_rows = matrix / safe_lengths
    return np.dot(unit_rows, unit_rows.T)

In [None]:
def calcuate_tfidf(texts_processed):
    """
    Calculate TFIDF-weighted word2vec similarity between the texts in the list.

    Each text is split on whitespace. A gensim TFIDF model supplies a
    weight per word, a Word2Vec model trained on the same texts supplies an
    embedding per word, and every document is represented by the
    TFIDF-weighted average of its word embeddings. The cosine similarity of
    these document vectors is returned.

    Args:
        texts_processed (list): list of texts, processed in previous task.

    Returns:
        numpy.ndarray: an n x n matrix of cosine similarity scores, where n
        is the number of texts. If no text contains any token the matrix is
        all zeros.
    """
    tokenized_texts = [text.split() for text in texts_processed]

    # Without a single token there is nothing to build a vocabulary from,
    # so return the result zero document vectors would give.
    if not any(tokenized_texts):
        n_docs = len(tokenized_texts)
        return np.zeros((n_docs, n_docs))

    dictionary = Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]

    # NOTE(review): gensim documents that training with workers > 1 is not
    # bit-for-bit reproducible — confirm whether stable output is required.
    word2vec_model = Word2Vec(
        sentences=tokenized_texts,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
        sg=0
    )
    word_vectors = word2vec_model.wv

    document_vectors = []
    for doc_bow in tfidf_corpus:
        doc_vector = np.zeros(word_vectors.vector_size)
        total_weight = 0.0

        for word_id, tfidf_score in doc_bow:
            word = dictionary[word_id]

            if word in word_vectors:
                doc_vector += word_vectors[word] * tfidf_score
                total_weight += tfidf_score

        # Normalise by the summed weights to get a weighted average; a
        # document with no weighted words keeps its zero vector.
        if total_weight > 0:
            doc_vector = doc_vector / total_weight

        document_vectors.append(doc_vector)

    return cosine_similarity_matrix(np.array(document_vectors))

# Compute the document similarity matrix and print it for inspection.
cosine_matrix = calcuate_tfidf(texts_processed)
print(cosine_matrix)

## Visualize data

Visualize the `cosine_matrix` data using matplotlib. Choose your own method of plotting the scores so that the similarity indices between texts are instantly visible. Also plot some data about the texts (the first couple of words of each document).

*Hint: remember heat maps, aren't they nice for this task?*

In [None]:
def visualize_data(cosine_matrix, texts_processed):
    """
    Draw the cosine similarity matrix as an annotated heat map.

    Scores are clipped to [-1, 1] and drawn on a fixed -1..1 colour scale.
    Both axes are labelled with the first five words of each text (cut to
    40 characters), and cells on the diagonal or with an absolute score of
    at least 0.1 are annotated with their value.

    Args:
        cosine_matrix (list): matrix of cosine similarity scores.
        texts_processed (list): list of texts, processed in previous task.

    Returns:
        None: None.
    """
    scores = np.clip(np.array(cosine_matrix), -1, 1)

    # Tick labels: the first five words, shortened to 37 chars plus '...'
    # whenever the preview is longer than 40 characters.
    previews = [' '.join(text.split()[:5]) for text in texts_processed]
    labels = [
        preview[:37] + '...' if len(preview) > 40 else preview
        for preview in previews
    ]
    n_labels = len(labels)
    tick_positions = np.arange(n_labels)

    _, ax = plt.subplots(figsize=(14, 12))

    heatmap = ax.imshow(scores, cmap='RdYlBu_r', aspect='auto', vmin=-1, vmax=1)

    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=9)
    ax.set_yticklabels(labels, fontsize=9)

    colorbar = plt.colorbar(heatmap, ax=ax, fraction=0.046, pad=0.04)
    colorbar.set_label('Cosine Similarity', rotation=270, labelpad=20, fontsize=12)

    min_annotated = 0.1
    for row in range(n_labels):
        for col in range(n_labels):
            score = scores[row, col]
            if row != col and abs(score) < min_annotated:
                continue
            # White text on the strongly coloured extremes, black elsewhere.
            if abs(score) > 0.5:
                font_color = "white"
            else:
                font_color = "black"
            ax.text(col, row, f'{score:.2f}',
                    ha="center", va="center",
                    color=font_color,
                    fontsize=8, fontweight='bold')

    ax.set_title("Cosine Similarity Matrix between Texts",
                 pad=20, fontsize=14, fontweight='bold')

    # Minor ticks sit on the cell borders so the grid outlines every cell.
    ax.set_xticks(tick_positions - 0.5, minor=True)
    ax.set_yticks(tick_positions - 0.5, minor=True)
    ax.grid(which="minor", color="gray", linestyle='-', linewidth=0.5, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Plot the similarity heat map for the processed texts.
visualize_data(cosine_matrix, texts_processed)

## Analyse your own text using TFIDF (or any other method)

Analyse your own text using previous methods. You may use API fetching to get text data, download texts from Kaggle or use any texts you want. Find cosine similarities and visualize them.

In [None]:
def get_texts():
    """
    Provide the custom texts to analyse.

    The texts are hard-coded here; no API or dataset is queried.

    Args:
        None: None.

    Returns:
        list: list of texts.
    """
    return [
        "Lord of the Mystery best anime?",
        "Best game of the year Expedition 33?",
        "Haiuk Yevhen hardcore gamer",
        "Twitch is dying?",
    ]

# Run the full pipeline on the custom texts: clean them, show the word
# cloud, compute the similarity matrix and plot it as a heat map.
texts = get_texts()
texts_processed = process_texts(texts)
visualize_texts(texts_processed)
cosine_matrix = calcuate_tfidf(texts_processed)
visualize_data(cosine_matrix, texts_processed)