# IMDB reviews exploration.
## Author: Vadym Tunik.

Dataset: Large Movie Review Dataset https://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
from contextual_search_system_for_related_texts import *

### A system that suggests five IMDB reviews similar to a given one.
(Our intuition: a user who wrote a review will be interested in finding movies that evoke similar impressions.)

In [3]:
# --- Load the reviews and build the bag-of-words matrix ---
# All UPPER_CASE settings come from the star-imported project module.
texts = load_texts_from_folder(FOLDER_PATH, fraction=DATA_FRACTION)
bow_model = BagOfWords(use_bigrams=USE_BIGRAMS)
bow_matrix = bow_model.fit_transform(texts, vocab_min_frequency=VOCAB_MIN_FREQUENCY)

# --- Look up the reviews closest to the chosen one ---
# The two results are consumed below as parallel sequences: position k of
# `similar_indices` pairs with position k of `similar_distances`.
similar_indices, similar_distances = find_similar_texts(
    bow_matrix,
    chosen_index=CHOSEN_TEXT_INDEX,
    num_similar=NUM_RELATED_TO_FIND,
    metric='cosine',
)


def _snippet(cleaned_text):
    """Cut text to CHAR_LIMIT_FOR_TEXT characters, appending '...' if anything was dropped."""
    if len(cleaned_text) > CHAR_LIMIT_FOR_TEXT:
        return cleaned_text[:CHAR_LIMIT_FOR_TEXT] + '...'
    return cleaned_text


# --- Report: the chosen review first, then its neighbours in returned order ---
print("\n--- Similarity Results ---")
chosen_text_original = texts[CHOSEN_TEXT_INDEX]
cleaned_chosen_text = clean_text(chosen_text_original)
print(f"\nChosen Text #{CHOSEN_TEXT_INDEX} (Cleaned Snippet):")
print(_snippet(cleaned_chosen_text))

print(f"\nTop {NUM_RELATED_TO_FIND} Most Similar Texts:")
# Ranks are 1-based for display; snippets go through clean_text before truncation.
for rank, (index, distance) in enumerate(zip(similar_indices, similar_distances), start=1):
    print(f"\nRank {rank}: Text #{index} with Distance: {distance:.2f}")
    print(_snippet(clean_text(texts[index])))


Loading data from: C:\Users\duina\repo\DA\imdb_reviews_analysis\aclImdb\train\unsup
Attempting to load 2500 files (5.0% of total 50000 text files)...
Successfully loaded 2500 out of 2500 attempted texts.

Starting BoW process (use_bigrams=False)...
Step 1: Cleaning and tokenizing texts...
____ Processed 1000 texts...
____ Processed 2000 texts...
____ Initial token count (unique): 29740
Step 2: Building final vocabulary...
____ Postprocessing: Min Freq=5, Stopwords=Yes, Stemming=Yes
____ Words after frequency filter: 7962
____ Words after stopword filter: 7832
____ Unique stemmed words: 5337
____ Final vocabulary size: 5337
Step 3: Creating BoW matrix...
____ Vectorized 1000 texts...
____ Vectorized 2000 texts...
____ BoW matrix created. Shape: (2500, 5337)

BoW process finished.
____ Final Matrix Shape: (2500, 5337)
____ Total time: 5.12 seconds

Calculating similarity using 'cosine' distance...

--- Similarity Results ---

Chosen Text #2025 (Cleaned Snippet):
how do we beginthe presi