# IMDB reviews exploration.
## Author: Vadym Tunik.

Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/)

In [1]:
import contextual_search_system_for_related_texts as c
from bow_model import BagOfWords, clean_text

### System that offers five IMDB reviews that are similar to the given one.
### (Intuition: a user who wrote a review will likely be interested in finding movies that evoke similar impressions.)

In [None]:
# Read the review texts from the configured folder; `fraction` controls how much
# of the folder is loaded.
texts = c.load_texts_from_folder(c.FOLDER_PATH, fraction=c.DATA_FRACTION)

# Fit the bag-of-words model on the corpus and vectorise it in one pass.
bow_model = BagOfWords(use_bigrams=c.USE_BIGRAMS)
bow_matrix = bow_model.fit_transform(
    texts,
    vocab_min_frequency=c.VOCAB_MIN_FREQUENCY,
)

# Look up the reviews closest to the chosen one using the cosine metric.
search_options = {
    "chosen_index": c.CHOSEN_TEXT_INDEX,
    "num_similar": c.NUM_RELATED_TO_FIND,
    "metric": 'cosine',
}
similar_indices, similar_distances = c.find_similar_texts(bow_matrix, **search_options)

def _snippet(text, limit):
    """Return `text` cut to at most `limit` characters, with '...' appended if anything was cut."""
    return text[:limit] + ('...' if len(text) > limit else '')


# Show the query review first so the matches listed below can be compared against it.
print("\n--- Similarity Results ---")
chosen_text_original = texts[c.CHOSEN_TEXT_INDEX]
cleaned_chosen_text = clean_text(chosen_text_original)
print(f"\nChosen Text #{c.CHOSEN_TEXT_INDEX} (Cleaned Snippet):")
print(_snippet(cleaned_chosen_text, c.CHAR_LIMIT_FOR_TEXT))

# Walk the matches in the order returned by the search, pairing each text index
# with its distance; ranks are 1-based for display.
print(f"\nTop {c.NUM_RELATED_TO_FIND} Most Similar Texts:")
for rank, (index, distance) in enumerate(zip(similar_indices, similar_distances), start=1):
    related_text_original = texts[index]
    cleaned_related_text = clean_text(related_text_original)

    print(f"\nRank {rank}: Text #{index} with Distance: {distance:.2f}")
    print(_snippet(cleaned_related_text, c.CHAR_LIMIT_FOR_TEXT))


Loading data from: C:\Users\duina\repo\DA\imdb_reviews_analysis\aclImdb\train\unsup
Attempting to load 50000 files (100.0% of total 50000 text files)...
Successfully loaded 50000 out of 50000 attempted texts.

Starting BoW process (use_bigrams=False)...
Step 1: Cleaning and tokenizing texts...
____ Processed 1000 texts...
____ Processed 2000 texts...
____ Processed 3000 texts...
____ Processed 4000 texts...
____ Processed 5000 texts...
____ Processed 6000 texts...
____ Processed 7000 texts...
____ Processed 8000 texts...
____ Processed 9000 texts...
____ Processed 10000 texts...
____ Processed 11000 texts...
____ Processed 12000 texts...
____ Processed 13000 texts...
____ Processed 14000 texts...
____ Processed 15000 texts...
____ Processed 16000 texts...
____ Processed 17000 texts...
____ Processed 18000 texts...
____ Processed 19000 texts...
____ Processed 20000 texts...
____ Processed 21000 texts...
____ Processed 22000 texts...
____ Processed 23000 texts...
____ Processed 24000 te