### Global Code and Functions
Run this first to import modules and global functions

In [None]:
#Modules
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from IPython.display import Image, display
import re
import math
import torch

# Models
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity


# Global Functions

def get_top50_ann(target_embedding, embeddings):
    """Return the indices of up to 50 nearest neighbours by cosine distance.

    A brute-force cosine-distance search is run over ``embeddings`` and the
    closest hit is dropped, on the assumption that ``target_embedding`` is
    itself one of the rows of ``embeddings`` and therefore ranks first.

    Args:
        target_embedding: 1-D vector to query with.
        embeddings: 2-D collection of vectors to search.

    Returns:
        Array of row indices into ``embeddings``, nearest first. It holds 50
        entries, or fewer when ``embeddings`` has fewer than 51 rows.
    """
    n_samples = len(embeddings)
    if n_samples == 0:
        # Nothing to search; mirror get_top50_euclidean, which returns an empty result.
        return np.array([], dtype=int)

    # kneighbors() raises when asked for more neighbours than fitted samples,
    # so clamp the request for small collections.
    nn = NearestNeighbors(n_neighbors=min(51, n_samples), metric='cosine', algorithm='brute')
    nn.fit(embeddings)
    _, indices = nn.kneighbors([target_embedding])
    return indices[0][1:]  # Skip the first index because it's the target itself

def get_top50_euclidean(target_embedding, embeddings):
    """Return the indices of up to 50 nearest neighbours by Euclidean distance.

    Every row of ``embeddings`` is compared with ``target_embedding``; rows are
    ranked by ascending distance and the closest one is dropped, on the
    assumption that the target is itself part of ``embeddings``.
    """
    dists = []
    for candidate in embeddings:
        dists.append(distance.euclidean(target_embedding, candidate))

    ranked = np.argsort(dists)
    # Position 0 is expected to be the target itself, so keep positions 1..50.
    return ranked[1:51]

def display_image(index):
    """Show inline the thumbnail of the frame at position ``index``.

    The file name is read from the ``'input'`` field of the matching entry in
    the module-level ``g_movie_embeddings`` list.
    """
    thumbnail_name = g_movie_embeddings[index]['input']
    thumbnail = Image(filename=f"thumbnails_folder2large/{thumbnail_name}")
    display(thumbnail)

def display_images(indices, embeddings=None):
    """Show up to 100 frame thumbnails on a 10 x 10 grid.

    Args:
        indices: positions into the module-level ``g_movie_embeddings`` list.
            Only the first 100 are drawn; when fewer are given the remaining
            grid cells are left blank.
        embeddings: unused. Thumbnails are always looked up in
            ``g_movie_embeddings``; the parameter is kept (now optional) so
            existing two-argument calls keep working.
    """
    fig, axes = plt.subplots(10, 10, figsize=(20, 10))
    n_images = len(indices)
    for i, ax in enumerate(axes.flat):
        # Guard against clusters / result lists with fewer than 100 entries.
        if i < n_images:
            ax.imshow(plt.imread(f"thumbnails_folder2large/{g_movie_embeddings[indices[i]]['input']}"))
        ax.axis('off')
    plt.show()

def display_images_first_x_last_x(indices, first_x, last_x, cluster_n=0):
    """Show the first ``first_x`` and last ``last_x`` thumbnails of a cluster.

    Args:
        indices: positions into the module-level ``g_movie_embeddings`` list
            (any sequence, including a NumPy array).
        first_x: number of thumbnails to take from the start of ``indices``.
        last_x: number of thumbnails to take from the end of ``indices``.
        cluster_n: cluster id, used only in the figure title.

    When the cluster has fewer than ``first_x + last_x`` members each image is
    shown once instead of repeating the overlapping ones.
    """
    # Work on a plain list so "+" concatenates even when a NumPy array is passed.
    indices = list(indices)
    total = len(indices)

    # Clamp both ends so the head and tail never overlap, and so last_x == 0
    # selects nothing (indices[-0:] would otherwise return the whole list).
    head_count = max(0, min(first_x, total))
    tail_count = max(0, min(last_x, total - head_count))
    selected_indices = indices[:head_count] + indices[total - tail_count:]

    # Size the grid from what will actually be drawn; keep at least one row
    # because plt.subplots cannot build a grid with zero rows.
    cols = 10
    rows = max(1, math.ceil(len(selected_indices) / cols))

    fig, axes = plt.subplots(rows, cols, figsize=(20, 2 * rows))
    axes = axes.ravel()  # Flatten the axes array

    # Hide all axes; the ones that receive an image are switched back on below.
    for ax in axes:
        ax.axis('off')

    # Display images on the first len(selected_indices) axes
    for ax, idx in zip(axes, selected_indices):
        ax.imshow(plt.imread(f"thumbnails_folder2large/{g_movie_embeddings[idx]['input']}"))
        ax.axis('on')

    # A figure-level title; plt.title would only label the last subplot.
    fig.suptitle(f"Cluster {cluster_n} -  First {first_x} and Last {last_x} Images - Total Images in Cluster: {len(indices)}")
    plt.tight_layout()
    plt.show()

def display_cluster_images(cluster_labels, cluster_number):
    """Show the thumbnails of the frames assigned to one cluster.

    Args:
        cluster_labels: per-frame cluster ids, in the same order as the
            module-level ``g_movie_embeddings`` list.
        cluster_number: id of the cluster to display.
    """
    # Get indices of images in the cluster
    indices = [i for i, label in enumerate(cluster_labels) if label == cluster_number]

    # display_images takes (indices, embeddings); the previous one-argument
    # call raised a TypeError.
    display_images(indices, g_movie_embeddings)

def display_cluster_images_first_last_x(cluster_labels, cluster_number, first_x, last_x):
    """Show the first and last thumbnails of the frames in one cluster.

    Collects the positions whose label equals ``cluster_number`` and hands
    them to ``display_images_first_x_last_x`` together with the two counts.
    """
    members = []
    for position, label in enumerate(cluster_labels):
        if label == cluster_number:
            members.append(position)

    display_images_first_x_last_x(members, first_x, last_x, cluster_number)

def find_and_remove_intro_and_subtitles(g_only_embeddings, threshold=0.7):
    """Drop intro and closing-credit frames, keeping metadata and vectors aligned.

    Text prompts describing the studio intro and the end credits are embedded
    with CLIP and compared (cosine similarity) with every frame embedding. A
    frame is removed when its best-matching prompt scores above ``threshold``.

    The matching entries are deleted in place from the module-level
    ``g_movie_embeddings`` list. The array passed in is not modified: the
    filtered copy is returned and the caller must keep it, otherwise the
    vectors and ``g_movie_embeddings`` no longer line up row for row.

    Args:
        g_only_embeddings: 2-D array of frame embeddings, one row per entry of
            ``g_movie_embeddings`` and in the same order.
        threshold: cosine-similarity cut-off above which a frame is removed.

    Returns:
        The embeddings with the intro / credit rows removed.
    """
    frame_embeddings = np.asarray(g_only_embeddings)

    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

    # Get the text embeddings for the intro and subtitles
    inputs = processor(text=["Image of Walt Disney Movie Intro", "Walt Disney Movie Intro", "Movie closing credits", "Movie end credits", "Image with lots of closing credits"], return_tensors="pt", padding=True)
    text_embeddings = model.get_text_features(**inputs)
    text_embeddings_np = text_embeddings.detach().numpy()

    # Rows are prompts, columns are frames.
    similarities = cosine_similarity(text_embeddings_np, frame_embeddings)

    # A frame is flagged when any prompt matches it above the threshold.
    remove_mask = similarities.max(axis=0) > threshold
    intro_subtitle_indices = np.flatnonzero(remove_mask)

    print("Number of images(Intro and Closing credits) to remove:", len(intro_subtitle_indices))

    # Delete from the end so earlier positions stay valid while removing.
    for idx in intro_subtitle_indices[::-1]:
        del g_movie_embeddings[idx]

    # One masked selection instead of re-copying the array once per removed row.
    return frame_embeddings[~remove_mask]


# Global Variables

with open("honey_i_shrunk_the_kids_movie_embeddings_1_second.json", encoding="utf-8") as embeddings_file:
    g_movie_embeddings = json.load(embeddings_file)
g_only_embeddings = np.array([emb['embedding'] for emb in g_movie_embeddings])
# Keep the returned array: the function cannot rebind this global itself, and
# without the assignment the removed frames would remain in g_only_embeddings
# while being gone from g_movie_embeddings.
g_only_embeddings = find_and_remove_intro_and_subtitles(g_only_embeddings, threshold=0.237)


### Cluster Analysis

We will compare clustering with t-SNE (t-Distributed Stochastic Neighbor Embedding) and PCA (Principal Component Analysis) dimensionality reduction algorithms.

#### t-SNE

In [None]:
# Project the frame embeddings down to 2-D with t-SNE.
# random_state is fixed so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=42)
# One 2-D point per row of g_only_embeddings, in the same order.
tSNE_embedded_vectors = tsne.fit_transform(g_only_embeddings)


#### PCA

In [None]:
# Project the frame embeddings down to 2-D with PCA.
# random_state is fixed, as for t-SNE and K-Means elsewhere in this notebook:
# the default solver may choose a randomized SVD, whose result otherwise
# varies between runs.
pca = PCA(n_components=2, random_state=42)
# One 2-D point per row of g_only_embeddings, in the same order.
PCA_embedded_vectors = pca.fit_transform(g_only_embeddings)

#### Cluster t-SNE and PCA with K-Means and display the Silhouette Score

**Silhouette Score**: Measures how similar an object is to its own cluster compared to other clusters. The score ranges from -1 to 1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

Let's find the best number of clusters for t-SNE and PCA...

In [None]:
def _plot_silhouette_scores(X, title, num_clusters=range(2, 20), random_state=42):
    """Run K-Means for each candidate k, plot the silhouette scores, return them.

    Args:
        X: 2-D array of points to cluster.
        title: plot title (the name of the projection being evaluated).
        num_clusters: candidate values of k.
        random_state: seed passed to K-Means.

    Returns:
        List of ``(k, silhouette_score)`` pairs, one per evaluated k.

    Raises:
        ValueError: if no candidate k satisfies ``2 <= k < len(X)``.
    """
    # silhouette_score needs at least 2 clusters and fewer clusters than samples.
    candidates = [k for k in num_clusters if 2 <= k < len(X)]
    if not candidates:
        raise ValueError("no candidate k satisfies 2 <= k < number of samples")

    # List to hold silhouette scores
    sil_scores = []
    for k in candidates:
        # Perform clustering and score the resulting labelling
        kmeans = KMeans(n_init="auto", n_clusters=k, random_state=random_state).fit(X)
        sil_scores.append(silhouette_score(X, kmeans.labels_))

    # Plot silhouette scores
    plt.plot(candidates, sil_scores, 'bx-')
    plt.title(title)
    plt.xlabel('k (number of clusters)')
    plt.ylabel('Silhouette Score')
    plt.show()

    # Report the winner so it does not have to be read off the plot.
    best = int(np.argmax(sil_scores))
    print(f"{title}: best k = {candidates[best]} (silhouette score {sil_scores[best]:.3f})")

    return list(zip(candidates, sil_scores))


# Range of clusters to try
num_clusters = range(2, 20)

# The sweep runs inside a function so its temporaries (cluster labels, fitted
# model) do not overwrite same-named variables in the notebook namespace.
tSNE_sil_scores = _plot_silhouette_scores(tSNE_embedded_vectors, 't-SNE', num_clusters)
PCA_sil_scores = _plot_silhouette_scores(PCA_embedded_vectors, 'PCA', num_clusters)

Let's cluster the t-SNE and PCA projections using the k values with the best Silhouette Scores.

In [None]:
def _show_clustered_projection(points, cluster_ids, point_labels, title):
    """Scatter-plot 2-D points coloured by cluster, with a text label per point."""
    sns.set_theme()
    plt.figure(figsize=(12, 8))  # Adjust the figure size as needed
    sns.scatterplot(
        x=points[:, 0],
        y=points[:, 1],
        hue=cluster_ids,
        palette='bright',
        legend='full',
        s=100,
    )
    # Small offset so the label does not sit on top of its marker.
    for row, vec in enumerate(points):
        plt.text(vec[0] + 0.02, vec[1] + 0.02, point_labels[row], fontsize=6)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title(title)
    plt.legend(title='Cluster')
    plt.show()


# K-Means on each projection, using the k picked from its silhouette plot.
kmeans = KMeans(n_clusters=16, n_init="auto", random_state=42)
tSNE_clusters = kmeans.fit_predict(tSNE_embedded_vectors)

kmeans = KMeans(n_clusters=4, n_init="auto", random_state=42)
PCA_clusters = kmeans.fit_predict(PCA_embedded_vectors)

# The first run of digits in each thumbnail file name is used as the point label.
labels = []
for vector in g_movie_embeddings:
    labels.append(re.search(r'\d+', vector['input']).group())

# t-SNE
_show_clustered_projection(
    tSNE_embedded_vectors,
    tSNE_clusters,
    labels,
    't-SNE Embedded Vectors with KMeans Clustering (k=16)',
)

# PCA
_show_clustered_projection(
    PCA_embedded_vectors,
    PCA_clusters,
    labels,
    'PCA Embedded Vectors with KMeans Clustering (k=4)',
)


#### Let's sample images in t-SNE clusters

In [None]:
# Distinct cluster ids produced by K-Means on the t-SNE projection.
unique_clusters = set(tSNE_clusters)

# Iterate in ascending id order: set iteration order is not guaranteed, and a
# fixed order keeps the figures in the same sequence on every run.
for cluster in sorted(unique_clusters):
    display_cluster_images_first_last_x(tSNE_clusters, cluster, 10, 10)

#### Let's sample images in PCA clusters

In [None]:
# Distinct cluster ids produced by K-Means on the PCA projection.
unique_clusters = set(PCA_clusters)

# Iterate in ascending id order: set iteration order is not guaranteed, and a
# fixed order keeps the figures in the same sequence on every run.
for cluster in sorted(unique_clusters):
    display_cluster_images_first_last_x(PCA_clusters, cluster, 10, 10)

In [None]:
# NOTE(review): `cluster` is not assigned in this cell. It holds whatever value
# the most recently executed `for cluster in ...` loop left behind, so which
# t-SNE cluster is shown here depends on cell execution order.
# TODO: confirm the intended cluster id and pass it explicitly.
display_cluster_images_first_last_x(tSNE_clusters, cluster, 192, 100)

In [None]:
def _show_cluster_timeline(times, cluster_ids, title):
    """Scatter cluster id against time, coloured by cluster id."""
    plt.figure(figsize=(12, 6))
    plt.scatter(times, cluster_ids, c=cluster_ids, cmap='viridis')
    plt.xlabel('Time')
    plt.ylabel('Cluster')
    plt.title(title)
    plt.colorbar(label='Cluster')
    plt.show()


# Turn the numeric file-name labels into timestamps by dividing by the frame rate.
# NOTE(review): this assumes each label is a frame number at 24 fps. The data
# file name mentions "1_second", so the labels may already be in seconds -
# confirm before relying on the time axis.
frame_rate = 24
timestamps = []
for label in labels:
    timestamps.append(int(label) / frame_rate)

# One timeline per projection: t-SNE clusters first, then PCA clusters.
_show_cluster_timeline(timestamps, tSNE_clusters, 't-SNE Clusters Over Time')
_show_cluster_timeline(timestamps, PCA_clusters, 'PCA Clusters Over Time')

### Film Description

In 'Honey, I Shrunk the Kids,' an eccentric inventor, Wayne Szalinski, accidentally shrinks his and his neighbor's children with his experimental shrink ray. The miniature kids must navigate a perilous journey across their now-gigantic backyard, encountering obstacles like insects and sprinklers, as they try to return home.

The film is notable for its creative visual effects that magnify ordinary environments into epic landscapes. It's a blend of adventure, humor, and family dynamics, ultimately showcasing the children's resourcefulness and the parents' determination to rescue their kids. The movie was a commercial success and spawned a franchise including sequels and a television series.

### Methods Summary

This section should highlight methods you used in your exploratory analysis. You should include at least one clustering technique or develop another way to relate frames to other frames. You should also consider dimensionality reduction.

Each thumbnail is one frame of the Movie. Each thumbnail has been analyzed and embedded via CLIP. Referenced Model: https://replicate.com/andreasjansson/clip-features/examples .


### Hunches and Hypotheses

This section should summarize the questions that you asked about the film that could potentially be answered by exploratory analysis. You should ask at least three questions.

**Hypothesis 1** 
-   Use CLIP to find an object, then observe ~5 frames before and after it and interpret the results.

**Hypothesis 2** 
-   Scene Consistency and Transition - Frames that are visually and thematically similar cluster together tightly in t-SNE and PCA visualizations, and distinct clusters correspond to different scenes or settings in the movie.
-  **Rationale:** This hypothesis tests the ability of CLIP embeddings, which capture both visual and semantic content, to differentiate between distinct scenes based on their visual content and thematic elements.

**Hypothesis 3**
-   Find a model that can analyze an image and count the objects in it, e.g. 2 trees, 1 dog, 3 humans, a house, etc.

### Results and Interpretation

**Hypothesis 1:**

**Hypothesis 2:** 
- After removing closing credits frames and applying t-SNE and K-means we examined samples from identified clusters and we can observe that frames correspond to specific scenes or types of scenes (indoor vs outdoor, calm vs action-packed). Embeddings effectively capture scene-specific features and can be used to segment the movie based on visual content. Also, clusters seem to reflect the proper timeframe and transition of the movie.




### Reflection

Reflect on your process of analysis. What worked well and did not work well? Describe the limitations of the work and describe what you would work on with more time.