In [None]:
!pip install selenium
!pip install selenium webdriver-manager

# Install Google Chrome browser for Selenium to use
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list
!apt-get update
!apt-get install -y google-chrome-stable

OK
Get:1 http://dl.google.com/linux/chrome/deb stable InRelease [1,825 B]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 https://cli.github.com/packages stable InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:7 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:9 http://dl.google.com/linux/chrome/deb stable/main amd64 Packages [1,212 B]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 3,037 B in 1s (2,725 B/s)
Reading package lists... Done
W: http://dl.google.com/linux/chrome/deb/dists/stable/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.
W

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

# ----------------------------
# Chrome configuration
# ----------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080") # Define a window size
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--proxy-server='direct://'")
chrome_options.add_argument("--proxy-bypass-list=*")
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
)
chrome_options.binary_location = '/usr/bin/google-chrome' # Set binary location to google-chrome

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)

wait = WebDriverWait(driver, 15)

# ----------------------------
# IMDb URL (2024 movies)
# ----------------------------
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31"
driver.get(url)

# ----------------------------
# Wait for first batch
# ----------------------------
wait.until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//li[contains(@class,"ipc-metadata-list-summary-item")]')
    )
)

# ----------------------------
# Click "Load more" until finished
# ----------------------------
last_movie_count = 0
while True:
    # Get current number of movie items
    movie_items_on_page = driver.find_elements(
        By.XPATH,
        '//li[contains(@class,"ipc-metadata-list-summary-item")]'
    )
    current_movie_count = len(movie_items_on_page)

    # Break if no new movies were loaded in the last iteration
    if current_movie_count == last_movie_count and current_movie_count > 0:
        print("No new movies loaded, breaking loop.")
        break

    last_movie_count = current_movie_count

    try:
        # Updated XPath for the 'Show More' button, specifically looking for '50 more'
        load_more = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//button[.//span[normalize-space(text())="50 more"]]')
            )
        )

        # Scroll into view
        driver.execute_script(
            "arguments[0].scrollIntoView({behavior:'smooth', block:'center'});",
            load_more
        )
        time.sleep(1) # Small delay after scrolling

        load_more.click()
        time.sleep(5) # Increased time to allow new content to load more reliably

    except TimeoutException:
        # No more "Load more" button, or button not clickable after waiting
        print("Load More button not found or not clickable, breaking loop.")
        break
    except StaleElementReferenceException:
        # If the button becomes stale, re-find it in the next iteration
        print("StaleElementReferenceException caught, retrying...")
        time.sleep(1) # Wait a bit before retrying
        continue

# ----------------------------
# Scrape all movie titles and storylines
# ----------------------------
movie_items = driver.find_elements(
    By.XPATH,
    '//li[contains(@class,"ipc-metadata-list-summary-item")]'
)

print("Total movies scraped:", len(movie_items))

titles = []
storyLines = []
seen_titles = set() # Use a set to track unique titles to avoid duplicates

for movie in movie_items:
    try:
        title = movie.find_element(
            By.XPATH,
            './/h3[contains(@class,"ipc-title__text")]'
        ).text

        # Only append title and storyline if the title hasn't been seen yet
        if title not in seen_titles:
            storyLine = movie.find_element(
                By.XPATH,
                './/div[contains(@class,"ipc-html-content-inner-div")]'
            ).text

            titles.append(title)
            storyLines.append(storyLine)
            seen_titles.add(title) # Add the title to the set of seen titles

    except StaleElementReferenceException:
        continue
    except Exception:
        # Catch other exceptions like NoSuchElementException if an element is missing for a movie
        continue

# ----------------------------
# Create DataFrame
# ----------------------------
df = pd.DataFrame({"Title": titles,"StoryLine":storyLines})

# Close browser
driver.quit()


# # Save the DataFrame to a CSV file (optional)
df.to_csv('imdb_movies_2024.csv', index=False)

# Show output
df

No new movies loaded, breaking loop.
Total movies scraped: 5250


Unnamed: 0,Title,StoryLine
0,1. The Substance,A fading celebrity takes a black-market drug: ...
1,2. The Life of Chuck,"A life-affirming, genre-bending story about th..."
2,3. Bone Lake,A couple's vacation at a secluded estate is up...
3,4. Anora,A young stripper from Brooklyn meets and impul...
4,5. Eden,Based on a factual account of a group of outsi...
...,...,...
5094,5246. Juni,Partha is a young chef in Bangalore who falls ...
5095,5247. Timing,Two young ex-lovers randomly reconnect.
5096,5248. Lakatabu,When a man begins to use his powers for crimin...
5097,5249. Emakku Thozhil Romance,Assistant director Umashankar falls for Leo. M...


In [9]:
import pandas as pd

imdb_df = pd.read_csv('/content/imdb_movies_2024.csv')
imdb_df

Unnamed: 0,Movie Name,Storyline
0,1. The Substance,A fading celebrity takes a black-market drug: ...
1,2. The Life of Chuck,"A life-affirming, genre-bending story about th..."
2,3. Bone Lake,A couple's vacation at a secluded estate is up...
3,4. Anora,A young stripper from Brooklyn meets and impul...
4,5. Eden,Based on a factual account of a group of outsi...
...,...,...
5094,5246. Juni,Partha is a young chef in Bangalore who falls ...
5095,5247. Timing,Two young ex-lovers randomly reconnect.
5096,5248. Lakatabu,When a man begins to use his powers for crimin...
5097,5249. Emakku Thozhil Romance,Assistant director Umashankar falls for Leo. M...


In [10]:
imdb_df['Movie Name'] = imdb_df['Movie Name'].str.replace(r'^\d+\.\s*', '', regex=True)

In [11]:
imdb_df

Unnamed: 0,Movie Name,Storyline
0,The Substance,A fading celebrity takes a black-market drug: ...
1,The Life of Chuck,"A life-affirming, genre-bending story about th..."
2,Bone Lake,A couple's vacation at a secluded estate is up...
3,Anora,A young stripper from Brooklyn meets and impul...
4,Eden,Based on a factual account of a group of outsi...
...,...,...
5094,Juni,Partha is a young chef in Bangalore who falls ...
5095,Timing,Two young ex-lovers randomly reconnect.
5096,Lakatabu,When a man begins to use his powers for crimin...
5097,Emakku Thozhil Romance,Assistant director Umashankar falls for Leo. M...


In [12]:
print(imdb_df.head())
print(imdb_df.info())

          Movie Name                                          Storyline
0      The Substance  A fading celebrity takes a black-market drug: ...
1  The Life of Chuck  A life-affirming, genre-bending story about th...
2          Bone Lake  A couple's vacation at a secluded estate is up...
3              Anora  A young stripper from Brooklyn meets and impul...
4               Eden  Based on a factual account of a group of outsi...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5099 entries, 0 to 5098
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie Name  5099 non-null   object
 1   Storyline   5099 non-null   object
dtypes: object(2)
memory usage: 79.8+ KB
None


In [17]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Attempt to download stopwords
try:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
except Exception as e:
    print(f"NLTK download failed: {e}")
    # Fallback to a basic list of English stop words
    stop_words = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers/unnecessary characters (optional, but requested "unnecessary characters")
    text = re.sub(r'\d+', '', text)
    # Remove stop words
    words = text.split()
    words = [w for w in words if w not in stop_words]
    # Join and strip
    return " ".join(words).strip()

# Apply cleaning
imdb_df['Cleaned_Storyline'] = imdb_df['Storyline'].apply(clean_text)

# Show progress
print(imdb_df[['Storyline', 'Cleaned_Storyline']].head())
imdb_df

                                           Storyline  \
0  A fading celebrity takes a black-market drug: ...   
1  A life-affirming, genre-bending story about th...   
2  A couple's vacation at a secluded estate is up...   
3  A young stripper from Brooklyn meets and impul...   
4  Based on a factual account of a group of outsi...   

                                   Cleaned_Storyline  
0  fading celebrity takes blackmarket drug cellre...  
1  lifeaffirming genrebending story three chapter...  
2  couples vacation secluded estate upended theyr...  
3  young stripper brooklyn meets impulsively marr...  
4  based factual account group outsiders settle r...  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Movie Name,Storyline,Cleaned_Storyline
0,The Substance,A fading celebrity takes a black-market drug: ...,fading celebrity takes blackmarket drug cellre...
1,The Life of Chuck,"A life-affirming, genre-bending story about th...",lifeaffirming genrebending story three chapter...
2,Bone Lake,A couple's vacation at a secluded estate is up...,couples vacation secluded estate upended theyr...
3,Anora,A young stripper from Brooklyn meets and impul...,young stripper brooklyn meets impulsively marr...
4,Eden,Based on a factual account of a group of outsi...,based factual account group outsiders settle r...
...,...,...,...
5094,Juni,Partha is a young chef in Bangalore who falls ...,partha young chef bangalore falls juni regular...
5095,Timing,Two young ex-lovers randomly reconnect.,two young exlovers randomly reconnect
5096,Lakatabu,When a man begins to use his powers for crimin...,man begins use powers criminal activities thre...
5097,Emakku Thozhil Romance,Assistant director Umashankar falls for Leo. M...,assistant director umashankar falls leo misund...


In [18]:
imdb_df.to_csv('cleaned_imdb_movies_2024.csv', index=False)

In [20]:
# Tokenize the Cleaned_Storyline
imdb_df['Tokens'] = imdb_df['Cleaned_Storyline'].fillna('').apply(lambda x: x.split())
imdb_df.to_csv('tokenized_imdb_movies_2024.csv', index=False)

print(imdb_df[['Movie Name', 'Cleaned_Storyline', 'Tokens']].head())

          Movie Name                                  Cleaned_Storyline  \
0      The Substance  fading celebrity takes blackmarket drug cellre...   
1  The Life of Chuck  lifeaffirming genrebending story three chapter...   
2          Bone Lake  couples vacation secluded estate upended theyr...   
3              Anora  young stripper brooklyn meets impulsively marr...   
4               Eden  based factual account group outsiders settle r...   

                                              Tokens  
0  [fading, celebrity, takes, blackmarket, drug, ...  
1  [lifeaffirming, genrebending, story, three, ch...  
2  [couples, vacation, secluded, estate, upended,...  
3  [young, stripper, brooklyn, meets, impulsively...  
4  [based, factual, account, group, outsiders, se...  


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [28]:
cleaned_imdb_df = pd.read_csv('/content/cleaned_imdb_movies_2024.csv')
cleaned_imdb_df['Cleaned_Storyline'] = cleaned_imdb_df['Cleaned_Storyline'].fillna('')

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

top_n = 5
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_imdb_df['Cleaned_Storyline'])

user_input = "A broker of lucrative payoffs between corrupt corporations and the individuals who threaten them breaks his own rules."
cleaned_user_input = clean_text(user_input)
user_vector = vectorizer.transform([cleaned_user_input])

cosine_sim_user = cosine_similarity(user_vector, tfidf_matrix)
top_indices = cosine_sim_user.argsort()[0][-top_n:][::-1]

top_5_imdb_user_movie_recommendations = cleaned_imdb_df.iloc[top_indices]
print(top_5_imdb_user_movie_recommendations)


         Movie Name                                          Storyline  \
5             Relay  A broker of lucrative payoffs between corrupt ...   
1990    Sorgavaasal  A common man imprisoned in a corrupt system qu...   
4006  The Shakedown  When his mistress threatens to expose the secr...   
3157   Dandupalayam  Based on the infamous criminal gang that murde...   
2383         Bheema  Bheema, an orphan raised by Ramanna, crosses p...   

                                      Cleaned_Storyline  
5     broker lucrative payoffs corrupt corporations ...  
1990  common man imprisoned corrupt system questions...  
4006  mistress threatens expose secret affair respec...  
3157  based infamous criminal gang murdered countles...  
2383  bheema orphan raised ramanna crosses paths dra...  


In [36]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_imdb_df['Cleaned_Storyline'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

result_df = pd.concat([cleaned_imdb_df[['Movie Name']], tfidf_df], axis=1)
result_df.to_csv('tfidf_imdb_vectors.csv', index=False)

print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print("Top 10 features (words) in the vocabulary:")
print(tfidf_vectorizer.get_feature_names_out()[:10])

TF-IDF Matrix Shape: (5099, 1000)
Top 10 features (words) in the vocabulary:
['abandoned' 'accident' 'accidentally' 'accused' 'across' 'act' 'actor'
 'actress' 'adult' 'adventure']


In [38]:
# Calculate Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

top_n = 5
results = []

indices = pd.Series(result_df.index, index=result_df['Movie Name']).drop_duplicates()

for idx, row in result_df.iterrows():
    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]

    movie_indices = [i[0] for i in sim_scores]
    similarity_values = [i[1] for i in sim_scores]

    similar_movies = result_df['Movie Name'].iloc[movie_indices].values
    results.append({
        'Movie Name': row['Movie Name'],
        'Top 1 Similar': similar_movies[0] if len(similar_movies) > 0 else None,
        'Score 1': similarity_values[0] if len(similarity_values) > 0 else 0,
        'Top 2 Similar': similar_movies[1] if len(similar_movies) > 1 else None,
        'Score 2': similarity_values[1] if len(similarity_values) > 1 else 0,
        'Top 3 Similar': similar_movies[2] if len(similar_movies) > 2 else None,
        'Score 3': similarity_values[2] if len(similarity_values) > 2 else 0,
        'Top 4 Similar': similar_movies[3] if len(similar_movies) > 3 else None,
        'Score 4': similarity_values[3] if len(similarity_values) > 3 else 0,
        'Top 5 Similar': similar_movies[4] if len(similar_movies) > 3 else None,
        'Score 5': similarity_values[4] if len(similarity_values) > 3 else 0
    })

similarity_df = pd.DataFrame(results)
similarity_df.to_csv('movie_similarity_results.csv', index=False)

print(similarity_df.head(10))

          Movie Name                   Top 1 Similar   Score 1  \
0      The Substance  The Opera! Arie per un'eclissi  0.144300   
1  The Life of Chuck                  Depth of Field  0.185673   
2          Bone Lake                        Get Away  0.197535   
3              Anora                  Out of Control  0.169792   
4               Eden                          Bagman  0.154680   
5              Relay                   The Shakedown  0.144765   
6     Dune: Part Two                           Obraz  0.153214   
7          Civil War            Dreams in Nightmares  0.181632   
8  He's Watching You                   Crescent City  0.222426   
9       Gladiator II                      Gladiators  0.156869   

                 Top 2 Similar   Score 2                Top 3 Similar  \
0            Look to the Light  0.126998                 A Good Man 2   
1  Nanban Oruvan Vantha Piragu  0.129884              Sister Midnight   
2                Ten Toes Down  0.141703              

In [35]:
# Load the similarity results - For Ranking
similarity_df = pd.read_csv('/content/movie_similarity_results.csv')

ranked_pairs = similarity_df[['Movie Name', 'Top 1 Similar', 'Score 1']].sort_values(by='Score 1', ascending=False)
ranked_pairs.columns = ['Movie A', 'Movie B', 'Similarity Score']
ranked_pairs.to_csv('ranked_movie_similarities.csv', index=False)

print("Top 10 Most Similar Movie Pairs:")
print(ranked_pairs.head(10))

Top 10 Most Similar Movie Pairs:
                                                Movie A  \
630   Justice League: Crisis on Infinite Earths - Pa...   
409                                      Piece by Piece   
5009                                       Satyashodhak   
455   Justice League: Crisis on Infinite Earths - Pa...   
3496                                     Rebirth Island   
4304                    Cruel Summer III: Pray for Fall   
1055                                                Por   
2804                                              Dange   
474                                         Silent Love   
675                                         Uranus 2324   

                                                Movie B  Similarity Score  
630   Justice League: Crisis on Infinite Earths - Pa...          1.000000  
409                                        Satyashodhak          1.000000  
5009                                       Satyashodhak          1.000000  
455   Justice