In [1]:
from nltk.corpus import stopwords
import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [3]:
df_movies = pd.read_csv('../../data/MovieSummaries/movies_metadata_cleaned.csv')

In [5]:
def load_plot_summaries(file_path):
    ids_test = []
    movies_dict = {}

    # Open and read the file
    with open(file_path, 'r', encoding='utf-8') as file:

        # Read each line
        for line in file:
            line = line.strip()
            if '\t' in line:
                movie_id, description = line.split('\t', 1)

                # Store each movie as a dictionary entry
                movies_dict[int(movie_id)] = description
    return movies_dict

In [8]:
ids = df_movies["Wikipedia_movie_ID"].to_list()
movies_dict = load_plot_summaries("../../data/MovieSummaries/plot_summaries.txt")
movies_dict = {k: movies_dict[k] for k in movies_dict if k in ids}
movie_descriptions = list(movies_dict.values())
movie_ids = list(movies_dict.keys())

In [13]:
# Load the model
model_miniLM = SentenceTransformer("all-MiniLM-L6-v2")

In [11]:
movie_embeddings = model_miniLM.encode(movie_descriptions, batch_size=64)

In [14]:
# Encode the query
query_vietnam = model_miniLM.encode(["Vietnam war"])
query_cold_war = model_miniLM.encode(["Cold war nuclear ussr spies soviet"])


In [16]:
similarities_vietnam = cosine_similarity(query_vietnam, movie_embeddings).flatten()
ids_vietnam = [movie_ids[i] for i, sim in enumerate(similarities_vietnam) if sim > 0.44]
df_movies_vietnam = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_vietnam)]
df_movies_vietnam

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
2463,12085775,/m/02vpck6,Eastern Condors,1987.0,,100.0,"['Vietnamese Language', 'Standard Cantonese', ...",['Hong Kong'],"['World cinema', 'Action/Adventure', 'Martial ...",1980.0,Asia,"['Action/Adventure', 'Chinese Movies', 'War', ..."
5197,26953458,/m/0bs3f54,21 and a Wake-Up,,,,[],['United States of America'],"['Drama', 'War film']",,North America,"['Drama', 'War']"
5812,1898832,/m/064q5v,Why We Fight,2005.0,,98.0,['English Language'],"['United States of America', 'France', 'Canada...","['Culture & Society', 'Private military compan...",2000.0,North America,"['Private military company', 'Political', 'Cul..."
6308,1011468,/m/03z106,We Were Soldiers,2002.0,114660784.0,140.0,"['French Language', 'Vietnamese Language', 'En...","['United States of America', 'Germany']","['History', 'Action/Adventure', 'Drama', 'War ...",2000.0,North America,"['Action/Adventure', 'Combat Films', 'Drama', ..."
14990,9846378,/m/02ptqwf,The Rebel,2007.0,,103.0,"['French Language', 'Vietnamese Language']",['Vietnam'],"['Action/Adventure', 'Action', 'Martial Arts F...",2000.0,Asia,"['Action/Adventure', 'World', 'Martial Arts Fi..."
17497,31441972,/m/0gkz943,Dust of Life,2009.0,,90.0,['Vietnamese Language'],[],['Drama'],2000.0,Unknown,['Drama']
18312,22442724,/m/05zpkqr,The Visitors,1972.0,,88.0,['English Language'],['United States of America'],"['Thriller', 'Crime Fiction', 'Drama']",1970.0,North America,"['Drama', 'Thriller', 'Fiction']"
21429,103011,/m/0p_qr,Coming Home,1978.0,32653905.0,126.0,['English Language'],['United States of America'],"['Drama', 'War film', 'Romantic drama', 'Roman...",1970.0,North America,"['Political', 'Drama', 'New Hollywood', 'War',..."
21551,4620459,/m/0cc_y2,"The Boys in Company ""C""",1978.0,,126.0,['English Language'],"['United States of America', 'Hong Kong']","['Drama', 'Chinese Movies', 'War film']",1970.0,North America,"['Chinese Movies', 'Drama', 'War']"
21817,28344566,/m/0cn_39c,How Sleep the Brave,1984.0,,90.0,['English Language'],['United Kingdom'],"['Action', 'Drama', 'War film']",1980.0,Europe,"['Action/Adventure', 'Drama', 'War']"


In [17]:
desc_vietnam = [movie_descriptions[i] for i, sim in enumerate(similarities_vietnam) if sim > 0.44]
desc_vietnam

['{{Plot}} On 07 January 1972, the South Korean base in Nah-Trang, Vietnam, receives a radio transmission from a missing platoon presumed dead. The high-command assigns the veteran and decorated Lieutenant Choi Tae-in to lead a squad with eight other soldiers and rescue the missing soldiers from the R-Point. When they arrive in the location, they have a shooting and defeat a Vietnamese woman with a machine gun in a trench. Later, they find a tombstone telling that one hundred years ago, Chinese killed Vietnamese, dropped them in a lake and built a temple over the place, being a sacred location to the Vietnamese. While chasing the missing soldiers, weird things happen with the rescue team.',
 'The French war cameraman and First Indochina War veteran Schoendoerffer , already famous for his celebrated masterpiece The 317th Platoon, returns to Vietnam. On 1 August 1965, the U.S. 1st Air Cavalry Division is sent to South Vietnam. The following year in September, Schoendoerffer joins it and 

In [18]:
output_file = "df_movies/movies_vietnam.csv"
df_movies_vietnam.to_csv(output_file, index=False)


In [19]:
similarities_cold_war = cosine_similarity(query_cold_war, movie_embeddings).flatten()
ids_cold_war = [movie_ids[i] for i, sim in enumerate(similarities_cold_war) if sim > 0.43]
df_movies_cold_war = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_cold_war)]
df_movies_cold_war

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
2348,33449688,/m/0gkq_0_,Assassin,1969.0,,93.0,[],['South Korea'],"['Thriller', 'Drama']",1960.0,Asia,"['Drama', 'Thriller']"
2600,6446044,/m/0g5pv3,For Your Eyes Only,1981.0,195300000.0,127.0,"['Greek Language', 'Italian Language', 'Englis...",['United Kingdom'],"['Thriller', 'Action/Adventure', 'Glamorized S...",1980.0,Europe,"['Action/Adventure', 'Thriller', 'Spy']"
3241,2154180,/m/06qsdl,Twilight's Last Gleaming,1977.0,,146.0,['English Language'],"['United States of America', 'West Germany', '...","['Thriller', 'Political thriller']",1970.0,North America,['Thriller']
4436,2695729,/m/07yhfx,Police Story 4: First Strike,1996.0,,84.0,"['Russian Language', 'French Language', 'Ukrai...","['United States of America', 'Hong Kong']","['Adventure', 'World cinema', 'Action/Adventur...",1990.0,North America,"['Action/Adventure', 'Chinese Movies', 'Comedy..."
5474,31323864,/m/0gjbgb6,Im Sonderauftrag,,,78.0,['German Language'],['German Democratic Republic'],[],,Europe,[nan]
...,...,...,...,...,...,...,...,...,...,...,...,...
77433,26109551,/m/0b688s_,How I Ended This Summer,2010.0,,124.0,['Russian Language'],['Russia'],"['Thriller', 'Drama', 'World cinema']",2010.0,Europe,"['Drama', 'World', 'Thriller']"
78847,566713,/m/02qrv7,The Living Daylights,1987.0,191200000.0,130.0,"['French Language', 'English Language', 'Arabi...",['United Kingdom'],"['Thriller', 'Action Thrillers', 'Concert film...",1980.0,Europe,"['Thriller', 'Spy', 'Action/Adventure', 'Conce..."
79336,2028297,/m/06g7f3,The Beast of Yucca Flats,1961.0,,54.0,['English Language'],['United States of America'],"['Monster movie', 'Science Fiction', 'B-movie'...",1960.0,North America,"['Science Fiction', 'Monster', 'Black-and-whit..."
80517,12594210,/m/02wx2_x,The Assassination of Trotsky,1972.0,,106.0,"['English Language', 'Spanish Language']","['France', 'Italy', 'United Kingdom']","['Thriller', 'Historical fiction', 'Drama', 'B...",1970.0,Europe,"['Political', 'Thriller', 'Drama', 'Biography'..."


In [20]:
desc_cold_war = [movie_descriptions[i] for i, sim in enumerate(similarities_cold_war) if sim > 0.44]
desc_cold_war 

['A group of journalists are investigating a highly secret document when they uncover a sensational story: that even before the Second World War, in 1938, the first rocket was made in the USSR and Soviet scientists were planning to send an orbiter to the moon and back. The evidence is convincing; it is clear that in this case, Soviet cosmonauts were first. The movie follows the selection and training of a small group of cosmonauts. The one who shines above the others  is Captain Ivan Sergeyevich Kharlamov . He is helped into a space suit and loaded into the capsule, and the rocket lifts off for the Moon—but contact with it is soon lost. Most of the remainder of the film seems to follow the search for information about what happened next, as the 1930s space program appears to have dissolved immediately after, with no reason given . It is implied that Kharlamov returned to Earth, but with no fanfare and apparently no assistance from the space program. A number of men are shown as suspect

In [28]:
df_movies_cold_war.loc[:, 'plot_summary'] = df_movies_cold_war['Wikipedia_movie_ID'].map(movies_dict)
output_file = "movies_cold_war.csv"
df_movies_cold_war.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_cold_war.loc[:, 'plot_summary'] = df_movies_cold_war['Wikipedia_movie_ID'].map(movies_dict)


In [25]:
query_ww2 = model_miniLM.encode([
    "World War II, Second World War, Pacific War, Allied forces, Nazi Germany, Axis powers"
])
similarities_ww2 = cosine_similarity(query_ww2, movie_embeddings).flatten()
ids_ww2 = [movie_ids[i] for i, sim in enumerate(similarities_ww2) if sim > 0.37]
df_movies_ww2= df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_ww2)]
df_movies_ww2

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
2515,19419882,/m/04mxfd5,The Passage,1979.0,,99.0,['English Language'],['United Kingdom'],"['Action', 'Drama', 'War film']",1970.0,Europe,"['Action/Adventure', 'Drama', 'War']"
4503,32512267,/m/0h1bl5f,The Dismissal,1942.0,,,[],['Germany'],"['History', 'Drama']",1940.0,Europe,"['Drama', 'Historical']"
10589,31076473,/m/04j2zw5,So Ends Our Night,1941.0,,117.0,['English Language'],['United States of America'],"['Drama', 'War film']",1940.0,North America,"['Drama', 'War']"
11867,24051101,/m/07kd398,Nazty Nuisance,1943.0,,43.0,['English Language'],['United States of America'],"['Adventure', 'Comedy film', 'Short Film', 'Fa...",1940.0,North America,"['Fantasy', 'Short Film', 'Action/Adventure', ..."
17830,33525798,/m/0hgkx8k,KLK Calling PTZ - The Red Orchestra,1971.0,,101.0,['German Language'],['German Democratic Republic'],['Drama'],1970.0,Europe,['Drama']
18902,26736842,/m/0bmgw43,Identity Unknown,1945.0,,71.0,[],['United States of America'],"['Thriller', 'Drama', 'War film']",1940.0,North America,"['Drama', 'Thriller', 'War']"
19334,28785611,/m/0dd9fw5,Escape to Danger,1943.0,,84.0,['English Language'],['United Kingdom'],['Thriller'],1940.0,Europe,['Thriller']
24918,24486141,/m/080mczx,Eagles Over London,1969.0,,100.0,[],['Italy'],"['Action/Adventure', 'Action', 'World cinema',...",1960.0,Europe,"['Action/Adventure', 'World', 'War']"
27377,20517255,/m/05245vx,SS Girls,1977.0,,95.0,['Italian Language'],['Italy'],"['War film', 'Sexploitation']",1970.0,Europe,"['Sex', 'War']"
31680,26084240,/m/0b6nh1j,Brushfire,1962.0,,80.0,['English Language'],['United States of America'],"['Drama', 'Adventure']",1960.0,North America,"['Action/Adventure', 'Drama']"


In [26]:
df_movies_ww2.loc[:, 'plot_summary'] = df_movies_ww2['Wikipedia_movie_ID'].map(movies_dict)
desc_ww2 = [movie_descriptions[i] for i, sim in enumerate(similarities_ww2) if sim > 0.37]
desc_ww2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_ww2.loc[:, 'plot_summary'] = df_movies_ww2['Wikipedia_movie_ID'].map(movies_dict)


['A group of ski troopers behind German lines during World War II disrupts Nazi plans and blow up an important bridge.',
 'This documentary film is about the German society, Nazi Germany government and Holocaust during World War II.',
 'During World War II at the height of the Battle of Britain, British military officers are in pursuit of a merciless team of Nazi saboteurs. They searched though war-ravaged London but the Nazis eluded them. Finally, the British caught up with the Germans in a final battle at the RAF Control Centre.',
 "Near the end of World War II a German officer selects ten prostitutes to root out the traitors in Hitler's Third Reich. After many orgies and the execution of disloyal officers, the entire company kill themselves upon hearing of Hitler's death.",
 'The film portrays Roosevelt, Churchill, and Stalin as they maneuver their countries through several of the major events of World War II - such events include the Blitz, Operation Barbarossa, the bombing of Pear

In [27]:
df_movies_ww2.loc[:, 'plot_summary'] = df_movies_ww2['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_ww2.csv"
df_movies_ww2.to_csv(output_file, index=False)

In [30]:
query_tech2 = model_miniLM.encode(["Technology"])


similarities_tech = cosine_similarity(query_tech2, movie_embeddings).flatten()
ids_tech = [movie_ids[i] for i, sim in enumerate(similarities_tech) if sim > 0.3]
df_movies_tech = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_tech)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_tech

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
229,30405742,/m/0g5qgd2,Plug & Pray,2010.0,,91.0,"['Japanese Language', 'Italian Language', 'Ger...",['Germany'],"['Science Fiction', 'World cinema', 'Documenta...",2010.0,Europe,"['Science Fiction', 'World', 'Documentary']"
2364,165106,/m/015_1c,Desk Set,1957.0,1700000.0,103.0,['English Language'],['United States of America'],"['Romantic comedy', 'Romance Film', 'Libraries...",1950.0,North America,"['Libraries and librarians', 'Comedy', 'Romance']"
7194,34356357,/m/0h_bl8q,Bharathan Effect,2007.0,,,['Malayalam Language'],['India'],"['Science Fiction', 'Comedy film']",2000.0,Asia,"['Science Fiction', 'Comedy']"
9496,1446852,/m/0528fs,More,1998.0,,6.0,[],['United States of America'],"['Stop motion', 'Short Film', 'Science Fiction...",1990.0,North America,"['Science Fiction', 'Stop motion', 'Short Film..."
15159,15033858,/m/03h5crb,Sleep Dealer,2008.0,,90.0,"['English Language', 'Spanish Language']","['United States of America', 'Mexico']","['Science Fiction', 'Drama', 'Indie', 'World c...",2000.0,North America,"['Indie', 'Science Fiction', 'World', 'Drama']"
15925,26364940,/m/0bbx9xn,Trapped by Television,1936.0,,62.0,['English Language'],['United States of America'],"['Drama', 'Science Fiction', 'Screwball comedy...",1930.0,North America,"['Science Fiction', 'Comedy', 'Drama', 'Romance']"
20142,21689589,/m/05mzmkb,Us Now,2009.0,,59.0,['English Language'],['United Kingdom'],"['Drama', 'Documentary']",2000.0,Europe,"['Drama', 'Documentary']"
20584,35131540,/m/0j64rfm,Bioscope,2008.0,,94.0,['Malayalam Language'],['India'],['Drama'],2000.0,Asia,['Drama']
22370,31644556,/m/091m5df,Television Spy,1939.0,,58.0,['English Language'],['United States of America'],"['Action', 'Spy']",1930.0,North America,"['Action/Adventure', 'Spy']"
23691,7037167,/m/0h1fzy,Algol,1920.0,,99.0,['German Language'],['Germany'],"['Silent film', 'Fantasy', 'Science Fiction', ...",1920.0,Europe,"['Science Fiction', 'Black-and-white', 'Fantas..."


In [31]:
desc_tech = [movie_descriptions[i] for i, sim in enumerate(similarities_tech) if sim > 0.25]
desc_tech

["Computer experts around the world strive towards the development of intelligent robots. Pioneers like Raymond Kurzweil and Hiroshi Ishiguro dream of fashioning intelligent machines that will equal their human creators. In this potential reality, man and machine merge as a single unity. Rejecting evolution's biological shackles tantalisingly dangles the promise of eternal life for those bold enough to seize it. But others, like Joseph Weizenbaum, counter attack against society's limitless faith in the redemptive powers of technology. Eloquent and tactful, he questions the prevailing discourses on new technologies, and their ethical relationships to human life. The film delves into a world where computer technology, robotics, biology, neuroscience, and developmental psychology merge and features the world’s leading roboticists in their laboratories in Japan, the USA, Italy and Germany.",
 "Coinciding with the worst drought ever, much of the world's water is polluted. The evil Botijola 

In [32]:
df_movies_tech.loc[:, 'plot_summary'] = df_movies_tech['Wikipedia_movie_ID'].map(movies_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_tech.loc[:, 'plot_summary'] = df_movies_tech['Wikipedia_movie_ID'].map(movies_dict)


In [33]:
df_movies_tech['plot_summary']

229      Computer experts around the world strive towar...
2364     Desk Set takes place at the "Federal Broadcast...
7194     Bharathan is a post-graduate and unemployed. S...
9496     More tells the story of an inventor who lives ...
15159    'Sleep Dealer' is set in a future, militarized...
15925    An inventor is working on his latest creation,...
20142    The Us Now website describes the project as an...
20584    Bioscope, set in the early years of the twenti...
22370    A scientist invents a television called the Ic...
23691     The story follows the life of Robert Herne, w...
25433    The film takes place at an undetermined point ...
26480    A scientific genius has invented a machine cap...
26947    Venkat Ramakrishnan ([[Srikanth , Sevarkodi Se...
29461    The NSA-funded QT  Corporation has slated a pr...
30972    A highly advanced computer witnesses a murder ...
31404    With contributions from over 50 politicians, s...
32353    Teenager Karen Braden  is a troubled mental ho.

In [35]:
output_file = "df_movies/movies_tech.csv"
df_movies_tech.to_csv(output_file, index=False)