In [1]:
from nltk.corpus import stopwords
import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [3]:
df_movies = pd.read_csv('../../data/MovieSummaries/movies_metadata_cleaned.csv')

In [5]:
def load_plot_summaries(file_path):
    ids_test = []
    movies_dict = {}

    # Open and read the file
    with open(file_path, 'r', encoding='utf-8') as file:

        # Read each line
        for line in file:
            line = line.strip()
            if '\t' in line:
                movie_id, description = line.split('\t', 1)

                # Store each movie as a dictionary entry
                movies_dict[int(movie_id)] = description
    return movies_dict

In [8]:
ids = df_movies["Wikipedia_movie_ID"].to_list()
movies_dict = load_plot_summaries("../../data/MovieSummaries/plot_summaries.txt")
movies_dict = {k: movies_dict[k] for k in movies_dict if k in ids}
movie_descriptions = list(movies_dict.values())
movie_ids = list(movies_dict.keys())

In [13]:
# Load the model
model_miniLM = SentenceTransformer("all-MiniLM-L6-v2")

In [11]:
movie_embeddings = model_miniLM.encode(movie_descriptions, batch_size=64)

In [74]:
np.save("movie_embeddings.npy", movie_embeddings)

In [14]:
# Encode the query
query_vietnam = model_miniLM.encode(["Vietnam war"])
query_cold_war = model_miniLM.encode(["Cold war nuclear ussr spies soviet"])


In [16]:
similarities_vietnam = cosine_similarity(query_vietnam, movie_embeddings).flatten()
ids_vietnam = [movie_ids[i] for i, sim in enumerate(similarities_vietnam) if sim > 0.44]
df_movies_vietnam = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_vietnam)]
df_movies_vietnam

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
2463,12085775,/m/02vpck6,Eastern Condors,1987.0,,100.0,"['Vietnamese Language', 'Standard Cantonese', ...",['Hong Kong'],"['World cinema', 'Action/Adventure', 'Martial ...",1980.0,Asia,"['Action/Adventure', 'Chinese Movies', 'War', ..."
5197,26953458,/m/0bs3f54,21 and a Wake-Up,,,,[],['United States of America'],"['Drama', 'War film']",,North America,"['Drama', 'War']"
5812,1898832,/m/064q5v,Why We Fight,2005.0,,98.0,['English Language'],"['United States of America', 'France', 'Canada...","['Culture & Society', 'Private military compan...",2000.0,North America,"['Private military company', 'Political', 'Cul..."
6308,1011468,/m/03z106,We Were Soldiers,2002.0,114660784.0,140.0,"['French Language', 'Vietnamese Language', 'En...","['United States of America', 'Germany']","['History', 'Action/Adventure', 'Drama', 'War ...",2000.0,North America,"['Action/Adventure', 'Combat Films', 'Drama', ..."
14990,9846378,/m/02ptqwf,The Rebel,2007.0,,103.0,"['French Language', 'Vietnamese Language']",['Vietnam'],"['Action/Adventure', 'Action', 'Martial Arts F...",2000.0,Asia,"['Action/Adventure', 'World', 'Martial Arts Fi..."
17497,31441972,/m/0gkz943,Dust of Life,2009.0,,90.0,['Vietnamese Language'],[],['Drama'],2000.0,Unknown,['Drama']
18312,22442724,/m/05zpkqr,The Visitors,1972.0,,88.0,['English Language'],['United States of America'],"['Thriller', 'Crime Fiction', 'Drama']",1970.0,North America,"['Drama', 'Thriller', 'Fiction']"
21429,103011,/m/0p_qr,Coming Home,1978.0,32653905.0,126.0,['English Language'],['United States of America'],"['Drama', 'War film', 'Romantic drama', 'Roman...",1970.0,North America,"['Political', 'Drama', 'New Hollywood', 'War',..."
21551,4620459,/m/0cc_y2,"The Boys in Company ""C""",1978.0,,126.0,['English Language'],"['United States of America', 'Hong Kong']","['Drama', 'Chinese Movies', 'War film']",1970.0,North America,"['Chinese Movies', 'Drama', 'War']"
21817,28344566,/m/0cn_39c,How Sleep the Brave,1984.0,,90.0,['English Language'],['United Kingdom'],"['Action', 'Drama', 'War film']",1980.0,Europe,"['Action/Adventure', 'Drama', 'War']"


In [17]:
desc_vietnam = [movie_descriptions[i] for i, sim in enumerate(similarities_vietnam) if sim > 0.44]
desc_vietnam

['{{Plot}} On 07 January 1972, the South Korean base in Nah-Trang, Vietnam, receives a radio transmission from a missing platoon presumed dead. The high-command assigns the veteran and decorated Lieutenant Choi Tae-in to lead a squad with eight other soldiers and rescue the missing soldiers from the R-Point. When they arrive in the location, they have a shooting and defeat a Vietnamese woman with a machine gun in a trench. Later, they find a tombstone telling that one hundred years ago, Chinese killed Vietnamese, dropped them in a lake and built a temple over the place, being a sacred location to the Vietnamese. While chasing the missing soldiers, weird things happen with the rescue team.',
 'The French war cameraman and First Indochina War veteran Schoendoerffer , already famous for his celebrated masterpiece The 317th Platoon, returns to Vietnam. On 1 August 1965, the U.S. 1st Air Cavalry Division is sent to South Vietnam. The following year in September, Schoendoerffer joins it and 

In [18]:
output_file = "df_movies/movies_vietnam.csv"
df_movies_vietnam.to_csv(output_file, index=False)


In [19]:
similarities_cold_war = cosine_similarity(query_cold_war, movie_embeddings).flatten()
ids_cold_war = [movie_ids[i] for i, sim in enumerate(similarities_cold_war) if sim > 0.43]
df_movies_cold_war = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_cold_war)]
df_movies_cold_war

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
2348,33449688,/m/0gkq_0_,Assassin,1969.0,,93.0,[],['South Korea'],"['Thriller', 'Drama']",1960.0,Asia,"['Drama', 'Thriller']"
2600,6446044,/m/0g5pv3,For Your Eyes Only,1981.0,195300000.0,127.0,"['Greek Language', 'Italian Language', 'Englis...",['United Kingdom'],"['Thriller', 'Action/Adventure', 'Glamorized S...",1980.0,Europe,"['Action/Adventure', 'Thriller', 'Spy']"
3241,2154180,/m/06qsdl,Twilight's Last Gleaming,1977.0,,146.0,['English Language'],"['United States of America', 'West Germany', '...","['Thriller', 'Political thriller']",1970.0,North America,['Thriller']
4436,2695729,/m/07yhfx,Police Story 4: First Strike,1996.0,,84.0,"['Russian Language', 'French Language', 'Ukrai...","['United States of America', 'Hong Kong']","['Adventure', 'World cinema', 'Action/Adventur...",1990.0,North America,"['Action/Adventure', 'Chinese Movies', 'Comedy..."
5474,31323864,/m/0gjbgb6,Im Sonderauftrag,,,78.0,['German Language'],['German Democratic Republic'],[],,Europe,[nan]
...,...,...,...,...,...,...,...,...,...,...,...,...
77433,26109551,/m/0b688s_,How I Ended This Summer,2010.0,,124.0,['Russian Language'],['Russia'],"['Thriller', 'Drama', 'World cinema']",2010.0,Europe,"['Drama', 'World', 'Thriller']"
78847,566713,/m/02qrv7,The Living Daylights,1987.0,191200000.0,130.0,"['French Language', 'English Language', 'Arabi...",['United Kingdom'],"['Thriller', 'Action Thrillers', 'Concert film...",1980.0,Europe,"['Thriller', 'Spy', 'Action/Adventure', 'Conce..."
79336,2028297,/m/06g7f3,The Beast of Yucca Flats,1961.0,,54.0,['English Language'],['United States of America'],"['Monster movie', 'Science Fiction', 'B-movie'...",1960.0,North America,"['Science Fiction', 'Monster', 'Black-and-whit..."
80517,12594210,/m/02wx2_x,The Assassination of Trotsky,1972.0,,106.0,"['English Language', 'Spanish Language']","['France', 'Italy', 'United Kingdom']","['Thriller', 'Historical fiction', 'Drama', 'B...",1970.0,Europe,"['Political', 'Thriller', 'Drama', 'Biography'..."


In [20]:
desc_cold_war = [movie_descriptions[i] for i, sim in enumerate(similarities_cold_war) if sim > 0.44]
desc_cold_war 

['A group of journalists are investigating a highly secret document when they uncover a sensational story: that even before the Second World War, in 1938, the first rocket was made in the USSR and Soviet scientists were planning to send an orbiter to the moon and back. The evidence is convincing; it is clear that in this case, Soviet cosmonauts were first. The movie follows the selection and training of a small group of cosmonauts. The one who shines above the others  is Captain Ivan Sergeyevich Kharlamov . He is helped into a space suit and loaded into the capsule, and the rocket lifts off for the Moon—but contact with it is soon lost. Most of the remainder of the film seems to follow the search for information about what happened next, as the 1930s space program appears to have dissolved immediately after, with no reason given . It is implied that Kharlamov returned to Earth, but with no fanfare and apparently no assistance from the space program. A number of men are shown as suspect

In [28]:
df_movies_cold_war.loc[:, 'plot_summary'] = df_movies_cold_war['Wikipedia_movie_ID'].map(movies_dict)
output_file = "movies_cold_war.csv"
df_movies_cold_war.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_cold_war.loc[:, 'plot_summary'] = df_movies_cold_war['Wikipedia_movie_ID'].map(movies_dict)


In [25]:
query_ww2 = model_miniLM.encode([
    "World War II, Second World War, Pacific War, Allied forces, Nazi Germany, Axis powers"
])
similarities_ww2 = cosine_similarity(query_ww2, movie_embeddings).flatten()
ids_ww2 = [movie_ids[i] for i, sim in enumerate(similarities_ww2) if sim > 0.37]
df_movies_ww2= df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_ww2)]
df_movies_ww2

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
2515,19419882,/m/04mxfd5,The Passage,1979.0,,99.0,['English Language'],['United Kingdom'],"['Action', 'Drama', 'War film']",1970.0,Europe,"['Action/Adventure', 'Drama', 'War']"
4503,32512267,/m/0h1bl5f,The Dismissal,1942.0,,,[],['Germany'],"['History', 'Drama']",1940.0,Europe,"['Drama', 'Historical']"
10589,31076473,/m/04j2zw5,So Ends Our Night,1941.0,,117.0,['English Language'],['United States of America'],"['Drama', 'War film']",1940.0,North America,"['Drama', 'War']"
11867,24051101,/m/07kd398,Nazty Nuisance,1943.0,,43.0,['English Language'],['United States of America'],"['Adventure', 'Comedy film', 'Short Film', 'Fa...",1940.0,North America,"['Fantasy', 'Short Film', 'Action/Adventure', ..."
17830,33525798,/m/0hgkx8k,KLK Calling PTZ - The Red Orchestra,1971.0,,101.0,['German Language'],['German Democratic Republic'],['Drama'],1970.0,Europe,['Drama']
18902,26736842,/m/0bmgw43,Identity Unknown,1945.0,,71.0,[],['United States of America'],"['Thriller', 'Drama', 'War film']",1940.0,North America,"['Drama', 'Thriller', 'War']"
19334,28785611,/m/0dd9fw5,Escape to Danger,1943.0,,84.0,['English Language'],['United Kingdom'],['Thriller'],1940.0,Europe,['Thriller']
24918,24486141,/m/080mczx,Eagles Over London,1969.0,,100.0,[],['Italy'],"['Action/Adventure', 'Action', 'World cinema',...",1960.0,Europe,"['Action/Adventure', 'World', 'War']"
27377,20517255,/m/05245vx,SS Girls,1977.0,,95.0,['Italian Language'],['Italy'],"['War film', 'Sexploitation']",1970.0,Europe,"['Sex', 'War']"
31680,26084240,/m/0b6nh1j,Brushfire,1962.0,,80.0,['English Language'],['United States of America'],"['Drama', 'Adventure']",1960.0,North America,"['Action/Adventure', 'Drama']"


In [26]:
df_movies_ww2.loc[:, 'plot_summary'] = df_movies_ww2['Wikipedia_movie_ID'].map(movies_dict)
desc_ww2 = [movie_descriptions[i] for i, sim in enumerate(similarities_ww2) if sim > 0.37]
desc_ww2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_ww2.loc[:, 'plot_summary'] = df_movies_ww2['Wikipedia_movie_ID'].map(movies_dict)


['A group of ski troopers behind German lines during World War II disrupts Nazi plans and blow up an important bridge.',
 'This documentary film is about the German society, Nazi Germany government and Holocaust during World War II.',
 'During World War II at the height of the Battle of Britain, British military officers are in pursuit of a merciless team of Nazi saboteurs. They searched though war-ravaged London but the Nazis eluded them. Finally, the British caught up with the Germans in a final battle at the RAF Control Centre.',
 "Near the end of World War II a German officer selects ten prostitutes to root out the traitors in Hitler's Third Reich. After many orgies and the execution of disloyal officers, the entire company kill themselves upon hearing of Hitler's death.",
 'The film portrays Roosevelt, Churchill, and Stalin as they maneuver their countries through several of the major events of World War II - such events include the Blitz, Operation Barbarossa, the bombing of Pear

In [27]:
df_movies_ww2.loc[:, 'plot_summary'] = df_movies_ww2['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_ww2.csv"
df_movies_ww2.to_csv(output_file, index=False)

In [30]:
query_tech2 = model_miniLM.encode(["Technology"])

similarities_tech = cosine_similarity(query_tech2, movie_embeddings).flatten()
ids_tech = [movie_ids[i] for i, sim in enumerate(similarities_tech) if sim > 0.3]
df_movies_tech = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_tech)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_tech

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
229,30405742,/m/0g5qgd2,Plug & Pray,2010.0,,91.0,"['Japanese Language', 'Italian Language', 'Ger...",['Germany'],"['Science Fiction', 'World cinema', 'Documenta...",2010.0,Europe,"['Science Fiction', 'World', 'Documentary']"
2364,165106,/m/015_1c,Desk Set,1957.0,1700000.0,103.0,['English Language'],['United States of America'],"['Romantic comedy', 'Romance Film', 'Libraries...",1950.0,North America,"['Libraries and librarians', 'Comedy', 'Romance']"
7194,34356357,/m/0h_bl8q,Bharathan Effect,2007.0,,,['Malayalam Language'],['India'],"['Science Fiction', 'Comedy film']",2000.0,Asia,"['Science Fiction', 'Comedy']"
9496,1446852,/m/0528fs,More,1998.0,,6.0,[],['United States of America'],"['Stop motion', 'Short Film', 'Science Fiction...",1990.0,North America,"['Science Fiction', 'Stop motion', 'Short Film..."
15159,15033858,/m/03h5crb,Sleep Dealer,2008.0,,90.0,"['English Language', 'Spanish Language']","['United States of America', 'Mexico']","['Science Fiction', 'Drama', 'Indie', 'World c...",2000.0,North America,"['Indie', 'Science Fiction', 'World', 'Drama']"
15925,26364940,/m/0bbx9xn,Trapped by Television,1936.0,,62.0,['English Language'],['United States of America'],"['Drama', 'Science Fiction', 'Screwball comedy...",1930.0,North America,"['Science Fiction', 'Comedy', 'Drama', 'Romance']"
20142,21689589,/m/05mzmkb,Us Now,2009.0,,59.0,['English Language'],['United Kingdom'],"['Drama', 'Documentary']",2000.0,Europe,"['Drama', 'Documentary']"
20584,35131540,/m/0j64rfm,Bioscope,2008.0,,94.0,['Malayalam Language'],['India'],['Drama'],2000.0,Asia,['Drama']
22370,31644556,/m/091m5df,Television Spy,1939.0,,58.0,['English Language'],['United States of America'],"['Action', 'Spy']",1930.0,North America,"['Action/Adventure', 'Spy']"
23691,7037167,/m/0h1fzy,Algol,1920.0,,99.0,['German Language'],['Germany'],"['Silent film', 'Fantasy', 'Science Fiction', ...",1920.0,Europe,"['Science Fiction', 'Black-and-white', 'Fantas..."


In [31]:
desc_tech = [movie_descriptions[i] for i, sim in enumerate(similarities_tech) if sim > 0.25]
desc_tech

["Computer experts around the world strive towards the development of intelligent robots. Pioneers like Raymond Kurzweil and Hiroshi Ishiguro dream of fashioning intelligent machines that will equal their human creators. In this potential reality, man and machine merge as a single unity. Rejecting evolution's biological shackles tantalisingly dangles the promise of eternal life for those bold enough to seize it. But others, like Joseph Weizenbaum, counter attack against society's limitless faith in the redemptive powers of technology. Eloquent and tactful, he questions the prevailing discourses on new technologies, and their ethical relationships to human life. The film delves into a world where computer technology, robotics, biology, neuroscience, and developmental psychology merge and features the world’s leading roboticists in their laboratories in Japan, the USA, Italy and Germany.",
 "Coinciding with the worst drought ever, much of the world's water is polluted. The evil Botijola 

In [32]:
df_movies_tech.loc[:, 'plot_summary'] = df_movies_tech['Wikipedia_movie_ID'].map(movies_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_tech.loc[:, 'plot_summary'] = df_movies_tech['Wikipedia_movie_ID'].map(movies_dict)


In [33]:
df_movies_tech['plot_summary']

229      Computer experts around the world strive towar...
2364     Desk Set takes place at the "Federal Broadcast...
7194     Bharathan is a post-graduate and unemployed. S...
9496     More tells the story of an inventor who lives ...
15159    'Sleep Dealer' is set in a future, militarized...
15925    An inventor is working on his latest creation,...
20142    The Us Now website describes the project as an...
20584    Bioscope, set in the early years of the twenti...
22370    A scientist invents a television called the Ic...
23691     The story follows the life of Robert Herne, w...
25433    The film takes place at an undetermined point ...
26480    A scientific genius has invented a machine cap...
26947    Venkat Ramakrishnan ([[Srikanth , Sevarkodi Se...
29461    The NSA-funded QT  Corporation has slated a pr...
30972    A highly advanced computer witnesses a murder ...
31404    With contributions from over 50 politicians, s...
32353    Teenager Karen Braden  is a troubled mental ho.

In [58]:
output_file = "df_movies/movies_tech.csv"
df_movies_tech.to_csv(output_file, index=False)

In [52]:
query_health = model_miniLM.encode(["Health"])

similarities_health = cosine_similarity(query_health, movie_embeddings).flatten()
ids_health = [movie_ids[i] for i, sim in enumerate(similarities_health) if sim > 0.28]
df_movies_health = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_health)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_health

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
257,19389755,/m/04mzmq8,Asylum Seekers,2009.0,,90.0,['English Language'],"['United States of America', 'United Kingdom']","['Parody', 'Thriller', 'Horror', 'Indie', 'Mys...",2000.0,North America,"['Thriller', 'Mystery', 'Indie', 'Horror', 'Co..."
1155,35790655,/m/0jt1c7n,Putham Pudhu Payanam,1991.0,,120.0,['Tamil Language'],['India'],['Drama'],1990.0,Asia,['Drama']
3724,4679679,/m/0cgz8c,Una Breve Vacanza,1973.0,,112.0,['Italian Language'],['Italy'],"['Romance Film', 'Drama', 'World cinema']",1970.0,Europe,"['World', 'Drama', 'Romance']"
4567,5472438,/m/0dndpx,The Ballad of Narayama,1983.0,,130.0,['Japanese Language'],['Japan'],"['Japanese Movies', 'Drama', 'Comedy', 'World ...",1980.0,Asia,"['Japanese Movies', 'Drama', 'Art film', 'Come..."
4772,35004748,/m/0j63xrh,Desarrollo humano,2007.0,,29.0,"['French Language', 'English Language']",['Spain'],"['Short Film', 'Documentary']",2000.0,Europe,"['Documentary', 'Short Film']"
...,...,...,...,...,...,...,...,...,...,...,...,...
77475,10962872,/m/02qwfjk,Hattrick,2007.0,,125.0,"['Hindi Language', 'English Language']",['India'],"['World cinema', 'Sports', 'Musical', 'Drama',...",2000.0,Asia,"['Sports', 'Drama', 'Bollywood', 'Comedy', 'Mu..."
78218,2021218,/m/06fph5,Safe,1995.0,512245.0,119.0,['English Language'],"['United States of America', 'United Kingdom']","['Thriller', 'Drama', 'Psychological thriller'...",1990.0,North America,"['Indie', 'Drama', 'Thriller']"
79800,24423022,/m/07scndl,The Crazy World of Julius Vrooder,1974.0,,98.0,['English Language'],['United States of America'],"['Drama', 'Comedy-drama', 'Comedy']",1970.0,North America,"['Comedy', 'Drama']"
79908,32865322,/m/0cft_8t,Phase One,2010.0,,,['English Language'],"['Canada', 'United Kingdom']","['Thriller', 'Horror']",2010.0,North America,"['Horror', 'Thriller']"


In [53]:
desc_health = [movie_descriptions[i] for i, sim in enumerate(similarities_health) if sim > 0.28]
desc_health

['The film concerns Dr. Akagi, a doctor on an island in the Seto Inland Sea area during World War II. He runs into conflict with the military while trying to combat a hepatitis epidemic. Akagi earns the nickname "Dr. Liver"  because of his work.',
 'The wives of several top doctors feel neglected by their husbands, so they turn to drink, drugs and sex for solace.',
 'After a brief history of the pollution imperative from before the Industrial Revolution to the present, the film follows a nuclear family polluting its way through an average day. The narrator explains that pollution is a critical part of our culture and keeps our economy strong. The film connects our wasteful, selfish habits with consumerism and its ally, big business. The resulting out-of-control pollution of the air, water and land is displayed in scenes of dismal destruction overlayed with happy music and cheerful calls by the narrator to pollute more for a better tomorrow.',
 'Set in war-torn Congo and post-conflict L

In [57]:
df_movies_health['plot_summary'] = df_movies_health['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_health.csv"
df_movies_health.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_health['plot_summary'] = df_movies_health['Wikipedia_movie_ID'].map(movies_dict)


In [55]:
query_econ = model_miniLM.encode(["Economics"])

similarities_econ = cosine_similarity(query_econ, movie_embeddings).flatten()
ids_econ = [movie_ids[i] for i, sim in enumerate(similarities_econ) if sim > 0.27]
df_movies_econ = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_econ)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_econ

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
376,19168185,/m/04ldc1b,9 Dead Gay Guys,2003.0,26377.0,83.0,['English Language'],['United Kingdom'],"['LGBT', 'Gay', 'Gay Interest', 'Comedy', 'Gay...",2000.0,Europe,"['Gay', 'Comedy', 'LGBT']"
517,7014201,/m/0h09nh,Ernest Goes to School,1994.0,,89.0,['English Language'],['United States of America'],"['Family Film', 'Comedy', 'Slapstick']",1990.0,North America,"['Family', 'Comedy', 'Slapstick']"
1139,30710263,/m/0g9_hjj,Fighter,2011.0,,160.0,[],['India'],['Action'],2010.0,Asia,['Action/Adventure']
1699,15234387,/m/03hmvds,A Flower in Hell,1958.0,,86.0,['Korean Language'],['South Korea'],"['Crime Fiction', 'Drama']",1950.0,Asia,"['Drama', 'Fiction']"
2054,15386228,/m/03m6sts,WΔZ,2008.0,,104.0,['English Language'],"['United States of America', 'United Kingdom']","['Thriller', 'Crime Thriller', 'Psychological ...",2000.0,North America,['Thriller']
...,...,...,...,...,...,...,...,...,...,...,...,...
79948,7452952,/m/04j0pd7,Do Ankhen Barah Haath,1958.0,,124.0,['Hindi Language'],['India'],"['Musical', 'Drama', 'Bollywood', 'World cinema']",1950.0,Asia,"['World', 'Drama', 'Musical', 'Bollywood']"
80193,11188284,/m/02r30td,Pardesi Babu,1998.0,,,['Hindi Language'],['India'],"['Romantic comedy', 'Buddy film', 'Comedy']",1990.0,Asia,"['Buddy', 'Comedy', 'Romance']"
80671,24485169,/m/0808xmp,The President's Mystery,1936.0,,81.0,[],['United States of America'],"['Thriller', 'Mystery', 'Romance Film', 'Drama']",1930.0,North America,"['Drama', 'Thriller', 'Romance', 'Mystery']"
81072,10615587,/m/02qk9k5,Buddha Mil Gaya,1971.0,,138.0,['Hindi Language'],['India'],"['Mystery', 'Romance Film', 'Drama', 'Musical']",1970.0,Asia,"['Musical', 'Drama', 'Romance', 'Mystery']"


In [56]:
desc_econ = [movie_descriptions[i] for i, sim in enumerate(similarities_econ) if sim > 0.28]
desc_econ

["A farmer is busy with hoeing while his son Porky is ploughing the fields with his horse Dobbin. Hank Horsefly speeds up the process. The farmer and Porky are about to take a turn for the worst as Mr. Viper The Snake comes with a Mortgage form ready to evict them unless a sum of rent money is paid. Porky applies for a job as horse driving milkman with a strict condition not to break a single bottle. Porky is doing well until Hank having followed their trail, sends Dobbin going at full speed and crashing, causing all the milk bottles to break. As Porky despairs, Dobbin accidentally enters a horse race. When the race starts, Dobbin isn't getting far, until Hank kick starts Dobbin to overtake every racer and wins a $40,000 prize. Porky makes it to the farm in the nick of time, riding in a roofless limo. Porky pays the owed money to Mr. Viper and Hank gives him a kick.",
 "The story begins with the four losers, Roy, Adi, Boman and Manav who are good-for-nothing. Their tale begins when the

In [59]:
df_movies_econ['plot_summary'] = df_movies_econ['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_econ.csv"
df_movies_econ.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_econ['plot_summary'] = df_movies_econ['Wikipedia_movie_ID'].map(movies_dict)


In [66]:
query_econ_crisis = model_miniLM.encode(["Economic crisis"])

similarities_econ_crisis = cosine_similarity(query_econ_crisis, movie_embeddings).flatten()
ids_econ_crisis = [movie_ids[i] for i, sim in enumerate(similarities_econ_crisis) if sim > 0.3]
df_movies_econ_crisis = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_econ_crisis)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_econ_crisis

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
199,33956317,/m/0hndjzb,The Catastrophe,,,15.0,[],[],"['Short Film', 'Drama']",,Unknown,"['Drama', 'Short Film']"
470,20713032,/m/052172r,Bhopal: Prayer for Rain,2012.0,,,['English Language'],"['United States of America', 'India', 'United ...","['Disaster', 'Drama']",2010.0,North America,"['Disaster', 'Drama']"
1139,30710263,/m/0g9_hjj,Fighter,2011.0,,160.0,[],['India'],['Action'],2010.0,Asia,['Action/Adventure']
1329,24739648,/m/0806h15,Collapse,2009.0,,82.0,['English Language'],['United States of America'],"['Political cinema', 'Indie', 'Documentary']",2000.0,North America,"['Indie', 'Political', 'Documentary']"
1380,21604407,/m/05msfyg,Aduri,2008.0,,101.0,['English Language'],['United States of America'],"['Thriller', 'Action', 'Drama']",2000.0,North America,"['Action/Adventure', 'Drama', 'Thriller']"
...,...,...,...,...,...,...,...,...,...,...,...,...
80414,2887831,/m/08992s,The Day of the Roses,2001.0,,220.0,['English Language'],['Australia'],"['Television movie', 'Drama', 'Documentary']",2000.0,Oceania,"['Drama', 'Documentary', 'Television']"
80447,5371392,/m/0dhwrq,Varavelpu,1989.0,,145.0,['Malayalam Language'],['India'],"['Black comedy', 'Comedy film', 'Drama']",1980.0,Asia,"['Comedy', 'Drama']"
80599,5082629,/m/0d1xwr,The Great Riviera Bank Robbery,1979.0,,102.0,['English Language'],['United Kingdom'],"['Crime Fiction', 'Crime Thriller', 'Docudrama']",1970.0,Europe,"['Drama', 'Thriller', 'Fiction']"
80671,24485169,/m/0808xmp,The President's Mystery,1936.0,,81.0,[],['United States of America'],"['Thriller', 'Mystery', 'Romance Film', 'Drama']",1930.0,North America,"['Drama', 'Thriller', 'Romance', 'Mystery']"


In [65]:
desc_econ_crisis = [movie_descriptions[i] for i, sim in enumerate(similarities_econ_crisis) if sim > 0.3]
desc_econ_crisis

['The film begins with construction completion of a water canal to the village, set in the present. Radha , as the \'mother\' of the village, is asked to open the canal and remembers back to her past when she was newly married. The wedding between Radha and Shamu  was paid for by Radha\'s mother-in-law who raised a loan from the moneylender, Sukhilala. This event starts the spiral of poverty and hardship which Radha endures. The conditions of the loan are disputed, but the village elders decide in favour of the moneylender, after which Shamu and Radha are forced to pay three quarters of their crop as interest on the loan of 500 rupees. Whilst trying to bring more of their land into use to alleviate their poverty, Shamu\'s arms are crushed by a boulder. He is ashamed of his helplessness and is humiliated by others in the village; deciding that he is no use to his family, he leaves and does not return. Soon after, Radha\'s mother-in-law dies. Radha continues to work in the fields with he

In [67]:
df_movies_econ_crisis['plot_summary'] = df_movies_econ_crisis['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_econ_crisis.csv"
df_movies_econ_crisis.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_econ_crisis['plot_summary'] = df_movies_econ_crisis['Wikipedia_movie_ID'].map(movies_dict)


In [68]:
query_gender_eq = model_miniLM.encode(["Gender equality"])

similarities_gender_eq = cosine_similarity(query_gender_eq, movie_embeddings).flatten()
ids_gender_eq = [movie_ids[i] for i, sim in enumerate(similarities_gender_eq) if sim > 0.3]
df_movies_gender_eq = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_gender_eq)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_gender_eq

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
4,261236,/m/01mrr1,A Woman in Flames,1983.0,,106.0,['German Language'],['Germany'],['Drama'],1980.0,Europe,['Drama']
2582,29381935,/m/0fpjwb1,Agniputra,2000.0,,125.0,['Hindi Language'],['India'],['Action'],2000.0,Asia,['Action/Adventure']
3692,17610897,/m/04669t1,The Red Lantern,1919.0,,70.0,['English Language'],['United States of America'],"['Silent film', 'Black-and-white']",1910.0,North America,"['Black-and-white', 'Silent film']"
4206,21020059,/m/05b0cnv,Saknoiya,1959.0,,,['Assamese Language'],['India'],['Drama'],1950.0,Asia,['Drama']
4317,19609512,/m/04m_skx,Aanchal,1980.0,,139.0,['Hindi Language'],['India'],"['Romantic drama', 'Romance Film', 'Drama', 'W...",1980.0,Asia,"['World', 'Drama', 'Romance']"
...,...,...,...,...,...,...,...,...,...,...,...,...
78437,20014545,/m/04ydfx8,Sisters in Law,2005.0,,104.0,[],"['United Kingdom', 'Cameroon']",['Documentary'],2000.0,Europe,['Documentary']
80051,2682566,/m/07xp7z,The Heart of Men,2003.0,,100.0,['French Language'],['France'],"['Romance Film', 'Comedy']",2000.0,Europe,"['Comedy', 'Romance']"
80658,22963595,/m/063ych2,Chaitra,,,20.0,"['Marathi Language', 'English Language']",['India'],['Short Film'],,Asia,['Short Film']
81118,18118679,/m/04y663q,Torch Song Trilogy,1988.0,4865997.0,120.0,['English Language'],['United States of America'],"['Romantic comedy', 'LGBT', 'Comedy-drama', 'D...",1980.0,North America,"['Drama', 'Gay', 'Comedy', 'Romance', 'LGBT']"


In [69]:
desc_gender_eq = [movie_descriptions[i] for i, sim in enumerate(similarities_gender_eq) if sim > 0.3]
desc_gender_eq

['The story follows the main characters Rose a comfortably out woman who identifies as lesbian and Anthony a decent progressive straight man who serendipitously meet and then unexpectedly find themselves falling for each other. Rose has to navigate the reaction of her friends  and her family  while Anthony too has to deal with his friends who are equally nonplussed.',
 "Take Hayden is a hard worker who hates office politics. Hayden's career has been defined by one small but lasting workplace fiasco that occurred years ago. Trying to get back into the company's good graces, Hayden must compete with her best friend Claire for a promotion. Claire is the ultimate competitor, and is determined to make VP by the time she's 30 – but when she unexpectedly finds out she's pregnant, Claire begins to wonder if a woman has to make a choice between career and motherhood. Both women are under the watchful eye of Eve, the ultimate political machine who isn't afraid to ruffle feathers  to establish he

In [70]:
df_movies_gender_eq['plot_summary'] = df_movies_gender_eq['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_gender_eq.csv"
df_movies_gender_eq.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_gender_eq['plot_summary'] = df_movies_gender_eq['Wikipedia_movie_ID'].map(movies_dict)


In [76]:
query_migr= model_miniLM.encode(["Migration"])

similarities_migr = cosine_similarity(query_migr, movie_embeddings).flatten()
ids_migr = [movie_ids[i] for i, sim in enumerate(similarities_migr) if sim > 0.2]
df_movies_migr = df_movies[df_movies['Wikipedia_movie_ID'].isin(ids_migr)] #| df_movies['Movie_genres'].str.contains('Science Fiction', na=False)]
df_movies_migr

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Decade,Movie_continent,Grouped_genres
522,24732078,/m/0807778,Journey,1972.0,,,['English Language'],['Canada'],['Drama'],1970.0,North America,['Drama']
756,26900805,/m/051_4zf,The Chiltern Hundreds,,,,[],['United Kingdom'],"['Political satire', 'Comedy', 'Domestic Comedy']",,Europe,"['Political', 'Comedy']"
872,34992168,/m/0j65872,Trip,2008.0,,3.0,[],['Germany'],"['Short Film', 'Animation']",2000.0,Europe,"['Short Film', 'Animation']"
998,10604572,/m/02qj_5b,Rebel Intruders,1980.0,,99.0,['Mandarin Chinese'],['Hong Kong'],"['Action', 'Martial Arts Film']",1980.0,Asia,"['Action/Adventure', 'Martial Arts Film']"
1710,16731651,/m/0404gn8,Promise at Dawn,1970.0,,102.0,['English Language'],"['United States of America', 'France']",['Drama'],1970.0,North America,['Drama']
...,...,...,...,...,...,...,...,...,...,...,...,...
80579,22576411,/m/05zy42t,Symptoms,1974.0,,91.0,['English Language'],"['Belgium', 'United Kingdom']",['Horror'],1970.0,Europe,['Horror']
81145,6744353,/m/0glrz1,Gumnaam,1965.0,,143.0,"['Hindi Language', 'Urdu Language']",['India'],"['Thriller', 'Drama', 'World cinema', 'Bollywo...",1960.0,Asia,"['Drama', 'World', 'Thriller', 'Bollywood']"
81357,10914021,/m/02qtwbj,Golden Door,2006.0,,118.0,"['Italian Language', 'English Language']","['France', 'Italy']","['Adventure', 'World cinema', 'Fantasy', 'Dram...",2000.0,Europe,"['Fantasy', 'Action/Adventure', 'Drama', 'Worl..."
81497,385299,/m/04czc1w,Aama,1964.0,,,['Nepali Language'],"['United States of America', 'Nepal']",[],1960.0,North America,[nan]


In [81]:
desc_migr = [movie_descriptions[i] for i, sim in enumerate(similarities_migr) if sim > 0.25]
desc_migr

['The documentary tells the stories of four Mennonites  living in two different communities. The colonies of El Savinal  and El Capulin are settled in the Mexican state of Chihuahua and look like typical German farming communities of the 19th Century. They were founded by Russian Mennonite Europeans, whose long history of migration took them from Russia to Canada and from there to Mexico—in search of a place where they can freely practice their religion and speak their ancient Plautdietsch language within their large families. The four protagonists are longing for their perfect world in balance between tradition and modernity: The colony of El Savinal lives in isolation and rejects any modern technology, whereas the colony of El Capulin begins to accept innovations such as electricity and cars. The more orthodox members of the community will migrate to the Bolivian rainforest.',
 'A Czech-born woman arrives on a Greek island having fled Australia to sort out her problems. She becomes f

In [73]:
df_movies_migr['plot_summary'] = df_movies_migr['Wikipedia_movie_ID'].map(movies_dict)
output_file = "df_movies/movies_migr.csv"
df_movies_migr.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_migr['plot_summary'] = df_movies_migr['Wikipedia_movie_ID'].map(movies_dict)
