## NLP exploration on plot summaries

In [17]:
# Import the needed libarairies
import warnings # to ignore pandas version warning
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.ticker import MaxNLocator
from scipy.stats import ttest_ind, spearmanr # to implement statistical tests
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans # for actors analysis
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from itertools import zip_longest # to iterate over many lists at the same time

############# NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim
import pyLDAvis.gensim_models
import vaderSentiment # Vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
########################### Read datasets from repository's data folder ############################
movie_mtd = pd.read_table('data/movie_metadata_CMU_IMDB.csv', sep=',')
actor_mtd = pd.read_table('data/actor_metadata_CMU.csv', sep=',')
personas_mtd = pd.read_table('data/personas_metadata_CMU.csv', sep=',')

plot_summaries_df = pd.read_csv('data/plot_summaries_CMU.csv', sep = ',', usecols= ['ID', 'Summary'])

In [None]:
# Set a global background theme for all our plots and ignore warnings
sns.set_theme(style="darkgrid")
warnings.filterwarnings("ignore")

### Subdataset creation to faciliates the analysis

In [7]:
# Creating deep copies of original data frame to avoid messing it up
all_movies = movie_mtd.copy() # all movies in the dataset, no matter their production country
us_movies = movie_mtd.copy() 
us_partially_movies = movie_mtd.copy()
all_us_movies = movie_mtd.copy()
rest_world_non_US = movie_mtd.copy()

## Filtering the newly created dataframes for the analysis of nox office revenue, IMDb ratings and runtimes
# Movies strictly produced in the US
us_movies = us_movies[us_movies['countries'] == 'United States of America']
# Movies partially produced in the US
us_partially_movies = us_partially_movies[(us_partially_movies['countries'].str.contains('United States of America') == True) & (us_partially_movies['countries'] != 'United States of America')]
# Contains the movies strictly produced in the US and the ones partially produced in the US
all_us_movies = all_us_movies[(all_us_movies['countries'] == 'United States of America') | (all_us_movies['countries'].str.contains('United States of America') == True)]

# Non-US movies
rest_world_non_US = rest_world_non_US[(rest_world_non_US['countries'].str.contains('United States of America') == False) & (rest_world_non_US['countries'] != 'United States of America')]

In [8]:
us_movies.head()

Unnamed: 0,wiki_movie_ID,freebase_movie_ID,title,release_date,box_office,runtime,languages,countries,genres_CMU,release_year,genres_IMDB,averageRating,numVotes
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,English Language,United States of America,"Thriller, Science Fiction, Horror, Adventure, Supernatural, Action, Space western",2001.0,"Action,Horror,Sci-Fi",4.9,58894.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenet Ramsey Mystery,2000-02-16,,95.0,English Language,United States of America,"Mystery, Biographical film, Drama, Crime Drama",2000.0,,,
5,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"Silent film, English Language",United States of America,"Short Film, Silent film, Indie, Black-and-white, Comedy",1913.0,"Comedy,Short",7.2,19.0
7,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,English Language,United States of America,"Musical, Comedy, Black-and-white",1938.0,"Drama,Music,Musical",6.8,2357.0
10,175026,/m/017n1p,Sarah and Son,1930,,86.0,English Language,United States of America,"Drama, Black-and-white",1930.0,"Drama,Romance",5.4,356.0


### Opening of the Plot summaries dataset

In [9]:
# Increase the display width to see more characters per column
pd.set_option('display.max_colwidth', None)  # This removes the column width limit

# Display the first 3 lines of each summary in the 'Summary' column
plot_summaries_df['Preview'] = plot_summaries_df['Summary'].apply(lambda x: ' '.join(x.split()[:50])) # x.split() splits the text into individual words, .join rebuilds the string after
# splitlines() to split into separate lines if the text already contains line breaks

# Show the first few rows with the Preview column
plot_summaries_df[['ID', 'Preview']].head()

# sort the plot summaries according to the Wikipedia movie ID
plot_summaries_df_sorted = plot_summaries_df.sort_values(ascending = True , by= 'ID')
plot_summaries_df_sorted[['ID', 'Preview']].head()

print(f"The size of the plot summaries dataset is {plot_summaries_df.shape}.")

plot_summaries_df.head()

The size of the plot summaries dataset is (42303, 3).


Unnamed: 0,ID,Summary,Preview
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.","Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."
1,31186339,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the ""Career"" tributes who train intensively at special academies and almost always win. During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss. She is outraged, believing it to be a ploy to gain audience support, as ""sponsors"" may provide in-Games gifts of food, medicine, and tools. However, she discovers Peeta meant what he said. The televised Games begin with half of the tributes killed in the first few minutes; Katniss barely survives ignoring Haymitch's advice to run away from the melee over the tempting supplies and weapons strewn in front of a structure called the Cornucopia. Peeta forms an uneasy alliance with the four Careers. They later find Katniss and corner her up a tree. Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch. Katniss drops it on her sleeping besiegers. They all scatter, except for Glimmer, who is killed by the insects. Hallucinating due to tracker jacker venom, Katniss is warned to run away by Peeta. Rue cares for Katniss for a couple of days until she recovers. Meanwhile, the alliance has gathered all the supplies into a pile. Katniss has Rue draw them off, then destroys the stockpile by setting off the mines planted around it. Furious, Cato kills the boy assigned to guard it. As Katniss runs from the scene, she hears Rue calling her name. She finds Rue trapped and releases her. Marvel, a tribute from District 1, throws a spear at Katniss, but she dodges the spear, causing it to stab Rue in the stomach instead. Katniss shoots him dead with an arrow. She then comforts the dying Rue with a song. Afterward, she gathers and arranges flowers around Rue's body. When this is televised, it sparks a riot in Rue's District 11. President Snow summons Seneca Crane, the Gamemaker, to express his displeasure at the way the Games are turning out. Since Katniss and Peeta have been presented to the public as ""star-crossed lovers"", Haymitch is able to convince Crane to make a rule change to avoid inciting further riots. It is announced that tributes from the same district can win as a pair. Upon hearing this, Katniss searches for Peeta and finds him with an infected sword wound in the leg. She portrays herself as deeply in love with him and gains a sponsor's gift of soup. An announcer proclaims a feast, where the thing each survivor needs most will be provided. Peeta begs her not to risk getting him medicine. Katniss promises not to go, but after he falls asleep, she heads to the feast. Clove ambushes her and pins her down. As Clove gloats, Thresh, the other District 11 tribute, kills Clove after overhearing her tormenting Katniss about killing Rue. He spares Katniss ""just this time...for Rue"". The medicine works, keeping Peeta mobile. Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous. Crane changes the time of day in the arena to late at night and unleashes a pack of hound-like creatures to speed things up. They kill Thresh and force Katniss and Peeta to flee to the roof of the Cornucopia, where they encounter Cato. After a battle, Katniss wounds Cato with an arrow and Peeta hurls him to the creatures below. Katniss shoots Cato to spare him a prolonged death. With Peeta and Katniss apparently victorious, the rule change allowing two winners is suddenly revoked. Peeta tells Katniss to shoot him. Instead, she gives him half of the nightlock. However, before they can commit suicide, they are hastily proclaimed the victors of the 74th Hunger Games. Haymitch warns Katniss that she has made powerful enemies after her display of defiance. She and Peeta return to District 12, while Crane is locked in a room with a bowl of nightlock berries, and President Snow considers the situation.","The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in"
2,20663735,"Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate. Induchoodan, the only son of Justice Maranchery Karunakara Menon was framed in the case by Manapally Madhavan Nambiar and his crony DYSP Sankaranarayanan to take revenge on idealist judge Menon who had earlier given jail sentence to Manapally in a corruption case. Induchoodan, who had achieved top rank in Indian Civil Service loses the post and Manapally Sudheeran ([[Saikumar enters the list of civil service trainees. We learn in flashback that it was Ramakrishnan the son of Moopil Nair , who had actually killed his classmate. Six years passes by and Manapally Madhavan Nambiar, now a former state minister, is dead and Induchoodan, who is all rage at the gross injustice meted out to him - thus destroying his promising life, is released from prison. Induchoodan thwarts Manapally Pavithran from performing the funeral rituals of Nambiar at Bharathapuzha. Many confrontations between Induchoodan and Manapally's henchmen follow. Induchoodan also falls in love with Anuradha ([[Aishwarya , the strong-willed and independent-minded daughter of Mooppil Nair. Justice Menon and his wife returns back to Kerala to stay with Induchoodan. There is an appearance of a girl named Indulekha ([[Kanaka , who claims to be the daughter of Justice Menon. Menon flatly refuses the claim and banishes her. Forced by circumstances and at the instigation and help of Manapally Pavithran, she reluctantly come out open with the claim. Induchoodan at first thrashes the protesters. But upon knowing the truth from Chandrabhanu his uncle, he accepts the task of her protection in the capacity as elder brother. Induchoodan decides to marry off Indulekha to his good friend Jayakrishnan . Induchoodan has a confrontation with his father and prods him to accept mistake and acknowledge the parentage of Indulekha. Menon ultimately regrets and goes on to confess to his daughter. The very next day, when Induchoodan returns to Poovally, Indulekha is found dead and Menon is accused of murdering her. The whole act was planned by Pavithran, who after killing Indulekha, forces Raman Nair to testify against Menon in court. In court, Nandagopal Maarar , a close friend of Induchoodan and a famous supreme court lawyer, appears for Menon and manages to lay bare the murder plot and hidden intentions of other party . Menon is judged innocent of the crime by court. After confronting Pavithran and promising just retribution to the crime of killing Indulekha, Induchoodan returns to his father, who now shows remorse for all his actions including not believing in the innocence of his son. But while speaking to Induchoodan, Menon suffers a heart stroke and passes away. At Menon's funeral, Manapally Pavithran arrives to poke fun at Induchoodan and he also tries to carry out the postponed last rituals of his own father. Induchoodan interrupts the ritual and avenges for the death of his sister and father by severely injuring Pavithran. On his way back to peaceful life, Induchoodan accepts Anuradha as his life partner.","Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate. Induchoodan, the only son of Justice Maranchery Karunakara Menon was framed in the case by Manapally Madhavan Nambiar and his crony DYSP Sankaranarayanan to take revenge on idealist judge Menon who had earlier given jail sentence to"
3,2231378,"The Lemon Drop Kid , a New York City swindler, is illegally touting horses at a Florida racetrack. After several successful hustles, the Kid comes across a beautiful, but gullible, woman intending to bet a lot of money. The Kid convinces her to switch her bet, employing a prefabricated con. Unfortunately for the Kid, the woman ""belongs"" to notorious gangster Moose Moran , as does the money. The Kid's choice finishes dead last and a furious Moran demands the Kid provide him with $10,000 by Christmas Eve, or the Kid ""won't make it to New Year's."" The Kid decides to return to New York to try to come up with the money. He first tries his on-again, off-again girlfriend Brainy Baxter . However, when talk of long-term commitment arises, the Kid quickly makes an escape. He next visits local crime boss ""Oxford"" Charley , with whom he has had past dealings. This falls through as Charley is in serious tax trouble and does not particularly care for the Kid anyway. As he leaves Charley's establishment and is about to give up hope, the Kid notices a cornerside Santa Claus and his kettle. Thinking quickly, the Kid fashions himself a Santa suit and begins collecting donations. This fails as he is recognized by a passing policeman, who remembers his previous underhanded activity well. The Kid lands in court, where he is convicted of collecting for a charity without a license and sentenced to ten days in jail . However, while in court, the Kid learns where his scheme went wrong. After a short stay, Brainy arrives to bail him out. He then sets about restarting his Santa operation, this time with legitimate backing. To this end, he needs a charity to represent and a city license. The kid receives key inspiration when he remembers that Nellie Thursday , a kindly neighborhood resident, has been denied entry to a retirement home because of her jailed husband's criminal past as a safecracker. Organizing other small-time New York swindlers and Brainy, who is both surprised and charmed at the Kid's apparent goodwill, the Kid converts an abandoned casino into the ""Nellie Thursday Home For Old Dolls"". A small group of elderly women and makeshift amenities complete the project. The Kid is able to receive the all-important city license. Now free to collect, the Kid and his compatriots dress as Santa Claus and position themselves throughout Manhattan. The others are unaware that the Kid plans to keep the money for himself to pay off Moran. The scheme is a huge success, netting $2,000 in only a few days. An overjoyed Brainy decides to leave her job as a dancer and look after the ""home"" full-time until after Christmas. Coincidentally, her employer is none other than ""Oxford"" Charley, whom Brainy cheerfully informs of the effort. Seeing a potential gold mine, Charley decides to muscle in on the operation. Reasoning that the Nellie Thursday home is ""wherever Nellie Thursday is"", Charley and his crew kidnap the home's inhabitants and move them to Charley's mansion in Nyack. The Kid learns of this when he returns to the home after a late night to find the home deserted and money gone. Clued in by oversized Oxford footprints in the snow, the Kid and his friends pay Charley a visit. Here, Charley reveals the true nature of the Kid's scheme through a phone conversation with Moose Moran. The Kid's accomplices are angry and move to confront him, but the Kid manages to slip away. However, Brainy tracks him down outside and voices her disgust at his actions. After a few days of stewing in self-pity , the Kid is surprised to meet Nellie, who has escaped Charley's compound. He decides to recover the money, sneaking into Charley's home in the guise of an elderly woman. He finds that Charley and his crew are again moving the women, this time to a more secure location. Using the heightened activity to his advantage, the Kid enters Charley's office and confronts him. After a brief struggle, the Kid overpowers Charley and makes off with the money, narrowly avoiding the thugs Charley has sent after him. The ensuing chaos allows Brainy and the others to escape. Later that night, the Kid returns to the original Nellie Thursday home to meet with Moose Moran . The deal appears to be in jeopardy as Moran arrives with Charley. Charley demands that the Kid reimburse him, which would leave too little for Moran. However, the Kid turns the tables by hitting a switch, revealing hidden casino tables. All are occupied, mainly by the escaped old dolls. The Kid and his still-loyal friends hold off the gangsters as the police initiate a raid. Moran and Charley are arrested while the judge who sentenced the Kid earlier warns that he will be ""keeping an eye on him"". The Kid assures him that will not be necessary and his attention will lie on the home, which is going to become a reality. The night's main event begins as Nellie's husband Henry, free on parole, joyously reunites with his wife.","The Lemon Drop Kid , a New York City swindler, is illegally touting horses at a Florida racetrack. After several successful hustles, the Kid comes across a beautiful, but gullible, woman intending to bet a lot of money. The Kid convinces her to switch her bet, employing a prefabricated con."
4,595909,"Seventh-day Adventist Church pastor Michael Chamberlain, his wife Lindy, their two sons, and their nine-week-old daughter Azaria are on a camping holiday in the Outback. With the baby sleeping in their tent, the family is enjoying a barbecue with their fellow campers when a cry is heard. Lindy returns to the tent to check on Azaria and is certain she sees a dingo with something in its mouth running off as she approaches. When she discovers the infant is missing, everyone joins forces to search for her, without success. It is assumed what Lindy saw was the animal carrying off the child, and a subsequent inquest rules her account of events is true. The tide of public opinion soon turns against the Chamberlains. For many, Lindy seems too stoic, too cold-hearted, and too accepting of the disaster that has befallen her. Gossip about her begins to swell and soon is accepted as statements of fact. The couple's beliefs are not widely practised in the country, and when the media report a rumour that the name Azaria means ""sacrifice in the wilderness"" , the public is quick to believe they decapitated their baby with a pair of scissors as part of a bizarre religious rite. Law-enforcement officials find new witnesses, forensics experts, and a lot of circumstantial evidence—including a small wooden coffin Michael uses as a receptacle for his parishioners' packs of un-smoked cigarettes—and reopen the investigation, and eventually Lindy is charged with murder. Seven months pregnant, she ignores her attorneys' advice to play on the jury's sympathy and appears emotionless on the stand, convincing onlookers she is guilty of the crime of which she is accused. As the trial progresses, Michael's faith in his religion and his belief in his wife disintegrate, and he stumbles through his testimony, suggesting he is concealing the truth. In October 1982, Lindy is found guilty and sentenced to life imprisonment with hard labour, while Michael is found guilty as an accessory and given an 18-month suspended sentence. More than three years later, while searching for the body of an English tourist who fell from Uluru, police discover a small item of clothing that is identified as the jacket Lindy had insisted Azaria was wearing over her jumpsuit, which had been recovered early in the investigation. She is immediately released from prison, the case reopened and all convictions against the Chamberlains overturned.","Seventh-day Adventist Church pastor Michael Chamberlain, his wife Lindy, their two sons, and their nine-week-old daughter Azaria are on a camping holiday in the Outback. With the baby sleeping in their tent, the family is enjoying a barbecue with their fellow campers when a cry is heard. Lindy returns to"


### Merging the plot summaries dataset with the movie_mmtd dataset in order to distinguish between US-only. partially US and non US produced movies

In [16]:
# Performing inner joins to be sure each movie kept after the merge has a plot summaries and filtering to keep only useful columnbs for the rest of the analysis
plot_summaries_all_movies = pd.merge(plot_summaries_df, all_movies, left_on= 'ID', right_on= 'wiki_movie_ID', how = 'inner').filter(items=['ID', 'Summary','title', 'box_office', 'language', 'genres_IMDB', 'release_year', 'averageRating', 'numVotes'], axis = 1)
#.drop(columns= ['Preview', 'wiki_movie_ID', 'freebase_movie_ID', 'release_date', 'runtime', 'countries', 'genres_CMU'], inplace= True)

plot_summaries_us_movies = pd.merge(plot_summaries_df, us_movies, left_on= 'ID', right_on= 'wiki_movie_ID', how = 'inner').filter(items=['ID', 'Summary', 'title', 'box_office', 'language', 'genres_IMDB', 'release_year', 'averageRating', 'numVotes'], axis = 1)

plot_summaries_us_partially_movies = pd.merge(plot_summaries_df, us_partially_movies, left_on= 'ID', right_on= 'wiki_movie_ID', how = 'inner').filter(items=['ID', 'Summary', 'title', 'box_office', 'language', 'genres_IMDB', 'release_year', 'averageRating', 'numVotes'], axis = 1)

plot_summaries_RoW_movies = pd.merge(plot_summaries_df, rest_world_non_US, left_on= 'ID', right_on= 'wiki_movie_ID', how = 'inner').filter(items=['ID', 'Summary', 'title', 'box_office', 'language', 'genres_IMDB', 'release_year', 'averageRating', 'numVotes'], axis = 1)

print(f"The size of the plot summaries dataframe merged with the dataset containing all the movies metadata is {plot_summaries_all_movies.shape}.")
print(f"The size of the plot summaries dataframe merged with the dataset containing US-only produced movies is {plot_summaries_us_movies.shape}.")
print(f"The size of the plot summaries dataframe merged with the dataset containing all the partially US-produced movies is {plot_summaries_us_partially_movies.shape}.")
print(f"The size of the plot summaries dataframe merged with the dataset containing all Non US-produced movies is {plot_summaries_RoW_movies.shape}.")

plot_summaries_us_movies.head()

The size of the plot summaries dataframe merged with the dataset containing all the movies metadata is (42208, 8).
The size of the plot summaries dataframe merged with the dataset containing US-only produced movies is (17776, 8).
The size of the plot summaries dataframe merged with the dataset containing all the partially US-produced movies is (3015, 8).
The size of the plot summaries dataframe merged with the dataset containing all Non US-produced movies is (18105, 8).


Unnamed: 0,ID,Summary,title,box_office,genres_IMDB,release_year,averageRating,numVotes
0,31186339,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the ""Career"" tributes who train intensively at special academies and almost always win. During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss. She is outraged, believing it to be a ploy to gain audience support, as ""sponsors"" may provide in-Games gifts of food, medicine, and tools. However, she discovers Peeta meant what he said. The televised Games begin with half of the tributes killed in the first few minutes; Katniss barely survives ignoring Haymitch's advice to run away from the melee over the tempting supplies and weapons strewn in front of a structure called the Cornucopia. Peeta forms an uneasy alliance with the four Careers. They later find Katniss and corner her up a tree. Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch. Katniss drops it on her sleeping besiegers. They all scatter, except for Glimmer, who is killed by the insects. Hallucinating due to tracker jacker venom, Katniss is warned to run away by Peeta. Rue cares for Katniss for a couple of days until she recovers. Meanwhile, the alliance has gathered all the supplies into a pile. Katniss has Rue draw them off, then destroys the stockpile by setting off the mines planted around it. Furious, Cato kills the boy assigned to guard it. As Katniss runs from the scene, she hears Rue calling her name. She finds Rue trapped and releases her. Marvel, a tribute from District 1, throws a spear at Katniss, but she dodges the spear, causing it to stab Rue in the stomach instead. Katniss shoots him dead with an arrow. She then comforts the dying Rue with a song. Afterward, she gathers and arranges flowers around Rue's body. When this is televised, it sparks a riot in Rue's District 11. President Snow summons Seneca Crane, the Gamemaker, to express his displeasure at the way the Games are turning out. Since Katniss and Peeta have been presented to the public as ""star-crossed lovers"", Haymitch is able to convince Crane to make a rule change to avoid inciting further riots. It is announced that tributes from the same district can win as a pair. Upon hearing this, Katniss searches for Peeta and finds him with an infected sword wound in the leg. She portrays herself as deeply in love with him and gains a sponsor's gift of soup. An announcer proclaims a feast, where the thing each survivor needs most will be provided. Peeta begs her not to risk getting him medicine. Katniss promises not to go, but after he falls asleep, she heads to the feast. Clove ambushes her and pins her down. As Clove gloats, Thresh, the other District 11 tribute, kills Clove after overhearing her tormenting Katniss about killing Rue. He spares Katniss ""just this time...for Rue"". The medicine works, keeping Peeta mobile. Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous. Crane changes the time of day in the arena to late at night and unleashes a pack of hound-like creatures to speed things up. They kill Thresh and force Katniss and Peeta to flee to the roof of the Cornucopia, where they encounter Cato. After a battle, Katniss wounds Cato with an arrow and Peeta hurls him to the creatures below. Katniss shoots Cato to spare him a prolonged death. With Peeta and Katniss apparently victorious, the rule change allowing two winners is suddenly revoked. Peeta tells Katniss to shoot him. Instead, she gives him half of the nightlock. However, before they can commit suicide, they are hastily proclaimed the victors of the 74th Hunger Games. Haymitch warns Katniss that she has made powerful enemies after her display of defiance. She and Peeta return to District 12, while Crane is locked in a room with a bowl of nightlock berries, and President Snow considers the situation.",The Hunger Games,686533290.0,"Action,Adventure,Sci-Fi",2012.0,7.2,1022897.0
1,2231378,"The Lemon Drop Kid , a New York City swindler, is illegally touting horses at a Florida racetrack. After several successful hustles, the Kid comes across a beautiful, but gullible, woman intending to bet a lot of money. The Kid convinces her to switch her bet, employing a prefabricated con. Unfortunately for the Kid, the woman ""belongs"" to notorious gangster Moose Moran , as does the money. The Kid's choice finishes dead last and a furious Moran demands the Kid provide him with $10,000 by Christmas Eve, or the Kid ""won't make it to New Year's."" The Kid decides to return to New York to try to come up with the money. He first tries his on-again, off-again girlfriend Brainy Baxter . However, when talk of long-term commitment arises, the Kid quickly makes an escape. He next visits local crime boss ""Oxford"" Charley , with whom he has had past dealings. This falls through as Charley is in serious tax trouble and does not particularly care for the Kid anyway. As he leaves Charley's establishment and is about to give up hope, the Kid notices a cornerside Santa Claus and his kettle. Thinking quickly, the Kid fashions himself a Santa suit and begins collecting donations. This fails as he is recognized by a passing policeman, who remembers his previous underhanded activity well. The Kid lands in court, where he is convicted of collecting for a charity without a license and sentenced to ten days in jail . However, while in court, the Kid learns where his scheme went wrong. After a short stay, Brainy arrives to bail him out. He then sets about restarting his Santa operation, this time with legitimate backing. To this end, he needs a charity to represent and a city license. The kid receives key inspiration when he remembers that Nellie Thursday , a kindly neighborhood resident, has been denied entry to a retirement home because of her jailed husband's criminal past as a safecracker. Organizing other small-time New York swindlers and Brainy, who is both surprised and charmed at the Kid's apparent goodwill, the Kid converts an abandoned casino into the ""Nellie Thursday Home For Old Dolls"". A small group of elderly women and makeshift amenities complete the project. The Kid is able to receive the all-important city license. Now free to collect, the Kid and his compatriots dress as Santa Claus and position themselves throughout Manhattan. The others are unaware that the Kid plans to keep the money for himself to pay off Moran. The scheme is a huge success, netting $2,000 in only a few days. An overjoyed Brainy decides to leave her job as a dancer and look after the ""home"" full-time until after Christmas. Coincidentally, her employer is none other than ""Oxford"" Charley, whom Brainy cheerfully informs of the effort. Seeing a potential gold mine, Charley decides to muscle in on the operation. Reasoning that the Nellie Thursday home is ""wherever Nellie Thursday is"", Charley and his crew kidnap the home's inhabitants and move them to Charley's mansion in Nyack. The Kid learns of this when he returns to the home after a late night to find the home deserted and money gone. Clued in by oversized Oxford footprints in the snow, the Kid and his friends pay Charley a visit. Here, Charley reveals the true nature of the Kid's scheme through a phone conversation with Moose Moran. The Kid's accomplices are angry and move to confront him, but the Kid manages to slip away. However, Brainy tracks him down outside and voices her disgust at his actions. After a few days of stewing in self-pity , the Kid is surprised to meet Nellie, who has escaped Charley's compound. He decides to recover the money, sneaking into Charley's home in the guise of an elderly woman. He finds that Charley and his crew are again moving the women, this time to a more secure location. Using the heightened activity to his advantage, the Kid enters Charley's office and confronts him. After a brief struggle, the Kid overpowers Charley and makes off with the money, narrowly avoiding the thugs Charley has sent after him. The ensuing chaos allows Brainy and the others to escape. Later that night, the Kid returns to the original Nellie Thursday home to meet with Moose Moran . The deal appears to be in jeopardy as Moran arrives with Charley. Charley demands that the Kid reimburse him, which would leave too little for Moran. However, the Kid turns the tables by hitting a switch, revealing hidden casino tables. All are occupied, mainly by the escaped old dolls. The Kid and his still-loyal friends hold off the gangsters as the police initiate a raid. Moran and Charley are arrested while the judge who sentenced the Kid earlier warns that he will be ""keeping an eye on him"". The Kid assures him that will not be necessary and his attention will lie on the home, which is going to become a reality. The night's main event begins as Nellie's husband Henry, free on parole, joyously reunites with his wife.",The Lemon Drop Kid,2300000.0,"Comedy,Crime,Music",1951.0,7.0,2303.0
2,1952976,"{{plot}} The film opens in 1974, as a young girl, Dahlia, stands outside after school in the rain, waiting for her mother. Flash forward to 2005, we see a grown-up Dahlia in the midst of a bitter mediation with ex-husband, Kyle , over custody of their daughter, Cecilia . Kyle wants Cecilia to live closer to his apartment in Jersey City, but Dahlia wants to move to Roosevelt Island, where she has found a good school. Kyle threatens to sue for full custody because he feels the distance is too great. He also claims that Dahlia is ""mentally unstable."" Dahlia and Cecilia see an apartment in a complex on Roosevelt Island, which is just a few blocks from Cecilia's new school. The superintendent of the dilapidated building is Mr. Veeck . The manager is Mr. Murray . During the tour, Cecilia sneaks to the roof where she finds a Hello Kitty backpack near a large water tank. They leave the bag with Veeck, and Murray promises Cecilia that she can have it if no one claims it. Cecilia, who had disliked the apartment, now wants desperately to live there. Dahlia agrees to move in. Shortly after, the bedroom ceiling begins to leak dark water. The source is the apartment above, 10F, where the Rimsky family lived up until a month ago. Dahlia enters 10F and finds it flooded, with dark water flowing from every faucet, the walls and toilet. She finds a family portrait of the former tenants—a mother, father, and a girl Cecilia's age. Dahlia complains to both Veeck and Murray about the water, but the former does little about it despite the insistence of the latter. Dahlia soon has dreams of a little girl who appears to be Cecilia returning from a visit to her father's home, but the girl's appearance changes every time Dahlia looks away, so that she looks like the girl in the portrait in 10F. Cecilia has started school, but according to her new teacher , she isn't fitting in and is spending too much time with an imaginary friend, named Natasha. A psychologist is recommended, but Dahlia declines and tells Cecilia to ignore Natasha. Although Veeck had said it was claimed, Dahlia discovers the Hello Kitty backpack in the laundry room's garbage. Cecilia later finds it in the elevator. The name in the backpack reads ""Natasha Rimsky"". The ceiling, shoddily patched by Veeck, leaks again. At school, Cecilia appears to get into a fight with Natasha, who appears to control her hand while painting. She's taken to the girls' bathroom where she passes out after dark water gushes from the toilets and sinks. Dahlia, who is meeting with her lawyer, can't be reached, so Kyle picks her up and takes her to his apartment. Dahlia breaks down when she can't find her daughter and has strange dreams. These lead her to the roof and up the ladder of the water tank, where she finds Natasha's body. When police arrive, they discover that Natasha's father thought she was with her mother, while the mother thought the girl was with her father. The girl was left alone in the abandoned apartment and fell into the water tank, which Veeck had left open. He was aware of her body, which was why he refused to fix the water problem plaguing the complex. Veeck is arrested and Murray is questioned. Dahlia agrees to move closer to Kyle so shared custody will go easier. As Dahlia packs, Cecilia is taking a bath. A girl in a hooded bathrobe comes out of the bathroom, wanting Dahlia to read to her. When she hears voices in the bathroom, she realizes that the girl is Natasha. Natasha begs Dahlia not to leave her, but Dahlia rushes into the bathroom to save Cecilia. Natasha then locks Cecilia in the shower compartment and holds her underwater. Dahlia pleads with Natasha, promising to be her mother forever. Natasha lets Cecilia go and floods the apartment, causing Dahlia to die from drowning. Her and Natasha's spirits are shown walking down the hallway. Kyle picks up Cecilia from the police station. Weeks later, the two go back to pick up the rest of her stuff. Cecilia has a flashback of her and her mother looking at pictures together, and in the elevator, her mother's ghost braids her hair and comforts her—telling her she will always be there. Kyle, momentarily horrified with a malfunction in the elevator, the weird behavior of his daughter, and perhaps noticing her hair had been braided, finally takes her to his apartment in Jersey City.",Dark Water,49483352.0,"Drama,Horror,Mystery",2005.0,5.6,66164.0
3,24225279,"The story begins with Hannah, a young Jewish teen, as she is completing her senior year of high school. Her small neighborhood in Brooklyn is falling apart and SING! is one of the only traditions keeping the neighborhood alive. Newly arrived teacher, Miss Lombardo grew up in the neighborhood but returned to be their Sing Leader. One cold Christmas night, Miss Lombardo is leaving a neighborhood party when a young man hails her a cab, then attempts to mug her. In self defense, she bites his hand to release his grip and he screams in pain and terror, quickly making an exit. The cab driver jokes about not starting the meter yet. On the first day of school, Miss Lombardo runs into difficulty when her students are uninterested and misbehaved. One such student was named Dominic who gets scolded for bringing stolen watches to school grounds and putting his feet up on the desk. On the day of Sing Leader elections, Miss Lombardo recognizes Dominic as her mugger by the bandage on his hand and decides to blackmail him into being co-Sing Leader of the Senior class along with Hannah, who was rightfully elected. The school kids work hard to plan their Sing productions. Hannah and Dominic clash along the way as Hannah uses traditional Sing planning strategies while Dominic wants to introduce the flavor of the youth in the neighborhood. In order to put Dominic and Hannah on the same page, Miss Lombardo suggests that Hannah accompany Dominic to a local club. At first, the two are equally hesitant but Hannah agrees on the terms that it is not a date. However, by the end of the night, Hannah uses Dominic to make her ex boyfriend, Mickey, jealous, and due to this, Hannah and Dominic start seeing each other in a different light. Dominic accompanies Hannah on her walk home and the two share a romantic kiss. Once the two are finally uniting and getting along, the Dept. of Education informs the school that it will close its doors forever at the end of this semester and therefore, there will not be enough resources for them to complete this year's Sing. This fuels the kids to work even harder on their productions and the neighborhood comes together even more to help finance the show, despite the school authorities' ban. Ironically, just as things are starting to look up, Dominic reluctantly accompanies his brother on a robbery of Hannah's mother's diner, their sole source of income which already was at risk of failure due to the school's upcoming closure. A classmate saw Dominic standing outside the diner at the time of the crime and informed Hannah of what he saw. Devastated, Hannah confronts Dominic and he promises to get the money back for her. He then steals the money back from his brother and returns it to the diner, restoring Hannah's faith in him. The recent events had discouraged Dominic from fulfilling his co-Sing leader duties and he had been skipping out on rehearsals. In a moment of great need as the senior's main performer falls unconscious, Dominic steps in to save the show. He sheds his bad-boy demeanor and exceeds all expectations. The underclassmen and seniors perform to a record-high sold out audience. At the end of the show, Hannah makes a moving speech motivating the community to rejoice and always remember that despite compromising circumstances, they completed a successful Sing and proved their community's worth.",Sing,2260616.0,"Drama,Musical",1989.0,6.7,974.0
4,2462689,"Infuriated at being told to write one final column after being laid off from her newspaper job, Ann Mitchell prints a letter from a fictional unemployed ""John Doe"" threatening suicide on Christmas Eve in protest of society's ills. When the note causes a sensation and the paper's competition suspects a fraud and starts to investigate, the newspaper editor rehires Mitchell who comes up with a scheme of hiding the fictional nature of ""John Doe"" while exploiting the sensation caused by the fake letter to boost the newspaper's sales, for which she demands a bonus equal to 8 months' pay. After reviewing a number of derelicts who have shown up at the paper claiming to have penned the original suicide letter, Mitchell and editor Henry Connell hire John Willoughby , a former baseball player and tramp who is in need of money to repair his injured arm, to play John Doe. Mitchell now starts to pen an article series in Doe's name, elaborating on the letter's ideas of society's disregard of people in need. Willoughby gets $50, a new suit of clothes, and a plush hotel suite with his tramp friend , who launches into an extended diatribe against ""the heelots"", lots of heels who incessantly focus on getting money from others. Willoughby is hired to give radio speeches, guided by Mitchell who is promised $100 a week to write his speeches, paid by the newspaper's publisher, D.B. Norton ([[Edward Arnold . Willoughby turns down a $5,000 bribe to admit the whole thing was a publicity stunt, gives Mitchell's speech, and dashes off to the countryside with ""The Colonel"". They ride the rails, playing the harmonica and ocarina until they show up in Millsville, where John Doe is recognized at a diner. He's brought to City Hall, where he's met by Hanson, who gives a five-minute monologue about how he was inspired to start a local John Doe club. The John Doe philosophy spreads across the country, developing into a broad grassroots movement whose simple slogan is, ""Be a better neighbor"". Far from being an altruistic philanthropist, however, Norton plans to channel the support for Doe into support for his own national political ambitions. As a culmination of this plan, Norton has instructed Mitchell to write a speech for Willoughby in which he announces the foundation of a new political party and endorses Norton as its presidential candidate. When Willoughby, who has come to believe in the John Doe philosophy himself, realizes that he is being used, he tries to expose the plot, but is first stymied in his attempts to talk his own mind to a nationwide radio audience at the rally instead of reading the prepared speech, and then exposed as a fake by Norton, who claims to have been deceived, like everyone else, by the staff of the newspaper. Frustrated by his failure, Willoughby intends to commit suicide by jumping from the roof of the City Hall on Christmas Eve, as indicated in the original John Doe letter. Only the intervention of Mitchell and followers of the John Doe clubs persuades him to renege on his threat to kill himself. At this point in the movie, a reference to Jesus Christ is made, that a historical ""John Doe"" has already died for the sake of humanity. The film ends with Connell turning to Norton and saying, ""There you are, Norton! The people! Try and lick that!""",Meet John Doe,,"Comedy,Drama,Romance",1941.0,7.6,15255.0


### Basics stats on the plot summaries

In [None]:
def word_counter(df, lower_bound=1, upper_bound=20): #setting default lower and upper bounds value
    """
    Function that counts the number of plot summaries with word counts 
    within specified bounds.

    Parameters:
        df (pd.DataFrame): DataFrame containing a column 'Summary'.
        lower_bound (int, optional): Minimum number of words in a plot summary (exclusive).
                                     Use None to apply no lower bound. Default is 1.
        upper_bound (int, optional): Maximum number of words in a plot summary (inclusive).
                                     Default is 20.

    Returns:
        int: The count of summaries within the specified word count range.
    """
    if 'Summary' not in df.columns:
        raise ValueError("The dataframe must have a column named 'Summary'.")

    # Precompute word counts for efficiency
    word_counts = df['Summary'].apply(lambda x: len(str(x).split()))

    if lower_bound is None:
        # Apply only the upper bound
        return word_counts[word_counts >= upper_bound].count()
    else:
        # Apply both lower and upper bounds
        return word_counts[(word_counts > lower_bound) & (word_counts <= upper_bound)].count()


list_name_df = ['All movies', 'US-only produced movies', 'Partially US-produced movies', 'Non US-produced movies']
list_df_plot_summaries = [plot_summaries_all_movies, plot_summaries_us_movies, plot_summaries_us_partially_movies, plot_summaries_RoW_movies]
list_set_number_words = [[1, 20], [20, 100], [100, 300], [300,500], [500, 1000], [1000, 2000], [2000, 3000]]
dict_plot_count = {'All movies': [], 'US-only produced movies': [], 
                   'Partially US-produced movies': [], 'Non US-produced movies': [], 
                   }

# Use of "zip" will make the loop terminates when the shortest iterable is exhausted so one uses zip_longest in order to loop through all the elements of the longest list, here list_set_number_words
# Create dictionary for counts
dict_plot_count = {name: [] for name in list_name_df}
interval_names = [f"{low}-{high}" for low, high in list_set_number_words]

# Process DataFrames and intervals
for df, dico_name in zip(list_df_plot_summaries, list_name_df):
    for interval in list_set_number_words:
        word_count = word_counter(df, lower_bound=interval[0], upper_bound=interval[1])
        dict_plot_count[dico_name].append(word_count)

# Create DataFrame with interval names as index
df_word_count_plot_summaries = pd.DataFrame(dict_plot_count, index=interval_names)

print("The below table shows the number of plot summaries satisfying the number of word range (rows of the df)")
df_word_count_plot_summaries.head()


# threshold = 0.02 * sum(dict_number_words.values())  # Set a 2% threshold
# adjusted_words_numbers = {k: v for k, v in dict_number_words.items() if v >= threshold}
# adjusted_words_numbers['Other'] = sum(v for k, v in dict_number_words.items() if v < threshold)

# # Create the pie chart with the adjusted counts
# fig, ax = plt.subplots(figsize=(10, 6))  # Increase figure size
# ax.pie(
#     adjusted_words_numbers.values(),
#     labels=adjusted_words_numbers.keys(),
#     autopct='%1.1f%%',
#     textprops={'size': 'smaller'},  # Adjust text size
#     startangle=140,  # Start the chart at a different angle for better spacing
#     wedgeprops={'edgecolor': 'white'}  # Add separation between slices
# )

# # Add a title and display the chart
# ax.set_title("Visualization of the numbers of words per plot summary")
# plt.show()


Unnamed: 0,All movies,US-only produced movies,Partially US-produced movies,Non US-produced movies
1-20,397,133,11,180
20-100,12137,4639,531,5662
100-300,14406,5600,702,6905
300-500,5933,2745,450,2392
500-1000,7867,3973,1073,2501


In [None]:
# # Calculate the mean number of words per summary
# mean_number_words_per_summary = plot_summaries_df['word_count'].mean()

# # Count the number of words in each summary using len(x.split()) directly
# plot_summaries_df['word_count'] = plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) # count words in a string by splitting on spaces 8by default)

# number_summaries_bw_1_20 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 1) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 20)
# ]['Summary'].count()

# number_summaries_bw_20_100 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 20) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 100)
# ]['Summary'].count()

# number_summaries_bw_100_300 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 100) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 300)
# ]['Summary'].count()

# number_summaries_bw_300_500 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 300) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 500)
# ]['Summary'].count()

# number_summaries_bw_500_1000 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 500) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 1000)
# ]['Summary'].count()

# number_summaries_bw_1000_2000 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 1000) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 2000)
# ]['Summary'].count()

# number_summaries_bw_2000_3000 = plot_summaries_df[
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 2000) & 
#     (plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) <= 3000)
# ]['Summary'].count()

# number_summaries_more_500_words = plot_summaries_df[plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) >= 500]['Summary'].count()
# number_summaries_more_1000_words = plot_summaries_df[plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) >= 1000]['Summary'].count()
# number_summaries_more_3000_words = plot_summaries_df[plot_summaries_df['Summary'].apply(lambda x: len(str(x).split())) > 3000]['Summary'].count()

# dict_number_words = {'# words in ]1,20]': number_summaries_bw_1_20, '# words in ]20,100]': number_summaries_bw_20_100, '# words in ]100,300]': number_summaries_bw_100_300, '# words in ]300,500]': number_summaries_bw_300_500, '# words in ]500,1000]': number_summaries_bw_500_1000, '# words in ]1000,2000]': number_summaries_bw_1000_2000, '# words in ]2000,3000]': number_summaries_bw_2000_3000, '# words > 3000': number_summaries_more_3000_words}

# print(f"Mean number of words per summary: {mean_number_words_per_summary:.2f}")
# print(f"The number of plot summaries with with number of words between 1 and 20:  {number_summaries_bw_1_20}.")
# print(f"The number of plot summaries with with number of words between 20 and 100:  {number_summaries_bw_20_100}.")
# print(f"The number of plot summaries with with number of words between 100 and 300:  {number_summaries_bw_100_300}.")
# print(f"The number of plot summaries with with number of words between 300 and 500:  {number_summaries_bw_300_500}.")
# print(f"The number of plot summaries with with number of words between 500 and 1000:  {number_summaries_bw_500_1000}.")
# print(f"The number of plot summaries with with number of words between 1000 and 2000:  {number_summaries_bw_1000_2000}.")
# print(f"The number of plot summaries with with number of words between 2000 and 3000:  {number_summaries_bw_2000_3000}.")
# print("\n")

# print(f"The number of plot summaries with more than 500 words is {number_summaries_more_500_words}.")
# print(f"The number of plot summaries with more than 1000 words is {number_summaries_more_1000_words}.")
# print(f"The number of plot summaries with more than 3000 words is {number_summaries_more_3000_words}.")