In [None]:

# Standard library
import math

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import linear_model  # was `linear_modelS`, which does not exist in sklearn

# Project helpers: genre-change analysis
from src.utils.analysis_movies_function import (
    extract_us_nonus,
    get_all_genre,
    get_movies_genre_change,
    linear_regression_plot,
    plot_movies_genre_change,
    plot_percentage_movies_genre_all,
)

# Project helpers: key-word analysis of plot summaries
from src.utils.analysis_plots_function import (
    count_key_words,
    percentage_key_words_before_after,
    plot_key_words_occ,
    plot_key_words_occ_zoomed,
)


ModuleNotFoundError: No module named 'src.utils.analysis_movies_functions'

# Loading the data

All the raw data preprocessing, and basic analysis can be found in the jupyter notebook `src/utils/preprocessing.ipynb`. 
In the preprocessing, we did several things: 
- We removed outliers from the dataset (example: a movie with several years of runtime...)
- We transformed data in a more useful way (example: string date to datetime, dict to list...)
- We added new columns from the existing ones and inferred some values based on the majority of the data 
- We matched the cleaned ethnicity ID with another dataset to get meaningful ethnicities 

The raw datasets are in `/src/data` and the cleaned and formatted datasets are then saved as pkl files in `/data`

In [None]:
# Load the three cleaned datasets saved as pickle files under `data/`.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
movies, tvtropes, plot_summaries = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        'data/movies.pkl',
        'data/tvtropes.pkl',
        'data/plot_summaries.pkl',
    )
)

### US selection

In [None]:
# Split `movies` into the two subsets returned by extract_us_nonus
# (bound here as the US and non-US frames).
(us_movies, nonus_movies) = extract_us_nonus(movies)

# Genre collection computed from the full `movies` frame; reused by every
# get_movies_genre_change call below.
genres_unique = get_all_genre(movies)

# 9/11 US

The genres with the highest change are social issues, horror, and political cinema.

In [None]:
# Genre change table for US movies, years 1997 / 2002 / 2006.
# NOTE(review): presumably -1 means "do not limit the number of genres" - confirm
# against get_movies_genre_change.
df = get_movies_genre_change(
    us_movies,
    genres_unique,
    -1,
    1997,
    2002,
    2006,
)

In [None]:
# Summary statistics of the `count_before` column, then its median.
print(
    df["count_before"].describe(),
    "median",
    df["count_before"].median(),
    sep="\n",
)

In [None]:
# Summary statistics of the `count_after` column, then its median.
print(
    df["count_after"].describe(),
    "median",
    df["count_after"].median(),
    sep="\n",
)

We notice here that the distribution of genres is right-skewed, with a maximum percentage lower than 10% and a median around 0.03%. We should keep this in mind when we look at the percentages: the focus should be on the change rather than on the absolute value.

In [None]:
# To find the most impacted genres we use a small time range, which limits
# confounding factors. The third argument (20) selects the 20 genres that
# change the most.
df = get_movies_genre_change(
    us_movies, genres_unique, 20, 1997, 2002, 2006
)
plot_movies_genre_change(df)

# Independent two-sample t-test between the before and after columns.
# NOTE(review): the two columns look like paired measurements (one row per
# genre); if so, stats.ttest_rel would be the matching test - confirm.
t_statistic, p_value = stats.ttest_ind(df["count_before"], df["count_after"])

print("t-statistic:", t_statistic)
print("p-value:", p_value)

For the 20 genres that change the most, there is a significant difference between the values before and after.

In [None]:
# Subset of the 20 most-changing genres: the ones related to politics and
# social topics. Used by the percentage plots for both US and non-US movies.
genres = [
    'Social issues',
    'Road-Horror',
    'Political satire',
    'Film & Television History',
    'Political cinema',
    'Law & Crime',
    'Culture & Society',
    'Zombie Film',
    'Dystopia',
    'Monster',
]

In [None]:
# Genre change table restricted to 10 genres.
# NOTE(review): `df` is not passed to the plot call below; the plot uses the
# hand-picked `genres` list instead.
df = get_movies_genre_change(
    us_movies, genres_unique, 10, 1997, 2002, 2006
)

# Per-genre percentage over 1992-2010 for US movies, with 2002 as the
# reference year.
# NOTE(review): the trailing 5, 2 presumably give the subplot grid for the
# 10 genres - confirm.
plot_percentage_movies_genre_all(
    us_movies,
    genres,
    1992,
    2010,
    2002,
    5,
    2,
)

We notice an upward trend in the social issues genre after 2002. A similar pattern appears in political cinema, where the trend shifts from a decrease to an increase. Additionally, the road horror genre shows a peak after 2002

In [None]:
# Linear regression plot for the "Political cinema" genre in US movies,
# over 1992-2010 with 2002 as the reference year.
linear_regression_plot(
    us_movies,
    1992,
    2010,
    2002,
    "Political cinema",
)



# Non US movies

In [None]:
# Same top-20 genre change analysis as for the US, applied to non-US movies.
df = get_movies_genre_change(
    nonus_movies, genres_unique, 20, 1997, 2002, 2006
)
plot_movies_genre_change(df)

In [None]:
# Genre change table restricted to 10 genres, for non-US movies.
# NOTE(review): `df` is not passed to the plot call below; the plot reuses the
# `genres` list selected from the US analysis.
df = get_movies_genre_change(
    nonus_movies, genres_unique, 10, 1997, 2002, 2006
)

# Per-genre percentage over 1992-2010 for non-US movies, with 2002 as the
# reference year (same trailing arguments as the US plot).
plot_percentage_movies_genre_all(
    nonus_movies,
    genres,
    1992,
    2010,
    2002,
    5,
    2,
)

We observe that the 20 genres with the highest changes are not completely identical between the U.S. and the rest of the world. Additionally, examining the time series for each genre studied in the U.S., we see that genres like political cinema, for example, show different trends in variation when compared to their counterparts outside the U.S.

In [None]:
# Linear regression plot for the "Political cinema" genre in non-US movies,
# over 1992-2010 with 2002 as the reference year.
linear_regression_plot(
    nonus_movies,
    1992,
    2010,
    2002,
    "Political cinema",
)



The impact is more pronounced for non-U.S. countries, with a sharp peak followed by a decrease. This could be due to the significant impact of the 2001 event, but the long-term effect on non-U.S. countries may be less important

# Plot summaries analysis on 9/11

### We will inspect the plot of movies to detect any change in lexical fields w.r.t 9/11.

We first merge the movies and plot summaries datasets.

In [None]:

# Join movies with their plot summaries on the shared Wikipedia ID
# (default inner join: only movies that have a summary are kept).
movies_and_plot = movies.merge(plot_summaries, on='Wikipedia_movie_ID')
movies_and_plot.head()

Here are the words we are interested in. It can be modified to have more key words.

In [None]:
# Key words (single words or phrases) searched for in the plot summaries.
key_words = [
    'plane',
    'tower',
    'twin tower',
    'terrorism',
    'terrorist',
    'hijack',
    'islam',
    'world trade center',
]

# One name per key word: "Count_of_" followed by the key word with spaces
# turned into underscores.
col_name_of_key_words = [f"Count_of_{kw.replace(' ', '_')}" for kw in key_words]

In [None]:
# Count how often each key word occurs in every movie's plot summary.
# A copy of the merged frame is passed, so `movies_and_plot` itself is not
# handed to the helper.
df_key_words_occ = count_key_words(
    movies_and_plots_df=movies_and_plot.copy(),
    key_words=key_words,
)
df_key_words_occ

In [None]:
# Plot the key-word occurrence counts computed above.
plot_key_words_occ(
    key_words_occ_df=df_key_words_occ,
    key_words=key_words,
)

Now we zoom in on our period of interest.

In [None]:
# Same key-word occurrence plot, zoomed in on the period of interest.
plot_key_words_occ_zoomed(
    key_words_occ_df=df_key_words_occ,
    key_words=key_words,
)

We can see that the words "terrorist" and "tower" were more frequent after 2002, but drop in 2010. We see that the words "terrorism", "islam", "world trade center" and "twin tower" are negligible. There is a peak for the word "hijack" in 2006, after which it drops a bit.


Now by percentage !


In [None]:
# Key-word usage expressed as a percentage before vs. after, derived from the
# occurrence counts.
df_key_words_occ_before_after = percentage_key_words_before_after(
    df_key_words_occ
)
df_key_words_occ_before_after


Interestingly, the word "plane" is used less after 9/11 than before, even though one would expect it to be used more.