# Alligators Don't Apologize - Exploratory Data Analysis

In [2]:
import pandas as pd
import re
import json

In [3]:
# Path to the data folders
raw_data_folder = '../data/RAW/'
clean_data_folder = '../data/CLEAN/'

## First Dataset: CMU Movies

In [4]:
# Load the movie metadata
CMU_movie_metadata = pd.read_csv(raw_data_folder+'CMU_Movies_Dataset/movie.metadata.tsv', delimiter='\t', header=None)
CMU_movie_metadata.columns = ['wikipedia_movie_ID', 'freebase_movie_ID', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 'movie_runtime', 'movie_languages', 'movie_countries', 'movie_genres']
CMU_movie_metadata.head(10)

# Load the character metadata
CMU_char_metadata = pd.read_csv(raw_data_folder+'CMU_Movies_Dataset/character.metadata.tsv', sep='\t', header=None)
CMU_char_metadata.columns = ['wikipedia_movie_ID', 'freebase_movie_ID', 'movie_release_date', 'character_name', 'actor_DOB', 'actor_gender', 'actor_height_meters', 'actor_ethnicity', 'actor_name', 'actor_age_at_movie_release', 'freebase_character_actor_map_ID', 'freebase_character_ID', 'freebase_actor_ID']

# Load the plot summaries
CMU_plot_summary = pd.read_csv(raw_data_folder+'CMU_Movies_Dataset/plot_summaries.txt', sep='\t', header=None)
CMU_plot_summary.columns = ['wikipedia_movie_ID', 'plot_summary']

#######

# Include the other parts of the dataset if needed

######


## Second Dataset: GVD Dataset

In [7]:
# Load the GVD dataset
GVD_data = pd.read_csv(raw_data_folder+'GVD_Dataset/2023_gvdDatabase_1_0_country.csv')

In [8]:
GVD_data.head(10)

Unnamed: 0,year,country_code,country_name,country_region,country_subregion,population,indicator,mechanism,count,rate,count_population,source_name,source_url,entry_comment
0,2004,ABW,Aruba,Americas,Caribbean,Female,Homicide intentional,All,0,0.0,51716,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2009 up...
1,2004,ABW,Aruba,Americas,Caribbean,Male,Homicide intentional,All,2,4.25,47028,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2009 up...
2,2004,ABW,Aruba,Americas,Caribbean,Total,Homicide intentional,All,2,2.03,98744,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2009 up...
3,2005,ABW,Aruba,Americas,Caribbean,Female,Homicide intentional,All,1,1.91,52454,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2010 up...
4,2005,ABW,Aruba,Americas,Caribbean,Male,Homicide intentional,All,5,10.51,47574,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2010 up...
5,2005,ABW,Aruba,Americas,Caribbean,Total,Homicide intentional,All,6,6.0,100028,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2010 up...
6,2006,ABW,Aruba,Americas,Caribbean,Female,Homicide intentional,All,1,1.89,52895,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2011 up...
7,2006,ABW,Aruba,Americas,Caribbean,Male,Homicide intentional,All,4,8.34,47935,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2011 up...
8,2006,ABW,Aruba,Americas,Caribbean,Total,Homicide intentional,All,5,4.96,100830,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2011 up...
9,2007,ABW,Aruba,Americas,Caribbean,Female,Homicide intentional,All,0,0.0,53086,UN Office On Drugs and Crime,https://dataunodc.un.org,Data collected and/or validated in the 2012 up...


## Third dataset: Kaggle Movies Dataset

In [9]:
# Load the movies metadata
Kaggle_movies_metadata = pd.read_csv(raw_data_folder+'Kaggle_Movies_Dataset/movies_metadata.csv')

# Load the credits
Kaggle_credits = pd.read_csv(raw_data_folder+'Kaggle_Movies_Dataset/credits.csv')

# Load the keywords data
Kaggle_keywords = pd.read_csv(raw_data_folder+'Kaggle_Movies_Dataset/keywords.csv')

# Load the ratings
Kaggle_ratings = pd.read_csv(raw_data_folder+'Kaggle_Movies_Dataset/ratings.csv')

# Load the links
Kaggle_links = pd.read_csv(raw_data_folder+'Kaggle_Movies_Dataset/links.csv')

  Kaggle_movies_metadata = pd.read_csv(raw_data_folder+'Kaggle_Movies_Dataset/movies_metadata.csv')


In [15]:
# Enable full display of the strings in the dataset
pd.set_option('display.max_colwidth', None)

First, we check the entirety of the columns:

In [10]:
Kaggle_movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

Check specifically all column values for one entry (movie: From Dusk Till Dawn) to inspect the datatypes of each column:

In [11]:
Kaggle_movies_metadata[Kaggle_movies_metadata['id']=='755']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
69,False,"{'id': 10924, 'name': 'From Dusk Till Dawn Col...",19000000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",http://www.miramax.com/movie/from-dusk-till-dawn/,755,tt0116367,en,From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,...,1996-01-19,25836616.0,108.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,One night is all that stands between them and ...,From Dusk Till Dawn,False,6.9,1644.0


Findings: Some of the columns are a stringified JSON object. This needs to be treated specifically for successful parsing.

Below is a way to first transform the JSON object "genre" into a string and then parsing for a specific word (in this case: Violence).

In [28]:
filtered_Kaggle_metadata = Kaggle_movies_metadata[Kaggle_movies_metadata['genres'].str.contains(r'\bCrime\b', na=False)]
filtered_Kaggle_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a top-notch crew on various insane heists throughout Los Angeles while a mentally unstable detective, Vincent Hanna pursues him without rest. Each man recognizes and respects the ability and the dedication of the other even though they are aware their cat-and-mouse game may end in violence.",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
15,False,,52000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]",,524,tt0112641,en,Casino,The life of the gambling paradise – Las Vegas – and its dark mafia underbelly.,...,1995-11-22,116112375.0,178.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,No one stays at the top forever.,Casino,False,7.8,1343.0
17,False,,4000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name': 'Comedy'}]",,5,tt0113101,en,Four Rooms,It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.,...,1995-12-09,4300000.0,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Twelve outrageous guests. Four scandalous requests. And one lone bellhop, in his first day on the job, who's in for the wildest New year's Eve of his life.",Four Rooms,False,6.5,539.0
18,False,"{'id': 3167, 'name': 'Ace Ventura Collection', 'poster_path': '/qCxH543pScFed1CycwJ1nVgrkOc.jpg', 'backdrop_path': '/bswWgdDsLu0fhWMYUzLF8XgiK4h.jpg'}",30000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name': 'Comedy'}, {'id': 12, 'name': 'Adventure'}]",,9273,tt0112281,en,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds himself on a perilous journey into the jungles of Africa to find Shikaka, the missing sacred animal of the friendly Wachati tribe. He must accomplish this before the wedding of the Wachati's Princess to the prince of the warrior Wachootoos. If Ace fails, the result will be a vicious tribal war.",...,1995-11-10,212385533.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,New animals. New adventures. Same hair.,Ace Ventura: When Nature Calls,False,6.1,1128.0
19,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}]",,11517,tt0113845,en,Money Train,"A vengeful New York transit cop decides to steal a trainload of subway fares; his foster brother, a fellow cop, tries to protect him.",...,1995-11-21,35431113.0,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Get on, or GET OUT THE WAY!",Money Train,False,5.4,224.0


We now define a list of violent words which we will use to parse through the keywords dataset in the Kaggle dataset:

In [34]:
# Violence keywords
violence_list = ['Murder','Kill','Assault','Abuse','Fight','War','Attack','Violence','Crime','Hostage','Revenge','Torture','Conflict','Terror']

We now use a pattern to parse through the "keywords" dataset and try to find all entries that contain at least one of the keywords in the violence_list.

In [35]:
# Define the pattern for the parsing
pattern = r'\b(?:' + '|'.join(violence_list) + r')\b'

# Filter the keywords dataframe for entries that contain any of the violence_dict
filtered_Kaggle_keywords = Kaggle_keywords[Kaggle_keywords['keywords'].str.contains(pattern, na=False, case=False)]

filtered_Kaggle_keywords.head()

Unnamed: 0,id,keywords
5,949,"[{'id': 642, 'name': 'robbery'}, {'id': 703, 'name': 'detective'}, {'id': 974, 'name': 'bank'}, {'id': 1523, 'name': 'obsession'}, {'id': 3713, 'name': 'chase'}, {'id': 7281, 'name': 'shooting'}, {'id': 9727, 'name': 'thief'}, {'id': 9812, 'name': 'honor'}, {'id': 9826, 'name': 'murder'}, {'id': 9937, 'name': 'suspense'}, {'id': 10051, 'name': 'heist'}, {'id': 10085, 'name': 'betrayal'}, {'id': 10594, 'name': 'money'}, {'id': 10726, 'name': 'gang'}, {'id': 15076, 'name': 'cat and mouse'}, {'id': 18023, 'name': 'criminal mastermind'}, {'id': 34117, 'name': 'cult film'}, {'id': 156121, 'name': 'ex-con'}, {'id': 159343, 'name': 'heist movie'}, {'id': 159434, 'name': 'one last job'}, {'id': 167104, 'name': 'loner'}, {'id': 192261, 'name': 'bank job'}, {'id': 207268, 'name': 'neo-noir'}, {'id': 208009, 'name': 'gun fight'}, {'id': 214983, 'name': 'crime epic'}]"
8,9091,"[{'id': 949, 'name': 'terrorist'}, {'id': 1562, 'name': 'hostage'}, {'id': 1653, 'name': 'explosive'}, {'id': 193533, 'name': 'vice president'}]"
12,21032,"[{'id': 1994, 'name': 'wolf'}, {'id': 6411, 'name': 'dog-sledding race'}, {'id': 9880, 'name': 'alaska'}, {'id': 15162, 'name': 'dog'}, {'id': 15169, 'name': 'goose'}, {'id': 158175, 'name': 'bear attack'}, {'id': 158370, 'name': 'dog sled'}, {'id': 158371, 'name': 'frozen lake'}]"
15,524,"[{'id': 383, 'name': 'poker'}, {'id': 726, 'name': 'drug abuse'}, {'id': 1228, 'name': '1970s'}, {'id': 2635, 'name': 'overdose'}, {'id': 33625, 'name': 'illegal prostitution'}]"
20,8012,"[{'id': 395, 'name': 'gambling'}, {'id': 416, 'name': 'miami'}, {'id': 818, 'name': 'based on novel'}, {'id': 8438, 'name': 'job'}, {'id': 9826, 'name': 'murder'}, {'id': 9935, 'name': 'travel'}, {'id': 10391, 'name': 'mafia'}, {'id': 10594, 'name': 'money'}, {'id': 11061, 'name': 'debt'}, {'id': 11578, 'name': 'mobster'}, {'id': 12094, 'name': 'business'}, {'id': 12396, 'name': 'hollywood'}, {'id': 13142, 'name': 'gangster'}, {'id': 14536, 'name': 'crime'}, {'id': 14819, 'name': 'violence'}, {'id': 14964, 'name': 'drug'}, {'id': 33879, 'name': 'producer'}, {'id': 159608, 'name': 'con'}]"


In [31]:
Kaggle_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
