INSTRUCTIONS

In order for the code to run, download the file in a new folder. 
Download the files from Kaggle: https://www.kaggle.com/rounakbanik/the-movies-dataset
Make sure the Kaggle files are in the same folder as this file for the csv's to import properly 
Save as a new version when any changes are made

In [2]:
#Importing pandas and display techniques

import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt

from ast import literal_eval
import ast
import nltk


In [3]:
movies_metadata = pd.read_csv("./data/movies_metadata.csv", low_memory=False)
keywords = pd.read_csv("./data/keywords.csv", low_memory=False)
cast = pd.read_csv ("./data/credits.csv", low_memory=False)

In [4]:
#Removing columns

movies_metadata = movies_metadata.drop(['adult', 'popularity','belongs_to_collection', 'imdb_id', 'runtime','homepage', 'production_companies', 'original_language','spoken_languages', 'original_title', 'overview', 'poster_path', 'production_countries', 'tagline', 'video' ], axis=1)
cast = cast.drop(['crew'], axis=1)

#Filtering out incomplete data for movies_metadata

movies_metadata = movies_metadata[movies_metadata['budget']!='0']
movies_metadata = movies_metadata[movies_metadata['vote_average']!=0]
movies_metadata = movies_metadata[movies_metadata['revenue']!=0]
movies_metadata = movies_metadata[movies_metadata['status']=='Released']

#Filtering out incomplete data for keywords

keywords = keywords[keywords['keywords']!='[]']

#Filtering out incomplete data for cast
cast = cast[cast['cast']!='[]']

#Re-establishing data types as int

movies_metadata['revenue'] = movies_metadata['revenue'].astype(int)
movies_metadata['id'] = movies_metadata['id'].astype(int)
keywords['id'] = keywords['id'].astype(int)

#Merging the 3 data frames into 1
movies_keywords = keywords.merge(movies_metadata, left_on='id', right_on='id')
movies_keywords_cast = cast.merge(movies_keywords, left_on='id', right_on='id')

# Creating new column for gender
movies_keywords_cast['gender'] = movies_keywords_cast['cast'].copy()

movies_keywords_cast = movies_keywords_cast[['title','id', 'revenue', 'budget', 'vote_average', 'vote_count', 'genres', 'cast','gender', 'keywords', 'release_date']]

# removing duplicates that appeared after merging the data frames 
movies_keywords_cast = movies_keywords_cast.drop_duplicates('id')

movies_keywords_cast.head()


Unnamed: 0,title,id,revenue,budget,vote_average,vote_count,genres,cast,gender,keywords,release_date
0,Toy Story,862,373554033,30000000,7.7,5415.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1995-10-30
1,Jumanji,8844,262797249,65000000,6.9,2413.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",1995-12-15
2,Waiting to Exhale,31357,81452156,16000000,6.1,34.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'id': 818, 'name': 'based on novel'}, {'id':...",1995-12-22
3,Heat,949,187436818,60000000,7.7,1886.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",1995-12-15
4,Sudden Death,9091,64350171,35000000,5.5,174.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'cast_id': 1, 'character': 'Darren Francis T...","[{'cast_id': 1, 'character': 'Darren Francis T...","[{'id': 949, 'name': 'terrorist'}, {'id': 1562...",1995-12-22


In [5]:
#cleaning column formatting

movies_keywords_cast['genres'] = movies_keywords_cast['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies_keywords_cast['keywords'] = movies_keywords_cast['keywords'].apply(literal_eval)
movies_keywords_cast['keywords'] = movies_keywords_cast['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies_keywords_cast['cast'] = movies_keywords_cast['cast'].apply(literal_eval)
movies_keywords_cast['cast'] = movies_keywords_cast['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

#Only showing the 3 main actors of a movie
movies_keywords_cast['cast'] = movies_keywords_cast['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

movies_keywords_cast

Unnamed: 0,title,id,revenue,budget,vote_average,vote_count,genres,cast,gender,keywords,release_date
0,Toy Story,862,373554033,30000000,7.7,5415.0,"[Animation, Comedy, Family]","[Tom Hanks, Tim Allen, Don Rickles]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[jealousy, toy, boy, friendship, friends, riva...",1995-10-30
1,Jumanji,8844,262797249,65000000,6.9,2413.0,"[Adventure, Fantasy, Family]","[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[board game, disappearance, based on children'...",1995-12-15
2,Waiting to Exhale,31357,81452156,16000000,6.1,34.0,"[Comedy, Drama, Romance]","[Whitney Houston, Angela Bassett, Loretta Devine]","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[based on novel, interracial relationship, sin...",1995-12-22
3,Heat,949,187436818,60000000,7.7,1886.0,"[Action, Crime, Drama, Thriller]","[Al Pacino, Robert De Niro, Val Kilmer]","[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[robbery, detective, bank, obsession, chase, s...",1995-12-15
4,Sudden Death,9091,64350171,35000000,5.5,174.0,"[Action, Adventure, Thriller]","[Jean-Claude Van Damme, Powers Boothe, Dorian ...","[{'cast_id': 1, 'character': 'Darren Francis T...","[terrorist, hostage, explosive, vice president]",1995-12-22
5,GoldenEye,710,352194034,58000000,6.6,1194.0,"[Adventure, Action, Thriller]","[Pierce Brosnan, Sean Bean, Izabella Scorupco]","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[cuba, falsely accused, secret identity, compu...",1995-11-16
6,The American President,9087,107879496,62000000,6.5,199.0,"[Comedy, Drama, Romance]","[Michael Douglas, Annette Bening, Michael J. Fox]","[{'cast_id': 1, 'character': 'Andrew Shepherd'...","[white house, usa president, new love, widower...",1995-11-17
7,Nixon,10858,13681765,44000000,7.1,72.0,"[History, Drama]","[Anthony Hopkins, Joan Allen, Powers Boothe]","[{'cast_id': 1, 'character': 'Richard Nixon', ...","[usa president, presidential election, waterga...",1995-12-22
8,Cutthroat Island,1408,10017322,98000000,5.7,137.0,"[Action, Adventure]","[Geena Davis, Matthew Modine, Frank Langella]","[{'cast_id': 1, 'character': 'Morgan Adams', '...","[exotic island, treasure, map, ship, scalp, pi...",1995-12-22
9,Casino,524,116112375,52000000,7.8,1343.0,"[Drama, Crime]","[Robert De Niro, Sharon Stone, Joe Pesci]","[{'cast_id': 4, 'character': ""Sam 'Ace' Rothst...","[poker, drug abuse, 1970s, overdose, illegal p...",1995-11-22


In [6]:
#Only showing gender as a numerical value 2=male 1=female 0=undefined
movies_keywords_cast['gender'] = movies_keywords_cast['gender'].apply(literal_eval)
movies_keywords_cast['gender'] = movies_keywords_cast['gender'].apply(lambda x: [i['gender'] for i in x] if isinstance(x, list) else [])
movies_keywords_cast['gender'] = movies_keywords_cast['gender'].apply(lambda x: x[:21] if len(x) >=21 else x)

movies_keywords_cast

Unnamed: 0,title,id,revenue,budget,vote_average,vote_count,genres,cast,gender,keywords,release_date
0,Toy Story,862,373554033,30000000,7.7,5415.0,"[Animation, Comedy, Family]","[Tom Hanks, Tim Allen, Don Rickles]","[2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2]","[jealousy, toy, boy, friendship, friends, riva...",1995-10-30
1,Jumanji,8844,262797249,65000000,6.9,2413.0,"[Adventure, Fantasy, Family]","[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, ...","[board game, disappearance, based on children'...",1995-12-15
2,Waiting to Exhale,31357,81452156,16000000,6.1,34.0,"[Comedy, Drama, Romance]","[Whitney Houston, Angela Bassett, Loretta Devine]","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2]","[based on novel, interracial relationship, sin...",1995-12-22
3,Heat,949,187436818,60000000,7.7,1886.0,"[Action, Crime, Drama, Thriller]","[Al Pacino, Robert De Niro, Val Kilmer]","[2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, ...","[robbery, detective, bank, obsession, chase, s...",1995-12-15
4,Sudden Death,9091,64350171,35000000,5.5,174.0,"[Action, Adventure, Thriller]","[Jean-Claude Van Damme, Powers Boothe, Dorian ...","[2, 2, 2, 2, 2, 1]","[terrorist, hostage, explosive, vice president]",1995-12-22
5,GoldenEye,710,352194034,58000000,6.6,1194.0,"[Adventure, Action, Thriller]","[Pierce Brosnan, Sean Bean, Izabella Scorupco]","[2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, ...","[cuba, falsely accused, secret identity, compu...",1995-11-16
6,The American President,9087,107879496,62000000,6.5,199.0,"[Comedy, Drama, Romance]","[Michael Douglas, Annette Bening, Michael J. Fox]","[2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, ...","[white house, usa president, new love, widower...",1995-11-17
7,Nixon,10858,13681765,44000000,7.1,72.0,"[History, Drama]","[Anthony Hopkins, Joan Allen, Powers Boothe]","[2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, ...","[usa president, presidential election, waterga...",1995-12-22
8,Cutthroat Island,1408,10017322,98000000,5.7,137.0,"[Action, Adventure]","[Geena Davis, Matthew Modine, Frank Langella]","[1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, ...","[exotic island, treasure, map, ship, scalp, pi...",1995-12-22
9,Casino,524,116112375,52000000,7.8,1343.0,"[Drama, Crime]","[Robert De Niro, Sharon Stone, Joe Pesci]","[2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, ...","[poker, drug abuse, 1970s, overdose, illegal p...",1995-11-22


In [12]:
number_of_males = []
number_of_females = []
number_of_unknown = []
number_of_cast = []
 
#Creating a for loop to go through each cell in the gender column, counting the number of males and females in the cast
for row in movies_keywords_cast['gender']:
    number_of_males.append(row.count(2))
    number_of_females.append(row.count(1))
    number_of_unknown.append(row.count(0))
    number_of_cast.append(len(row))
    
#creating new columns 
movies_keywords_cast['number_of_males'] = number_of_males
movies_keywords_cast['number_of_females'] = number_of_females
movies_keywords_cast['number_of_unknown'] = number_of_unknown
movies_keywords_cast['number_of_cast'] = number_of_cast

#calculating a weighted rating for each film to balance out the big differences in number of votes
vote_counts = movies_keywords_cast[movies_keywords_cast['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies_keywords_cast[movies_keywords_cast['vote_average'].notnull()]['vote_average'].astype('int')

C = vote_averages.mean()
m = vote_counts.quantile(0.75)

#formula for weighted rating 

def weighted_rating(x):
    v = x['vote_count']+1 
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

#adding the weighted_rating column
movies_keywords_cast['weighted_rating'] = movies_keywords_cast.apply(weighted_rating, axis=1)

movies_keywords_cast = movies_keywords_cast[['title','id', 'revenue', 'budget', 'vote_average', 'vote_count', 'weighted_rating', 'genres', 'cast','gender', 'keywords', 'release_date','number_of_males', 'number_of_females', 'number_of_unknown', 'number_of_cast' ]]

#exporting the data as csv file for the visualisation and analysis part
movies_keywords_cast.to_csv('movies_keywords_cast.csv')

In [13]:
#calculating the male to female ratio for each film, but only if there are no undefined actors 
pd.options.mode.chained_assignment = None
movies_no_unknown = movies_keywords_cast[movies_keywords_cast['number_of_unknown']==0]

#Creating new columns to store the percentage male cast / percentage female cast of each film
movies_no_unknown['percentage male'] = movies_no_unknown['number_of_males'] / movies_no_unknown['number_of_cast']

movies_no_unknown['percentage female'] = movies_no_unknown['number_of_females'] / movies_no_unknown['number_of_cast']

#Removing unnecessary columns from the DataFrame
movies_no_unknown = movies_no_unknown.drop(['id', 'vote_average', 'vote_count', 'genres', 'cast', 'number_of_unknown', 'number_of_males', 'number_of_females', 'number_of_cast'], axis=1)

movies_no_unknown = movies_no_unknown[['title', 'revenue', 'budget', 'weighted_rating', 'percentage male', 'percentage female', 'keywords', 'release_date']]

#Setting the index of the DataFrame to be 'title'
movies_no_unknown.set_index('title')

#exporting data for analysis on the impact of gender
movies_no_unknown.to_csv('movies_gender.csv')

In [None]:
#min 10 movies, 20-30 actors, average revenue of their films, 

In [15]:
#creating files with 1 or 2 genres for different analyses 

movies_one_genre = movies_keywords_cast

movies_two_genres = movies_keywords_cast

#removing all genres other than the first one in the one_genre DataFrame
movies_one_genre['genres'] = movies_one_genre['genres'].apply(lambda x: x[:1] if len(x) >=1 else x)

#removing all genres other than the first 2 in the two_genres DataFrame
movies_two_genres['genres'] = movies_two_genres['genres'].apply(lambda x: x[:2] if len(x) >=2 else x)

#Dropping unnecessary columns
movies_one_genre = movies_one_genre.drop(['id', 'vote_average', 'vote_count', 'cast', 'gender', 'number_of_males', 'number_of_females', 'number_of_cast', 'number_of_unknown'], axis=1)

movies_two_genres = movies_two_genres.drop(['id', 'vote_average', 'vote_count', 'cast', 'gender', 'number_of_males', 'number_of_females', 'number_of_cast', 'number_of_unknown'], axis=1)

movies_one_genre.to_csv('movies_one_genre.csv')

movies_two_genres.to_csv('movies_two_genres.csv')

movies_one_genre

Unnamed: 0,title,revenue,budget,weighted_rating,genres,keywords,release_date
0,Toy Story,373554033,30000000,7.447008,[Animation],"[jealousy, toy, boy, friendship, friends, riva...",1995-10-30
1,Jumanji,262797249,65000000,6.626929,[Adventure],"[board game, disappearance, based on children'...",1995-12-15
2,Waiting to Exhale,81452156,16000000,5.875670,[Comedy],"[based on novel, interracial relationship, sin...",1995-12-22
3,Heat,187436818,60000000,7.122823,[Action],"[robbery, detective, bank, obsession, chase, s...",1995-12-15
4,Sudden Death,64350171,35000000,5.805042,[Action],"[terrorist, hostage, explosive, vice president]",1995-12-22
5,GoldenEye,352194034,58000000,6.291636,[Adventure],"[cuba, falsely accused, secret identity, compu...",1995-11-16
6,The American President,107879496,62000000,5.985336,[Comedy],"[white house, usa president, new love, widower...",1995-11-17
7,Nixon,13681765,44000000,5.962398,[History],"[usa president, presidential election, waterga...",1995-12-22
8,Cutthroat Island,10017322,98000000,5.843735,[Action],"[exotic island, treasure, map, ship, scalp, pi...",1995-12-22
9,Casino,116112375,52000000,7.041861,[Drama],"[poker, drug abuse, 1970s, overdose, illegal p...",1995-11-22


In [16]:
#Cleaning unnecessary punctuation from the genres column

movies_one_genre['genres'] = movies_one_genre['genres'].astype(str)

movies_one_genre['genres'] = movies_one_genre['genres'].str.replace(r"[\']", '')

movies_one_genre['genres'] = movies_one_genre['genres'].str.replace(r"[\[\]]", '')

#Creating new DataFrames containing all movies of each genre
action = movies_one_genre[movies_one_genre['genres'].str.contains("Action")]

adventure = movies_one_genre[movies_one_genre['genres'].str.contains("Adventure")]

animation = movies_one_genre[movies_one_genre['genres'].str.contains("Animation")]

comedy = movies_one_genre[movies_one_genre['genres'].str.contains("Comedy")]

crime = movies_one_genre[movies_one_genre['genres'].str.contains("Crime")]

documentary = movies_one_genre[movies_one_genre['genres'].str.contains("Documentary")]

drama = movies_one_genre[movies_one_genre['genres'].str.contains("Drama")]

family = movies_one_genre[movies_one_genre['genres'].str.contains("Family")]

fantasy = movies_one_genre[movies_one_genre['genres'].str.contains("Fantasy")]

history = movies_one_genre[movies_one_genre['genres'].str.contains("History")]

horror = movies_one_genre[movies_one_genre['genres'].str.contains("Horror")]

music = movies_one_genre[movies_one_genre['genres'].str.contains("Music")]

mystery = movies_one_genre[movies_one_genre['genres'].str.contains("Mystery")]

romance = movies_one_genre[movies_one_genre['genres'].str.contains("Romance")]

science_fiction = movies_one_genre[movies_one_genre['genres'].str.contains("Science Fiction")]

thriller = movies_one_genre[movies_one_genre['genres'].str.contains("Thriller")]

war = movies_one_genre[movies_one_genre['genres'].str.contains("War")]

western = movies_one_genre[movies_one_genre['genres'].str.contains("Western")]


In [17]:
#Creating a new DataFrame to store the revenue and amount of films of reach genre

genres = [action, adventure, animation, comedy, crime, documentary, drama, family, fantasy, history, horror, music, mystery, romance, science_fiction, thriller, war, western] 

genre_names = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery', 'romance', 'science fiction', 'thriller', 'war', 'western']

#Creating new variables to store the average_revenue and amount of films
average_revenue = []

popularity = []

#A for loop going through each DataFrame and calculating the average revenue for each genre
#Sums up the numbers in the revenue column and divides it by the amount of films in the genre
for x in genres:
    average_revenue.append([x['revenue'].sum() / len(x.index)])

#A for loop storing the amount of films in each genre
for x in genres:
    popularity.append(len(x.index))

#Creating a new DataFrame with the columns genre, average_revenue and popularity
genres_revenue = pd.DataFrame(columns=['genre', 'average_revenue', 'popularity'])

#Initialising the genres with the data stored in the average_revenue and popularity variables
genres_revenue['genre'] = genre_names
genres_revenue['average_revenue'] = average_revenue
genres_revenue['popularity'] = popularity

#Removing the brackets from the values in the average_revenue column
genres_revenue['average_revenue'] = genres_revenue['average_revenue'].str.get(0)

genres_revenue.to_csv('genres_revenue.csv')

genres_revenue

Unnamed: 0,genre,average_revenue,popularity
0,action,129322800.0,909
1,adventure,204070000.0,401
2,animation,252417100.0,139
3,comedy,66725170.0,980
4,crime,49435210.0,263
5,documentary,17848390.0,40
6,drama,55398860.0,1206
7,family,251790600.0,52
8,fantasy,136394200.0,134
9,history,72631360.0,27


In [21]:
#exporting csv of movies with their budget, revenue and weighted rating 
movies_revenue_ratings = movies_keywords_cast.drop(['id', 'vote_average', 'vote_count', 'genres', 'cast', 'keywords', 'gender', 'release_date', 'number_of_males', 'number_of_females', 'number_of_unknown', 'number_of_cast'], axis=1)
movies_revenue_ratings.to_csv('movies_revenue_ratings.csv')

In [22]:
#exporting same data frame as above but with cast added 
actors = movies_keywords_cast.drop(['id', 'vote_average', 'vote_count', 'genres', 'gender', 'keywords', 'release_date'], axis=1)
actors = actors.drop(['number_of_males', 'number_of_females', 'number_of_unknown', 'number_of_cast'], axis=1)
actors.to_csv("actors.csv")

In [23]:
#adding a column in the movies_one_genre dataframe where each genre is assigned a number to make analysis possible
movies_one_genre.head()

Unnamed: 0,title,revenue,budget,weighted_rating,genres,keywords,release_date
0,Toy Story,373554033,30000000,7.447008,Animation,"[jealousy, toy, boy, friendship, friends, riva...",1995-10-30
1,Jumanji,262797249,65000000,6.626929,Adventure,"[board game, disappearance, based on children'...",1995-12-15
2,Waiting to Exhale,81452156,16000000,5.87567,Comedy,"[based on novel, interracial relationship, sin...",1995-12-22
3,Heat,187436818,60000000,7.122823,Action,"[robbery, detective, bank, obsession, chase, s...",1995-12-15
4,Sudden Death,64350171,35000000,5.805042,Action,"[terrorist, hostage, explosive, vice president]",1995-12-22


In [25]:
#adding the new column
movies_one_genre['numerical_value_genre'] = movies_one_genre['genres'].copy()
movies_one_genre = movies_one_genre[['title', 'revenue', 'budget', 'weighted_rating', 'genres', 'keywords', 'release_date', 'numerical_value_genre']]
movies_one_genre.head()

Unnamed: 0,title,revenue,budget,weighted_rating,genres,keywords,release_date,numerical_value_genre
0,Toy Story,373554033,30000000,7.447008,Animation,"[jealousy, toy, boy, friendship, friends, riva...",1995-10-30,Animation
1,Jumanji,262797249,65000000,6.626929,Adventure,"[board game, disappearance, based on children'...",1995-12-15,Adventure
2,Waiting to Exhale,81452156,16000000,5.87567,Comedy,"[based on novel, interracial relationship, sin...",1995-12-22,Comedy
3,Heat,187436818,60000000,7.122823,Action,"[robbery, detective, bank, obsession, chase, s...",1995-12-15,Action
4,Sudden Death,64350171,35000000,5.805042,Action,"[terrorist, hostage, explosive, vice president]",1995-12-22,Action


In [28]:
movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].astype(str)


for x in movies_one_genre['numerical_value_genre']:
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Action','1')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Adventure','2')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Animation','3')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Comedy','4')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Crime','5')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Documentary','6')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Drama','7')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Family','8')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Fantasy','9')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('History','10')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Horror','11')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Music','12')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Mystery','13')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Romance','14')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Science Fiction','15')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Thriller','16')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('War','17')
    movies_one_genre['numerical_value_genre'] = movies_one_genre['numerical_value_genre'].str.replace('Western','18')

       
movies_one_genre.head()   

KeyboardInterrupt: 

In [None]:
genre_as_number = movies_one_genre
genre_as_number = genre_as_number[['revenue', 'genres', 'numerical_value_genre']]
genre_as_number.head()

In [None]:
genre_as_number.to_csv('genre_as_number.csv')