In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [8]:
films = pd.read_csv('data/IMDb_Data_final.csv')

# data is from: https://www.kaggle.com/datasets/digvijaysinhgohil/imdb-dataset-toprated-films-18982022/data

In [9]:
films.head()

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130min,UA,2022
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139min,R,2022
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176min,UA,2022
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127min,UA,1993
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175min,A,1972


In [10]:
films.dtypes

Title                   object
Director                object
Stars                   object
IMDb-Rating            float64
Category                object
Duration                object
Censor-board-rating     object
ReleaseYear              int64
dtype: object

In [11]:
films.isna().sum()

Title                    0
Director                 0
Stars                    0
IMDb-Rating              0
Category                 1
Duration                 1
Censor-board-rating    154
ReleaseYear              0
dtype: int64

In [12]:
films.columns

Index(['Title', 'Director', 'Stars', 'IMDb-Rating', 'Category', 'Duration',
       'Censor-board-rating', 'ReleaseYear'],
      dtype='object')

In [13]:
films = films[['Title', 'Director', 'Stars', 'IMDb-Rating', 'Category', 'Duration', 'ReleaseYear']]

In [14]:
films[films['Duration'].isnull()]
# I am going to fill in the NaN value for the filme Ayla: The Daughter of War. Duration is 124 min


Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,ReleaseYear
785,Ayla: The Daughter of War,"CanUlkay,",", ÇetinTekindor, IsmailHacioglu, Kyung-jinLee,...",8.3,"Biography,Drama,History",,2017


In [15]:
films.loc[785,'Duration'] = "124min"
films.loc[[785]]

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,ReleaseYear
785,Ayla: The Daughter of War,"CanUlkay,",", ÇetinTekindor, IsmailHacioglu, Kyung-jinLee,...",8.3,"Biography,Drama,History",124min,2017


In [16]:
films[films['Category'].isna()]
# there is an issue here as the Genre categories are actually in the Duration column

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,ReleaseYear
639,Chun gwong cha sit,Kar-WaiWong,"LeslieCheung, TonyChiu-WaiLeung, ChangChen, Gr...",7.7,,"Drama,Romance",1997


In [17]:
films.loc[639, 'Category'] = 'Drama,Romance'
films.loc[639, 'Duration'] = '96min'
films.loc[[639]]

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,ReleaseYear
639,Chun gwong cha sit,Kar-WaiWong,"LeslieCheung, TonyChiu-WaiLeung, ChangChen, Gr...",7.7,"Drama,Romance",96min,1997


In [18]:
films[films['Duration'].str.contains("min")]
# there are 1000 rows with the unit 'min' in the Duration column. So as all the units are the same, I will strip the units off and cast the values from string into integers.

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,ReleaseYear
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130min,2022
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139min,2022
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176min,2022
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127min,1993
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175min,1972
...,...,...,...,...,...,...,...
995,Vizontele,"YilmazErdogan,",", YilmazErdogan, DemetAkbag, AltanErkekli, Cem...",8.0,"Comedy,Drama",110min,2001
996,Sarfarosh,JohnMathewMatthan,"AamirKhan, NaseeruddinShah, SonaliBendre, Muke...",8.1,"Action,Drama,Thriller",174min,1999
997,Udaan,VikramadityaMotwane,"RajatBarmecha, RonitRoy, ManjotSingh, RamKapoor",8.1,Drama,134min,2010
998,English Vinglish,GauriShinde,"Sridevi, AdilHussain, MehdiNebbou, PriyaAnand",7.8,"Comedy,Drama,Family",134min,2012


In [19]:
films['Duration'] = films['Duration'].str.replace('min', "", regex=False).astype(int)
films.head()

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,ReleaseYear
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130,2022
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139,2022
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176,2022
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127,1993
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175,1972


In [20]:
films = films.rename(columns = {
    'IMDb-Rating':'IMDb_rating',
    'Category':'Genre',
    'Duration':'Duration_min',
    'ReleaseYear':'Release_year'
})
films.head(2)

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130,2022
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139,2022


In [21]:
films.isna().sum()

Title           0
Director        0
Stars           0
IMDb_rating     0
Genre           0
Duration_min    0
Release_year    0
dtype: int64

In [22]:
films2 = films.copy()

Next, I will split the 'Stars' column and keep the first 2 actors to understand if films with particular actors lead to higher IMDb ratings.
The Stars column contains a list of names separated by commas. However there are some inconsitent formats such as leading commas and leading spaces.

In [23]:
cleaned = (
    films2['Stars']
    .str.strip()                 # remove surrounding spaces
    .str.lstrip(', ')            # remove leading commas & spaces
    .str.rstrip(', '))            # remove trailing commas & spaces
cleaned

0      TomCruise, JenniferConnelly, MilesTeller, ValK...
1       MichelleYeoh, StephanieHsu, KeHuyQuan, JamesHong
2      RobertPattinson, ZoëKravitz, JeffreyWright, Co...
3      SamNeill, LauraDern, JeffGoldblum, RichardAtte...
4         MarlonBrando, AlPacino, JamesCaan, DianeKeaton
                             ...                        
995    YilmazErdogan, DemetAkbag, AltanErkekli, CemYi...
996    AamirKhan, NaseeruddinShah, SonaliBendre, Muke...
997      RajatBarmecha, RonitRoy, ManjotSingh, RamKapoor
998        Sridevi, AdilHussain, MehdiNebbou, PriyaAnand
999    RajeshKhanna, AmitabhBachchan, SumitaSanyal, R...
Name: Stars, Length: 1000, dtype: object

In [24]:
films2[['Actor1', 'Actor2', 'Actor3', 'Actor4']] = cleaned.str.split(r'\s*,\s*', expand = True, n=3)
films2.head(5)

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130,2022,TomCruise,JenniferConnelly,MilesTeller,ValKilmer
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139,2022,MichelleYeoh,StephanieHsu,KeHuyQuan,JamesHong
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176,2022,RobertPattinson,ZoëKravitz,JeffreyWright,ColinFarrell
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127,1993,SamNeill,LauraDern,JeffGoldblum,RichardAttenborough
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175,1972,MarlonBrando,AlPacino,JamesCaan,DianeKeaton


In [25]:
films2.isna().sum()

Title           0
Director        0
Stars           0
IMDb_rating     0
Genre           0
Duration_min    0
Release_year    0
Actor1          0
Actor2          0
Actor3          0
Actor4          0
dtype: int64

In [26]:
check = films2[films2['Actor1'].str.contains('"')].copy()
check

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,"""PeterOToole","""",AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,"""PaigeOHara","""",RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,"""OSheaJacksonJr.","""",CoreyHawkins,"JasonMitchell, NeilBrownJr."
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,"""RyanONeal","""",MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,"""RyanONeal","""","""TatumONeal",""", MadelineKahn, JohnHillerman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,"""BrianOHalloran","""",JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
770,The Lion in Winter,AnthonyHarvey,"""PeterOToole,"", KatharineHepburn, AnthonyHopki...",7.9,"Biography,Drama,History",134,1968,"""PeterOToole","""",KatharineHepburn,"AnthonyHopkins, JohnCastle"
836,Underground,EmirKusturica,"""PredragMikiManojlovic,"", LazarRistovski, Mirj...",8.1,"Comedy,Drama,Fantasy",170,1995,"""PredragMikiManojlovic","""",LazarRistovski,"MirjanaJokovic, SlavkoStimac"
872,Sunrise: A Song of Two Humans,F.W.Murnau,"""GeorgeOBrien,"", JanetGaynor, MargaretLivingst...",8.1,"Drama,Romance",94,1927,"""GeorgeOBrien","""",JanetGaynor,"MargaretLivingston, BodilRosing"


In [27]:
films2['Actor1'] = films2['Actor1'].str.lstrip('"')           
films2.loc[[344,354,358]]
        

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,"""",AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,"""",RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,"""",CoreyHawkins,"JasonMitchell, NeilBrownJr."


-
-
-




In [28]:
check2 = films2[films2['Actor2'].str.contains('"')].copy()
check2

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,"""",AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,"""",RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,"""",CoreyHawkins,"JasonMitchell, NeilBrownJr."
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,"""ChrisODonnell","""","JamesRebhorn, GabrielleAnwar"
456,Singin in the Rain,"StanleyDonen,",", GeneKelly, ""DonaldOConnor,"", DebbieReynolds,...",8.3,"Comedy,Musical,Romance",103,1952,GeneKelly,"""DonaldOConnor","""","DebbieReynolds, JeanHagen"
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,RyanONeal,"""",MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,RyanONeal,"""","""TatumONeal",""", MadelineKahn, JohnHillerman"
562,Night of the Living Dead,GeorgeA.Romero,"DuaneJones, ""JudithODea,"", KarlHardman, Marily...",7.8,"Horror,Thriller",96,1968,DuaneJones,"""JudithODea","""","KarlHardman, MarilynEastman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,BrianOHalloran,"""",JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
615,The Quiet Man,JohnFord,"JohnWayne, ""MaureenOHara,"", BarryFitzgerald, W...",7.7,"Comedy,Drama,Romance",129,1952,JohnWayne,"""MaureenOHara","""","BarryFitzgerald, WardBond"


In [29]:
check2['Actor2'] = check2['Actor2'].str.lstrip('"')           
check2
        

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,,AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,,RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,,CoreyHawkins,"JasonMitchell, NeilBrownJr."
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,ChrisODonnell,"""","JamesRebhorn, GabrielleAnwar"
456,Singin in the Rain,"StanleyDonen,",", GeneKelly, ""DonaldOConnor,"", DebbieReynolds,...",8.3,"Comedy,Musical,Romance",103,1952,GeneKelly,DonaldOConnor,"""","DebbieReynolds, JeanHagen"
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,RyanONeal,,MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,RyanONeal,,"""TatumONeal",""", MadelineKahn, JohnHillerman"
562,Night of the Living Dead,GeorgeA.Romero,"DuaneJones, ""JudithODea,"", KarlHardman, Marily...",7.8,"Horror,Thriller",96,1968,DuaneJones,JudithODea,"""","KarlHardman, MarilynEastman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,BrianOHalloran,,JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
615,The Quiet Man,JohnFord,"JohnWayne, ""MaureenOHara,"", BarryFitzgerald, W...",7.7,"Comedy,Drama,Romance",129,1952,JohnWayne,MaureenOHara,"""","BarryFitzgerald, WardBond"


In [30]:
check2[check2['Actor2'] == ""].count()

Title           9
Director        9
Stars           9
IMDb_rating     9
Genre           9
Duration_min    9
Release_year    9
Actor1          9
Actor2          9
Actor3          9
Actor4          9
dtype: int64

In [31]:
check2['Actor2'] = check2['Actor2'].replace(r'^\s*$', np.nan, regex=True)
check2


#check2['Actor2'] = check2['Actor2'].str.replace("", check2['Actor3'], regex=False)
#check2

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,,AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,,RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,,CoreyHawkins,"JasonMitchell, NeilBrownJr."
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,ChrisODonnell,"""","JamesRebhorn, GabrielleAnwar"
456,Singin in the Rain,"StanleyDonen,",", GeneKelly, ""DonaldOConnor,"", DebbieReynolds,...",8.3,"Comedy,Musical,Romance",103,1952,GeneKelly,DonaldOConnor,"""","DebbieReynolds, JeanHagen"
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,RyanONeal,,MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,RyanONeal,,"""TatumONeal",""", MadelineKahn, JohnHillerman"
562,Night of the Living Dead,GeorgeA.Romero,"DuaneJones, ""JudithODea,"", KarlHardman, Marily...",7.8,"Horror,Thriller",96,1968,DuaneJones,JudithODea,"""","KarlHardman, MarilynEastman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,BrianOHalloran,,JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
615,The Quiet Man,JohnFord,"JohnWayne, ""MaureenOHara,"", BarryFitzgerald, W...",7.7,"Comedy,Drama,Romance",129,1952,JohnWayne,MaureenOHara,"""","BarryFitzgerald, WardBond"


In [32]:
check2['Actor2'] = check2['Actor2'].fillna(check2['Actor3'])
check2

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,AlecGuinness,AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,RobbyBenson,RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,CoreyHawkins,CoreyHawkins,"JasonMitchell, NeilBrownJr."
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,ChrisODonnell,"""","JamesRebhorn, GabrielleAnwar"
456,Singin in the Rain,"StanleyDonen,",", GeneKelly, ""DonaldOConnor,"", DebbieReynolds,...",8.3,"Comedy,Musical,Romance",103,1952,GeneKelly,DonaldOConnor,"""","DebbieReynolds, JeanHagen"
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,RyanONeal,MarisaBerenson,MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,RyanONeal,"""TatumONeal","""TatumONeal",""", MadelineKahn, JohnHillerman"
562,Night of the Living Dead,GeorgeA.Romero,"DuaneJones, ""JudithODea,"", KarlHardman, Marily...",7.8,"Horror,Thriller",96,1968,DuaneJones,JudithODea,"""","KarlHardman, MarilynEastman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,BrianOHalloran,JeffAnderson,JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
615,The Quiet Man,JohnFord,"JohnWayne, ""MaureenOHara,"", BarryFitzgerald, W...",7.7,"Comedy,Drama,Romance",129,1952,JohnWayne,MaureenOHara,"""","BarryFitzgerald, WardBond"


In [33]:
check2['Actor2'] = check2['Actor2'].str.lstrip('"')
check2

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,AlecGuinness,AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,RobbyBenson,RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,CoreyHawkins,CoreyHawkins,"JasonMitchell, NeilBrownJr."
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,ChrisODonnell,"""","JamesRebhorn, GabrielleAnwar"
456,Singin in the Rain,"StanleyDonen,",", GeneKelly, ""DonaldOConnor,"", DebbieReynolds,...",8.3,"Comedy,Musical,Romance",103,1952,GeneKelly,DonaldOConnor,"""","DebbieReynolds, JeanHagen"
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,RyanONeal,MarisaBerenson,MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,RyanONeal,TatumONeal,"""TatumONeal",""", MadelineKahn, JohnHillerman"
562,Night of the Living Dead,GeorgeA.Romero,"DuaneJones, ""JudithODea,"", KarlHardman, Marily...",7.8,"Horror,Thriller",96,1968,DuaneJones,JudithODea,"""","KarlHardman, MarilynEastman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,BrianOHalloran,JeffAnderson,JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
615,The Quiet Man,JohnFord,"JohnWayne, ""MaureenOHara,"", BarryFitzgerald, W...",7.7,"Comedy,Drama,Romance",129,1952,JohnWayne,MaureenOHara,"""","BarryFitzgerald, WardBond"


In [34]:
films2[films2['Actor2'].str.contains('"')]

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,"""",AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,"""",RobbyBenson,"JesseCorti, RexEverhart"
358,Straight Outta Compton,F.GaryGray,"""OSheaJacksonJr.,"", CoreyHawkins, JasonMitchel...",7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,"""",CoreyHawkins,"JasonMitchell, NeilBrownJr."
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,"""ChrisODonnell","""","JamesRebhorn, GabrielleAnwar"
456,Singin in the Rain,"StanleyDonen,",", GeneKelly, ""DonaldOConnor,"", DebbieReynolds,...",8.3,"Comedy,Musical,Romance",103,1952,GeneKelly,"""DonaldOConnor","""","DebbieReynolds, JeanHagen"
466,Barry Lyndon,StanleyKubrick,"""RyanONeal,"", MarisaBerenson, PatrickMagee, Ha...",8.1,"Adventure,Drama,War",185,1975,RyanONeal,"""",MarisaBerenson,"PatrickMagee, HardyKrüger"
527,Paper Moon,PeterBogdanovich,"""RyanONeal,"", ""TatumONeal,"", MadelineKahn, Joh...",8.1,"Comedy,Crime,Drama",102,1973,RyanONeal,"""","""TatumONeal",""", MadelineKahn, JohnHillerman"
562,Night of the Living Dead,GeorgeA.Romero,"DuaneJones, ""JudithODea,"", KarlHardman, Marily...",7.8,"Horror,Thriller",96,1968,DuaneJones,"""JudithODea","""","KarlHardman, MarilynEastman"
593,Clerks,KevinSmith,"""BrianOHalloran,"", JeffAnderson, MarilynGhigli...",7.7,Comedy,92,1994,BrianOHalloran,"""",JeffAnderson,"MarilynGhigliotti, LisaSpoonauer"
615,The Quiet Man,JohnFord,"JohnWayne, ""MaureenOHara,"", BarryFitzgerald, W...",7.7,"Comedy,Drama,Romance",129,1952,JohnWayne,"""MaureenOHara","""","BarryFitzgerald, WardBond"


In [35]:
films2['Actor2'] = films2['Actor2'].str.lstrip('"').replace(r'^\s*$', np.nan, regex=True).fillna(films2['Actor3'])
films2.loc[[344,354,436]]

Unnamed: 0,Title,Director,Stars,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2,Actor3,Actor4
344,Lawrence of Arabia,DavidLean,"""PeterOToole,"", AlecGuinness, AnthonyQuinn, Ja...",8.3,"Adventure,Biography,Drama",218,1962,PeterOToole,AlecGuinness,AlecGuinness,"AnthonyQuinn, JackHawkins"
354,Beauty and the Beast,"GaryTrousdale,",", ""PaigeOHara,"", RobbyBenson, JesseCorti, RexE...",8.0,"Animation,Family,Fantasy",84,1991,PaigeOHara,RobbyBenson,RobbyBenson,"JesseCorti, RexEverhart"
436,Scent of a Woman,MartinBrest,"AlPacino, ""ChrisODonnell,"", JamesRebhorn, Gabr...",8.0,Drama,156,1992,AlPacino,ChrisODonnell,"""","JamesRebhorn, GabrielleAnwar"


In [36]:
films2.columns

Index(['Title', 'Director', 'Stars', 'IMDb_rating', 'Genre', 'Duration_min',
       'Release_year', 'Actor1', 'Actor2', 'Actor3', 'Actor4'],
      dtype='object')

In [37]:
films_clean = films2[['Title', 'Director', 'IMDb_rating', 'Genre', 'Duration_min', 'Release_year', 'Actor1', 'Actor2']].copy()

In [38]:
films_clean.sample(10)

Unnamed: 0,Title,Director,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2
400,Falling Down,JoelSchumacher,7.6,"Action,Crime,Drama",113,1993,MichaelDouglas,RobertDuvall
263,Sin City,"FrankMiller,",8.0,"Crime,Thriller",124,2005,RobertRodriguez,MickeyRourke
346,Dark Waters,ToddHaynes,7.6,"Biography,Drama,History",126,2019,MarkRuffalo,AnneHathaway
358,Straight Outta Compton,F.GaryGray,7.8,"Biography,Drama,History",147,2015,OSheaJacksonJr.,CoreyHawkins
231,The Game,DavidFincher,7.7,"Drama,Mystery,Thriller",129,1997,MichaelDouglas,DeborahKaraUnger
465,Midnight Cowboy,JohnSchlesinger,7.8,Drama,113,1969,DustinHoffman,JonVoight
755,Control,AntonCorbijn,7.7,"Biography,Drama,Music",122,2007,SamRiley,SamanthaMorton
920,Sennen joyû,SatoshiKon,7.8,"Animation,Drama,Fantasy",87,2001,MiyokoShôji,ShôzôÎzuka
501,Per un pugno di dollari,SergioLeone,7.9,"Action,Drama,Western",99,1964,ClintEastwood,GianMariaVolontè
156,La vie dAdèle,AbdellatifKechiche,7.7,"Drama,Romance",180,2013,LéaSeydoux,AdèleExarchopoulos


In [39]:
films_genre = films_clean.copy()
films_genre

Unnamed: 0,Title,Director,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2
0,Top Gun: Maverick,JosephKosinski,8.6,"Action,Drama",130,2022,TomCruise,JenniferConnelly
1,Everything Everywhere All at Once,"DanKwan,",8.3,"Action,Adventure,Comedy",139,2022,MichelleYeoh,StephanieHsu
2,The Batman,MattReeves,7.9,"Action,Crime,Drama",176,2022,RobertPattinson,ZoëKravitz
3,Jurassic Park,StevenSpielberg,8.2,"Action,Adventure,Sci-Fi",127,1993,SamNeill,LauraDern
4,The Godfather,FrancisFordCoppola,9.2,"Crime,Drama",175,1972,MarlonBrando,AlPacino
...,...,...,...,...,...,...,...,...
995,Vizontele,"YilmazErdogan,",8.0,"Comedy,Drama",110,2001,YilmazErdogan,DemetAkbag
996,Sarfarosh,JohnMathewMatthan,8.1,"Action,Drama,Thriller",174,1999,AamirKhan,NaseeruddinShah
997,Udaan,VikramadityaMotwane,8.1,Drama,134,2010,RajatBarmecha,RonitRoy
998,English Vinglish,GauriShinde,7.8,"Comedy,Drama,Family",134,2012,Sridevi,AdilHussain


In [40]:
films_genre['Genre'].value_counts().sample(20)

Genre
Adventure,Horror,Sci-Fi      1
Comedy,Crime,Drama          12
Drama,Sci-Fi                 5
Mystery,Sci-Fi,Thriller      1
Adventure,Western            1
Drama,History,Thriller       5
Action,Comedy,Horror         1
Drama,Fantasy,War            1
Drama,Mystery,Romance        6
Drama,Family,Sport           2
Action,Crime,Comedy          1
Action,Crime,Sci-Fi          1
Action,Drama,War             3
Action,Adventure,Fantasy     8
Drama,Film-Noir,Romance      1
Action,Drama,History         6
Adventure,Drama,Fantasy      4
Crime,Thriller               4
Comedy,Music,Musical         1
Comedy,Crime,Thriller        1
Name: count, dtype: int64

In [41]:
films_genre["Genre"] = films_genre["Genre"].map(str)
films_genre["Genre"] = films_genre["Genre"].str.split(",")
#films_genre = films_genre.explode("Genre")

In [42]:
films_genre

Unnamed: 0,Title,Director,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2
0,Top Gun: Maverick,JosephKosinski,8.6,"[Action, Drama]",130,2022,TomCruise,JenniferConnelly
1,Everything Everywhere All at Once,"DanKwan,",8.3,"[Action, Adventure, Comedy]",139,2022,MichelleYeoh,StephanieHsu
2,The Batman,MattReeves,7.9,"[Action, Crime, Drama]",176,2022,RobertPattinson,ZoëKravitz
3,Jurassic Park,StevenSpielberg,8.2,"[Action, Adventure, Sci-Fi]",127,1993,SamNeill,LauraDern
4,The Godfather,FrancisFordCoppola,9.2,"[Crime, Drama]",175,1972,MarlonBrando,AlPacino
...,...,...,...,...,...,...,...,...
995,Vizontele,"YilmazErdogan,",8.0,"[Comedy, Drama]",110,2001,YilmazErdogan,DemetAkbag
996,Sarfarosh,JohnMathewMatthan,8.1,"[Action, Drama, Thriller]",174,1999,AamirKhan,NaseeruddinShah
997,Udaan,VikramadityaMotwane,8.1,[Drama],134,2010,RajatBarmecha,RonitRoy
998,English Vinglish,GauriShinde,7.8,"[Comedy, Drama, Family]",134,2012,Sridevi,AdilHussain


In [43]:
films_genre_final = films_genre.explode("Genre")

In [44]:
films_genre_final.head(10)

Unnamed: 0,Title,Director,IMDb_rating,Genre,Duration_min,Release_year,Actor1,Actor2
0,Top Gun: Maverick,JosephKosinski,8.6,Action,130,2022,TomCruise,JenniferConnelly
0,Top Gun: Maverick,JosephKosinski,8.6,Drama,130,2022,TomCruise,JenniferConnelly
1,Everything Everywhere All at Once,"DanKwan,",8.3,Action,139,2022,MichelleYeoh,StephanieHsu
1,Everything Everywhere All at Once,"DanKwan,",8.3,Adventure,139,2022,MichelleYeoh,StephanieHsu
1,Everything Everywhere All at Once,"DanKwan,",8.3,Comedy,139,2022,MichelleYeoh,StephanieHsu
2,The Batman,MattReeves,7.9,Action,176,2022,RobertPattinson,ZoëKravitz
2,The Batman,MattReeves,7.9,Crime,176,2022,RobertPattinson,ZoëKravitz
2,The Batman,MattReeves,7.9,Drama,176,2022,RobertPattinson,ZoëKravitz
3,Jurassic Park,StevenSpielberg,8.2,Action,127,1993,SamNeill,LauraDern
3,Jurassic Park,StevenSpielberg,8.2,Adventure,127,1993,SamNeill,LauraDern


In [45]:
films_clean = films_clean.to_pickle('data/films_clean_data.pkl')

In [46]:
films_genre_final = films_genre_final.to_pickle('data/films_genre_data.pkl')