In [147]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# Importing the movie dataset

movie = pd.read_csv('../../data/movie.metadata.tsv', sep= '\t', header=None)

movie.columns = ['WikiID', 'FreebaseID', 'Name', 'ReleaseDate',
                     'Revenue', 'Runtime', 'Languages_tuple', 'Countries_tuples', 'Genres_tuples']

movie.head()


Unnamed: 0,WikiID,FreebaseID,Name,ReleaseDate,Revenue,Runtime,Languages_tuple,Countries_tuples,Genres_tuples
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [148]:
print(movie.shape)
print(movie.dtypes)

(81741, 9)
WikiID                int64
FreebaseID           object
Name                 object
ReleaseDate          object
Revenue             float64
Runtime             float64
Languages_tuple      object
Countries_tuples     object
Genres_tuples        object
dtype: object


In [149]:
movie['Name'].iloc[0]

'Ghosts of Mars'

In [150]:
#a lot of missing values 

print((pd.isna(movie.iloc[:,:6]).sum())/len(movie))
print((movie.iloc[:, 6:] == '{}').sum()/len(movie))



WikiID         0.000000
FreebaseID     0.000000
Name           0.000000
ReleaseDate    0.084437
Revenue        0.897224
Runtime        0.250180
dtype: float64
Languages_tuple     0.169633
Countries_tuples    0.099754
Genres_tuples       0.028064
dtype: float64


 - We miss 90% of revenues, if we want to use we need to integrate them from another dataset.
 



In [151]:
#turn the release date into a datetime object
movie['ReleaseDate'] = pd.to_datetime(movie['ReleaseDate'], errors='coerce')


In [152]:
#The tuple in the dataset are dictionary in string format.
# Need a function to convert them to dictionary
import ast

def conv_to_dict(val): 
    try:
        return ast.literal_eval(val) if pd.notna(val) else {}
    except (ValueError, SyntaxError):
        return {}
    
movie['Languages'] = movie['Languages_tuple'].apply(conv_to_dict)
movie['Countries'] = movie['Countries_tuples'].apply(conv_to_dict)
movie['Genres'] = movie['Genres_tuples'].apply(conv_to_dict)    


In [153]:
# Now we can work directly with the dictionaries inside the columns
#Ex listing the languages for each movie
movie['Languages'].apply(lambda x: list(x.values())).head(25)

0                  [English Language]
1                  [English Language]
2                [Norwegian Language]
3                  [English Language]
4                   [German Language]
5     [Silent film, English Language]
6                  [English Language]
7                  [English Language]
8                  [Spanish Language]
9                  [English Language]
10                 [English Language]
11                [Japanese Language]
12                 [English Language]
13                 [English Language]
14                 [Turkish Language]
15                 [English Language]
16                                 []
17                 [English Language]
18                  [German Language]
19                 [English Language]
20                 [English Language]
21                 [English Language]
22                                 []
23                  [German Language]
24                 [Russian Language]
Name: Languages, dtype: object

In [154]:
#importing the movies plot dataset

plots = pd.read_csv('../../data/plot_summaries.txt', sep='\t', header=None)
plots.columns = ['WikiID', 'Plot']

plots.head()

Unnamed: 0,WikiID,Plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [155]:

#importing the character dataset

characters = pd.read_csv('../../data/character.metadata.tsv', sep= '\t', header=None)
characters.columns = ['WikiMovieID', 'FreebaseMovieID', 'MovieReleaseDate','CharacterName',
                      'ActorBirthDate', 'ActorGender', 'ActorHeight','ActorEthnicity', 
                      'ActorName', 'ActoreAge', 'CharacterActorMap', 'CharacterID', 'ActorID']

characters.head()


Unnamed: 0,WikiMovieID,FreebaseMovieID,MovieReleaseDate,CharacterName,ActorBirthDate,ActorGender,ActorHeight,ActorEthnicity,ActorName,ActoreAge,CharacterActorMap,CharacterID,ActorID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [156]:
characters.dtypes

WikiMovieID            int64
FreebaseMovieID       object
MovieReleaseDate      object
CharacterName         object
ActorBirthDate        object
ActorGender           object
ActorHeight          float64
ActorEthnicity        object
ActorName             object
ActoreAge            float64
CharacterActorMap     object
CharacterID           object
ActorID               object
dtype: object

In [157]:
characters['MovieReleaseDate'] = pd.to_datetime(characters['MovieReleaseDate'], errors='coerce')

In [158]:
pd.isna(characters).sum()/len(characters)

WikiMovieID          0.000000
FreebaseMovieID      0.000000
MovieReleaseDate     0.392299
CharacterName        0.572205
ActorBirthDate       0.235528
ActorGender          0.101203
ActorHeight          0.656457
ActorEthnicity       0.764665
ActorName            0.002725
ActoreAge            0.350841
CharacterActorMap    0.000000
CharacterID          0.572183
ActorID              0.001808
dtype: float64

In [159]:
#import the name clusters

name_clusters = pd.read_csv('../../data/name.clusters.txt', sep='\t', header=None)

name_clusters.columns = ['Instaces', 'CharacterActorMap']

print(name_clusters.shape)
name_clusters.head()

(2666, 2)


Unnamed: 0,Instaces,CharacterActorMap
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn


In [160]:
#import the tvtropes clusters

tvtropes_clusters = pd.read_csv('../../data/tvtropes.clusters.txt', sep='\t', header=None)
tvtropes_clusters.columns = ['Instaces', 'Dictionary_str']

tvtropes_clusters.shape


(501, 2)

In [162]:
tvtropes_clusters['Dictionary'] = tvtropes_clusters['Dictionary_str'].apply(conv_to_dict)

#create column for each key in the dictionary

c = tvtropes_clusters['Dictionary'].apply(pd.Series)

tvtropes_new = pd.concat([tvtropes_clusters['Instaces'], c], axis=1)


In [164]:
tvtropes_new.head()

Unnamed: 0,Instaces,char,movie,id,actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader


In [165]:
pd.isna(tvtropes_new).sum()/len(tvtropes_new)

Instaces    0.0
char        0.0
movie       0.0
id          0.0
actor       0.0
dtype: float64