# Setup


In [1]:
import pandas as pd
import json

# Importing the datasets


In [2]:
# Locations of the raw tab-separated metadata dumps (Google Drive mount).
movies_path = "/content/drive/MyDrive/ADA/Project/data/movie.metadata.tsv"
character_path = "/content/drive/MyDrive/ADA/Project/data/character.metadata.tsv"

# The files ship without a header row, so the column names are supplied here.
movie_column_names = [
    "WikiID", "FreeID", "Title", "RelDate", "Revenue",
    "Runtime", "Languages", "Countries", "Genres",
]
character_column_names = [
    "WikiID", "FreeID", "MovieRelDate", "CharName", "DOB", "Gender", "Height",
    "Ethnicity", "Actor", "Age", "FreeMapID", "FreeCharID", "FreeActorID",
]

movies = pd.read_csv(
    movies_path,
    sep='\t',
    header=None,
    names=movie_column_names,
)
characters = pd.read_csv(
    character_path,
    sep='\t',
    header=None,
    names=character_column_names,
)

# Looking at the data

In [3]:
# Show the first rows of both tables, one rich output per frame.
display(movies.head(), characters.head())

Unnamed: 0,WikiID,FreeID,Title,RelDate,Revenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


Unnamed: 0,WikiID,FreeID,MovieRelDate,CharName,DOB,Gender,Height,Ethnicity,Actor,Age,FreeMapID,FreeCharID,FreeActorID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg




*   Note: `RelDate` doesn't have a standard format (some entries are full `YYYY-MM-DD` dates, others contain only the year)



# Cleaning Dictionaries
Columns `Languages`, `Countries`, `Genres` contain a Freebase ID + the actual name for each entry. For the moment, we keep just the name for the sake of clarity.

In [4]:
def extract_values(column):
  """Return the names stored in one JSON-encoded ``{id: name}`` cell.

  Parameters
  ----------
  column : str | list | None | float
    A single cell value. Normally a JSON object string such as
    ``'{"/m/02h40lc": "English Language"}'``.

  Returns
  -------
  list
    The dictionary values, in the order they appear in the JSON object.
    A cell that is already a list is returned as a new list with the same
    items, and a missing cell (``None`` or NaN) yields an empty list.

  Raises
  ------
  json.JSONDecodeError
    If a string cell is not valid JSON.
  """
  # Re-running the cleaning cell feeds already-parsed lists back in; passing
  # them through keeps the cell idempotent instead of failing in json.loads.
  if isinstance(column, list):
    return list(column)
  # NaN is the only float that is not equal to itself.
  if column is None or (isinstance(column, float) and column != column):
    return []
  return list(json.loads(column).values())

# Swap each JSON string for the list of names it contains.
movies["Languages"] = movies["Languages"].apply(extract_values)
movies["Countries"] = movies["Countries"].apply(extract_values)
movies["Genres"] = movies["Genres"].apply(extract_values)

In [7]:
# Check the cleaned columns on the first two movies.
display(
    movies.loc[:, ["Languages", "Countries", "Genres"]].head(2)
)

Unnamed: 0,Languages,Countries,Genres
0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"


# Duplicates

In [8]:
# Count, for every column, how many rows repeat a value seen in an earlier row.
for column in movies.columns:
  # After the cleaning step Languages/Countries/Genres hold Python lists, which
  # are unhashable and make Series.duplicated() raise TypeError. Converting
  # lists to tuples keeps the comparison semantics and leaves other values as is.
  hashable_values = movies[column].map(
      lambda value: tuple(value) if isinstance(value, list) else value
  )
  duplicated_rows = hashable_values.duplicated().sum()
  print("{} has {} duplicated rows".format(column, duplicated_rows))

WikiID has 0 duplicated rows
FreeID has 0 duplicated rows
Title has 6263 duplicated rows
RelDate has 61351 duplicated rows
Revenue has 74378 duplicated rows
Runtime has 81143 duplicated rows
Languages has 79924 duplicated rows
Countries has 79617 duplicated rows
Genres has 57924 duplicated rows


`WikiID` & `FreeID` -> Good

`Title` -> Bit concerning -> Check with `Runtime`


> ~~Runtime not enough. There are :~~

1.   ~~Movies with same runtime + Title but actually different~~
2.   ~~Movies with same runtime + Title but actually the same~~





In [None]:
# Rows sharing the title "Harlow".
movies.loc[movies["Title"] == "Harlow"]

Unnamed: 0,WikiID,FreeID,Title,RelDate,Revenue,Runtime,Languages,Countries,Genres
623,3670013,/m/09thsq,Harlow,1965-06-23,1000000.0,109.0,[English Language],[United States of America],"[Biographical film, Biography, Drama, Black-an..."
1223,27171821,/m/0bwklv0,Harlow,1965,,109.0,[],[United States of America],[Biographical film]


Wrong input for release date (1010->2010)

In [None]:
# Rows sharing the title "Hunting Season".
movies.loc[movies["Title"] == "Hunting Season"]

Unnamed: 0,WikiID,FreeID,Title,RelDate,Revenue,Runtime,Languages,Countries,Genres
62836,29666067,/m/0fphzrf,Hunting Season,1010-12-02,12160978.0,140.0,"[Turkish Language, English Language]",[Turkey],"[Crime Fiction, Mystery, Drama, Thriller]"


Actually the two `Harlow` entries are 2 different movies, BUT the first one (`WikiID` = 3670013) has a wrong runtime (correct = 125).

So far we couldn't find an example of a duplicated movie + there are no duplicated Wiki/Freebase IDs -> Let's trust the dataset 🙃

---