In [2]:
import pandas as pd
import json
import ast

### Loading raw csv file into dataframe and checking characteristics

In [11]:
# Load the raw credits export (path is relative to the notebook's working directory).
# NOTE: the name `credits` shadows the interpreter's built-in `credits` helper, so
# running a later cell before this one fails with a `_Printer` TypeError.
credits = pd.read_csv(filepath_or_buffer="data/raw/credits.csv")
# Preview the first rows only.
credits.head(n=5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


### Important note: 
The file contains two columns (`cast` and `crew`) that hold JSON-like objects. However, they are not valid JSON (for example, the strings use single quotes), so they must be transformed before the data can be unpacked and loaded into a database.

As a result, credits.csv has to be split into two separate files/tables, one for the `cast` column and one for the `crew` column.

### 1. Parsing "cast" column
After several unsuccessful attempts to unpack the JSON-like columns, the problem was traced to key:value pairs whose value is a bare `None`; the cell below rewrites them as the quoted string `'None'`.

In [10]:
# Quote every bare `None` value so it is parsed as the string 'None'.
# regex=False makes this a literal substring replacement, independent of the
# default `regex` setting of the installed pandas version.
# NOTE(review): ast.literal_eval accepts a bare None literal, so confirm this
# rewrite is still required -- it turns missing values into the text 'None'.
credits["cast"] = credits["cast"].str.replace(": None", ": 'None'", regex=False)
credits

TypeError: '_Printer' object is not subscriptable

In [4]:
# Parse each `cast` cell (a Python-literal string) into a list of dicts.
# Missing cells and strings that cannot be parsed become an empty list, so
# `cast_list` always has exactly one entry per row of `credits`.
cast_list = []

for raw_cast in credits['cast']:
    # Default used for missing cells and for malformed literals.
    parsed_cast = []
    if pd.notna(raw_cast):
        try:
            parsed_cast = ast.literal_eval(raw_cast)
        except (SyntaxError, ValueError):
            # Malformed literal: keep the empty default.
            parsed_cast = []
    cast_list.append(parsed_cast)

# Replace the raw strings with the parsed lists.
credits['cast'] = cast_list

We now have a list with the cast of each movie, but the data still needs to be linked to the movie id. The row index is used for this purpose.

In [6]:
# Build one frame per movie, tagged with the movie's row position in `credits`
# ("index"), which is later used as the join key to recover the movie id.
cast_frames = []
for row_pos, members in enumerate(cast_list):
    movie_cast = pd.DataFrame(members)
    movie_cast["index"] = row_pos
    cast_frames.append(movie_cast)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration (quadratic in the number of rows).
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
cast_df = (
    pd.concat(cast_frames, axis=0, ignore_index=True)
    if cast_frames
    else pd.DataFrame()
)

cast_df

Unnamed: 0,cast_id,character,credit_id,gender,id,name,order,profile_path,index
0,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,0
1,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg,0
2,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg,0
3,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg,0
4,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg,0
...,...,...,...,...,...,...,...,...,...
562469,2.0,,52fe4ea59251416c7515d7d5,2.0,544742.0,Iwan Mosschuchin,0.0,,45474
562470,3.0,,52fe4ea59251416c7515d7d9,1.0,1090923.0,Nathalie Lissenko,1.0,,45474
562471,4.0,,52fe4ea59251416c7515d7dd,2.0,1136422.0,Pavel Pavlov,2.0,,45474
562472,5.0,,52fe4ea59251416c7515d7e1,0.0,1261758.0,Aleksandr Chabrov,3.0,,45474


In [5]:
# Expose the row position of `credits` as an "index" column next to the movie id.
movies_ids = credits.loc[:, ["id"]].reset_index()
# Left join keeps every cast row; because both sides have an `id` column,
# pandas suffixes them as id_x (cast side) and id_y (credits side).
cast_df = cast_df.merge(movies_ids, on="index", how="left")
cast_df

Unnamed: 0,cast_id,character,credit_id,gender,id_x,name,order,profile_path,index,id_y
0,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,0,862
1,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg,0,862
2,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg,0,862
3,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg,0,862
4,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg,0,862
...,...,...,...,...,...,...,...,...,...,...
562469,2.0,,52fe4ea59251416c7515d7d5,2.0,544742.0,Iwan Mosschuchin,0.0,,45474,227506
562470,3.0,,52fe4ea59251416c7515d7d9,1.0,1090923.0,Nathalie Lissenko,1.0,,45474,227506
562471,4.0,,52fe4ea59251416c7515d7dd,2.0,1136422.0,Pavel Pavlov,2.0,,45474,227506
562472,5.0,,52fe4ea59251416c7515d7e1,0.0,1261758.0,Aleksandr Chabrov,3.0,,45474,227506


In [12]:
# Give the suffixed merge columns their final names: id_x came from the cast
# records, id_y from `credits` (the movie id).
cast_id_columns = {'id_x': 'id', 'id_y': 'movie_id'}
cast_df = cast_df.rename(columns=cast_id_columns)
cast_df

Unnamed: 0,cast_id,character,credit_id,gender,id,name,order,profile_path,index,movie_id
0,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,0,862
1,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg,0,862
2,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg,0,862
3,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg,0,862
4,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg,0,862
...,...,...,...,...,...,...,...,...,...,...
562469,2.0,,52fe4ea59251416c7515d7d5,2.0,544742.0,Iwan Mosschuchin,0.0,,45474,227506
562470,3.0,,52fe4ea59251416c7515d7d9,1.0,1090923.0,Nathalie Lissenko,1.0,,45474,227506
562471,4.0,,52fe4ea59251416c7515d7dd,2.0,1136422.0,Pavel Pavlov,2.0,,45474,227506
562472,5.0,,52fe4ea59251416c7515d7e1,0.0,1261758.0,Aleksandr Chabrov,3.0,,45474,227506


Since the `index` column was used only to perform the join, it is no longer needed.

In [4]:
# Remove the helper join key; `movie_id` now identifies the movie.
cast_df = cast_df.drop(columns='index')
cast_df

Unnamed: 0,cast_id,character,credit_id,gender,id,name,order,profile_path,movie_id
0,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,862
1,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg,862
2,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg,862
3,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg,862
4,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg,862
...,...,...,...,...,...,...,...,...,...
562469,2.0,,52fe4ea59251416c7515d7d5,2.0,544742.0,Iwan Mosschuchin,0.0,,227506
562470,3.0,,52fe4ea59251416c7515d7d9,1.0,1090923.0,Nathalie Lissenko,1.0,,227506
562471,4.0,,52fe4ea59251416c7515d7dd,2.0,1136422.0,Pavel Pavlov,2.0,,227506
562472,5.0,,52fe4ea59251416c7515d7e1,0.0,1261758.0,Aleksandr Chabrov,3.0,,227506


### Saving the output to the dedicated directory

In [5]:
# Write the flattened cast table; index=False keeps the row index out of the file.
cast_df.to_csv(path_or_buf='data/ready_for_ingestion/cast.csv', index=False)

### 2. Parsing "crew" column

In [11]:
# Quote every bare `None` value so it is parsed as the string 'None'.
# regex=False makes this a literal substring replacement, independent of the
# default `regex` setting of the installed pandas version.
# NOTE(review): ast.literal_eval accepts a bare None literal, so confirm this
# rewrite is still required -- it turns missing values into the text 'None'.
credits["crew"] = credits["crew"].str.replace(": None", ": 'None'", regex=False)
credits

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [13]:
# Parse each `crew` cell (a Python-literal string) into a list of dicts.
# Missing cells and strings that cannot be parsed become an empty list, so
# `crew_list` always has exactly one entry per row of `credits`.
crew_list = []

for entry in credits['crew']:
    if pd.notna(entry):
        try:
            crew_data = ast.literal_eval(entry)
            crew_list.append(crew_data)
        except (SyntaxError, ValueError):
            # Malformed literal: keep the row, but with no crew members.
            crew_list.append([])
    else:
        crew_list.append([])

# Store the parsed lists back on `credits`, mirroring the cast section.
# The original assigned to `df`, a leftover per-movie frame from another cell
# whose length does not match `crew_list`.
credits['crew'] = crew_list

In [14]:
# Build one frame per movie, tagged with the movie's row position in `credits`
# ("index"), which is later used as the join key to recover the movie id.
crew_frames = []
for row_pos, members in enumerate(crew_list):
    movie_crew = pd.DataFrame(members)
    movie_crew["index"] = row_pos
    crew_frames.append(movie_crew)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration (quadratic in the number of rows).
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
crew_df = (
    pd.concat(crew_frames, axis=0, ignore_index=True)
    if crew_frames
    else pd.DataFrame()
)

crew_df.head()

Unnamed: 0,credit_id,department,gender,id,job,name,profile_path,index
0,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg,0
1,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg,0
2,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg,0
3,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg,0
4,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg,0


In [16]:
# Reads back a previously exported crew table.
# NOTE(review): this file is only written by the final cell of the notebook, so on
# a fresh run with no earlier export this read will fail. `crew_result` is not
# referenced by any other visible cell -- confirm whether this cell is still needed.
crew_result = pd.read_csv('data/ready_for_ingestion/crew.csv')

In [12]:
# Expose the row position of `credits` as an "index" column next to the movie id.
movies_ids = credits.loc[:, ["id"]].reset_index()
# Left join keeps every crew row; because both sides have an `id` column,
# pandas suffixes them as id_x (crew side) and id_y (credits side).
crew_df = crew_df.merge(movies_ids, on="index", how="left")
crew_df

Unnamed: 0,credit_id,department,gender,id_x,job,name,profile_path,index,id_y
0,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg,0,862
1,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg,0,862
2,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg,0,862
3,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg,0,862
4,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg,0,862
...,...,...,...,...,...,...,...,...,...
464309,52fe4776c3a368484e0c8399,Sound,0.0,549356.0,Original Music Composer,Richard McHugh,,45473,67758
464310,52fe4776c3a368484e0c839f,Camera,2.0,58818.0,Director of Photography,João Fernandes,,45473,67758
464311,533bccebc3a36844cf0011a7,Directing,0.0,1085341.0,Director,Yakov Protazanov,/yyjbGdCs2ZN6IlZNCfmBWyuRDlt.jpg,45474,227506
464312,58ebbc26925141281908aa0a,Production,2.0,1195656.0,Producer,Joseph N. Ermolieff,,45474,227506


In [13]:
# Give the suffixed merge columns their final names: id_x came from the crew
# records, id_y from `credits` (the movie id).
crew_id_columns = {'id_x': 'id', 'id_y': 'movie_id'}
crew_df = crew_df.rename(columns=crew_id_columns)
crew_df

Unnamed: 0,credit_id,department,gender,id,job,name,profile_path,index,movie_id
0,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg,0,862
1,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg,0,862
2,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg,0,862
3,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg,0,862
4,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg,0,862
...,...,...,...,...,...,...,...,...,...
464309,52fe4776c3a368484e0c8399,Sound,0.0,549356.0,Original Music Composer,Richard McHugh,,45473,67758
464310,52fe4776c3a368484e0c839f,Camera,2.0,58818.0,Director of Photography,João Fernandes,,45473,67758
464311,533bccebc3a36844cf0011a7,Directing,0.0,1085341.0,Director,Yakov Protazanov,/yyjbGdCs2ZN6IlZNCfmBWyuRDlt.jpg,45474,227506
464312,58ebbc26925141281908aa0a,Production,2.0,1195656.0,Producer,Joseph N. Ermolieff,,45474,227506


In [14]:
# Remove the helper join key; `movie_id` now identifies the movie.
crew_df = crew_df.drop(columns='index')
crew_df

Unnamed: 0,credit_id,department,gender,id,job,name,profile_path,movie_id
0,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg,862
1,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg,862
2,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg,862
3,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg,862
4,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg,862
...,...,...,...,...,...,...,...,...
464309,52fe4776c3a368484e0c8399,Sound,0.0,549356.0,Original Music Composer,Richard McHugh,,67758
464310,52fe4776c3a368484e0c839f,Camera,2.0,58818.0,Director of Photography,João Fernandes,,67758
464311,533bccebc3a36844cf0011a7,Directing,0.0,1085341.0,Director,Yakov Protazanov,/yyjbGdCs2ZN6IlZNCfmBWyuRDlt.jpg,227506
464312,58ebbc26925141281908aa0a,Production,2.0,1195656.0,Producer,Joseph N. Ermolieff,,227506


### Saving the output to the dedicated directory

In [17]:
# Write the flattened crew table; index=False keeps the row index out of the file.
crew_df.to_csv(path_or_buf="data/ready_for_ingestion/crew.csv", index=False)