In [2]:
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Build a SQLAlchemy engine for the PostgreSQL database named "Source".
# The URL carries no host, user, or password, so those are left to the
# driver/environment defaults.
engine = create_engine('postgresql:///Source')

In [4]:
# Pull every column of the movies table into a DataFrame.
query = 'SELECT * FROM movies'
df = pd.read_sql(sql=query, con=engine)

In [5]:
# Sanity-check the load: (number of rows, number of columns).
df.shape

(1000, 17)

In [6]:
# Eyeball five randomly chosen rows. No random_state is passed, so a different
# set of rows is shown on every run.
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
899,898,https://m.media-amazon.com/images/M/MV5BMTgwOD...,Dawn of the Planet of the Apes,2014,UA,130 min,"Action, Adventure, Drama",7.6,A growing nation of genetically evolved apes l...,79.0,Matt Reeves,Gary Oldman,Keri Russell,Andy Serkis,Kodi Smit-McPhee,411599,208545589.0
758,757,https://m.media-amazon.com/images/M/MV5BOTY4NT...,Frost/Nixon,2008,R,122 min,"Biography, Drama, History",7.7,A dramatic retelling of the post-Watergate tel...,80.0,Ron Howard,Frank Langella,Michael Sheen,Kevin Bacon,Sam Rockwell,103330,18593156.0
198,197,https://m.media-amazon.com/images/M/MV5BZGRkOG...,Koe no katachi,2016,16,130 min,"Animation, Drama, Family",8.1,A young man is ostracized by his classmates af...,78.0,Naoko Yamada,Miyu Irino,Saori Hayami,Aoi Yûki,Kenshô Ono,47708,
966,966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,PG,U,140 min,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933.0
621,617,https://m.media-amazon.com/images/M/MV5BYWUxZj...,Once,2007,R,86 min,"Drama, Music, Romance",7.8,A modern-day musical about a busker and an imm...,88.0,John Carney,Glen Hansard,Markéta Irglová,Hugh Walsh,Gerard Hendrick,110656,9439923.0


# Data Restructuring

1. Break up comma-separated values into separate rows.

In [7]:
# Print the name of every column in which at least one value, viewed as text,
# contains a comma. These are the candidates for multi-valued cells.
for name in df.columns:
    as_text = df[name].astype(str)
    if not as_text.str.contains(',').any():
        continue
    print(name)

Poster_Link
Series_Title
Genre
Overview
Gross


In [8]:
# Preview the comma-containing columns other than Genre to judge whether their
# commas separate multiple values or are simply part of a single value.
df[['Poster_Link', 'Series_Title', 'Overview', 'Gross']].head(5)

Unnamed: 0,Poster_Link,Series_Title,Overview,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,Two imprisoned men bond over a number of years...,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,An organized crime dynasty's aging patriarch t...,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,When the menace known as the Joker wreaks havo...,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,The early life and career of Vito Corleone in ...,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,A jury holdout attempts to prevent a miscarria...,4360000


Investigation shows that Genre is the only column holding multiple values per cell. All other columns contain commas for acceptable reasons, so value separation will be done for the Genre column alone.

In [9]:
# Turn each comma-separated Genre string into a list, then give every list
# element its own row (all other columns are repeated for each genre).
df = df.assign(Genre=df['Genre'].str.split(',')).explode('Genre')
# It is good practice to strip the exploded column as well: leftover whitespace
# creates fake distinct values. For example 'Drama' and ' Drama' would not be
# detected as duplicates and would cause issues later in the pipeline.
df = df.assign(Genre=df['Genre'].str.strip())

In [10]:
# Inspect the first rows after the explode: each movie now appears once per genre,
# and the repeated rows share the same index label.
df.head(10)

Unnamed: 0.1,Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Crime,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
1,1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Drama,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Action,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Crime,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Drama,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,Crime,9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
3,3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,Drama,9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,Crime,9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
4,4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,Drama,9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


2. Check for repeating columns of the same attribute.

Printing the list of columns alphabetically makes this easier to spot.

In [11]:
# List each column's dtype, ordered alphabetically by column name, so that
# columns sharing a name stem end up next to each other.
df.dtypes.sort_index()

Certificate       object
Director          object
Genre             object
Gross             object
IMDB_Rating      float64
Meta_score       float64
No_of_Votes        int64
Overview          object
Poster_Link       object
Released_Year     object
Runtime           object
Series_Title      object
Star1             object
Star2             object
Star3             object
Star4             object
Unnamed: 0         int64
dtype: object

It can be seen that the Star attribute repeats across Star1, Star2, Star3, and Star4; therefore, these four columns are broken up into separate rows.

In [12]:
# Reshape the four Star columns from wide to long: the column name goes into
# 'Actor' and the value into 'Actor_Name'. Columns named in neither list below
# (e.g. 'Unnamed: 0') are not carried into the result.
df = df.melt(
    id_vars=[
        'Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
        'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score',
        'Director', 'No_of_Votes', 'Gross',
    ],
    value_vars=['Star1', 'Star2', 'Star3', 'Star4'],
    var_name='Actor',
    value_name='Actor_Name',
)
# Order the long table by title so that all rows of a movie sit together.
df = df.sort_values(by='Series_Title', ascending=True)

pd.melt() is useful if you need to define your variable and value column names manually.

id_vars = a list of all the attributes to be kept as they are (i.e. left out of the unpivoting) by the melt() action. Listing an attribute here is mandatory if it needs to appear in the final table.

value_vars = a list of all the attributes to consider in the melt() action.

var_name = column name for all the attributes that will now be broken up and housed under one column.

value_name = column name for the values of the broken up attributes.

In [13]:
# Inspect the first rows of the melted, title-sorted frame.
df.head(10)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,Gross,Actor,Actor_Name
9535,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Drama,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star4,Chloë Grace Moretz
6993,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Comedy,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star3,Geoffrey Arend
6994,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Drama,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star3,Geoffrey Arend
6995,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Romance,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star3,Geoffrey Arend
1912,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Drama,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star1,Zooey Deschanel
1911,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Comedy,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star1,Zooey Deschanel
9536,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Romance,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star4,Chloë Grace Moretz
4454,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Romance,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star2,Joseph Gordon-Levitt
4453,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Drama,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star2,Joseph Gordon-Levitt
4452,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,Comedy,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374,Star2,Joseph Gordon-Levitt


After the column separation, the Actor column seems useless and may be deleted if necessary.

# Data Cleaning

1. Get rid of duplicated rows, if any.

In [14]:
# Remove rows that are identical across every column, keeping the first occurrence.
df = df.drop_duplicates()

2. Get rid of duplicated columns if any. A previous alphabetical printout of the column list already showed no duplicates.

3. Fill all null values: In an automated pipeline the scripts are designed to deal with whatever issues may arise. Therefore there is no need to check for specific cases of nulls before filling them. Instead we determine the data type of each column and decide what to fill any nulls with. This ties in with the next transformation step which is...

# Data Validation

1. Here we verify that values under a column fit the column's data type. Together with the previous step the script can be combined and designed as:
- decide each column's data type
- convert to decided data types if needed
- fill the nulls under each column with appropriate defaults
- flag any values that don't match the column data type

In [19]:
# Target data type for each column, keyed by column name. The conversion step
# reads this mapping to decide how every listed column should be cast.
schema = dict(
    Poster_Link='string',
    Series_Title='string',
    Released_Year='Int64',
    Certificate='string',
    Runtime='string',
    Genre='string',
    IMDB_Rating='float',
    Overview='string',
    Meta_score='float',
    Director='string',
    No_of_Votes='Int64',
    Gross='Int64',
)

In [15]:
# The Gross column is stored as text that may contain commas (presumably
# thousands separators). Since it will be converted to an integer, the commas
# must be removed first, otherwise the values cannot be parsed.
#
# - regex=False treats the comma as a literal on every pandas version instead of
#   relying on the version-dependent default of the `regex` argument.
# - The dtype guard lets this cell be re-run after Gross has already been
#   converted to a number; without it the .str accessor raises AttributeError.
if not pd.api.types.is_numeric_dtype(df['Gross']):
    df['Gross'] = df['Gross'].str.replace(',', '', regex=False)

In [16]:
# Convert each column to its preferred data type. Values that do not match the
# data type become null (errors="coerce"). Because nulls are replaced with
# defaults afterwards, the affected values are counted and reported here so
# that they are actually flagged instead of disappearing silently.
# Columns whose schema entry is 'string' have no branch below and are left as-is.
coerced_counts = {}
for col, dtype in schema.items():
    original = df[col]
    if dtype == 'Int64':
        converted = pd.to_numeric(original, errors="coerce").astype('Int64')
    elif dtype == 'float':
        converted = pd.to_numeric(original, errors="coerce")
    elif dtype == 'datetime':
        converted = pd.to_datetime(original, errors="coerce")
    else:
        continue

    # A value was invalid if it was present before the cast but is null after it.
    invalid = converted.isna() & original.notna()
    if invalid.any():
        coerced_counts[col] = int(invalid.sum())
    df[col] = converted

# Report the flagged columns; nothing is printed when every value converted cleanly.
for flagged_col, n_invalid in coerced_counts.items():
    print(f"{flagged_col}: {n_invalid} value(s) did not match dtype "
          f"{schema[flagged_col]} and were set to null")

In [17]:
# For efficiency group columns by dtype so that each group gets a single fill.
# 'string' is selected together with 'object' so that text columns stored with
# the pandas string dtype (the dtype named in the schema) are not skipped.
str_cols = df.select_dtypes(include=['object', 'string']).columns
num_cols = df.select_dtypes(include='number').columns
bool_cols = df.select_dtypes(include='bool').columns

# Apply default fills for the data type groups.
# NOTE(review): after this step a missing Released_Year, Meta_score or Gross is
# indistinguishable from a genuine 0 - confirm that 0 is the intended default.
df[num_cols] = df[num_cols].fillna(0)
df[str_cols] = df[str_cols].fillna('NA')
df[bool_cols] = df[bool_cols].fillna(False)

2. This is the point to check and address date format inconsistencies, if any (e.g. 2025-10-29 vs 10/29/2025), and to correct impossible dates (e.g. 2025-02-30).

This would normally fall under the data cleaning step but it is easier to run it at this step after data types have been confirmed, all error-values flagged, and all nulls have been filled.

3. Next, verify the logic of existing dates, if any: for example, check that end_date ≥ start_date, that all dates fall within a valid range, and that timestamps match expected formats.

4. Identify and separate the entities into their respective groups

In [20]:
# Movie entity: the movie-level attributes only, without the Genre and actor
# columns that were expanded into extra rows earlier.
movie_columns = [
    'Poster_Link', 'Series_Title', 'Released_Year', 'Certificate', 'Runtime',
    'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'No_of_Votes', 'Gross',
]
movies = df.loc[:, movie_columns]
movies.head(5)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,Gross
9535,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374
6993,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374
6994,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374
6995,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374
1912,https://m.media-amazon.com/images/M/MV5BMTk5Mj...,(500) Days of Summer,2009,UA,95 min,7.7,An offbeat romantic comedy about a woman who d...,76.0,Marc Webb,472242,32391374


Next, ensure that there are no duplicated business keys. Here the business keys will be Series_Title and Released_Year.

In [24]:
# Keep only the first row for each (Series_Title, Released_Year) business key.
repeated_key = movies.duplicated(subset=['Series_Title', 'Released_Year'], keep='first')
movies = movies.loc[~repeated_key]

Finally, repeat the above step for each of the entity groups before loading them to their tables.

# Observation

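After business key duplicates were removed from the movies dataset, there was still a duplicate found under the primary key (Series_Title). However, if the business key (Series_Title + Released_Year) doesn't confirm duplicates, then such findings are not real duplicates; they are either distinct records that happen to share a title or data entry issues. The rows below differ in year, certificate, runtime, and director.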

In [26]:
# Titles that still occur more than once; keep=False marks every occurrence,
# not just the later ones.
title_repeated = movies['Series_Title'].duplicated(keep=False)
movies.loc[title_repeated, 'Series_Title']

2760    Drishyam
2883    Drishyam
Name: Series_Title, dtype: object

In [27]:
# Show the full rows behind the repeated title so they can be compared side by side.
movies.loc[movies['Series_Title'].eq('Drishyam')]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,Gross
2760,https://m.media-amazon.com/images/M/MV5BYmY3Mz...,Drishyam,2013,U,160 min,8.3,A man goes to extreme lengths to save his fami...,50.0,Jeethu Joseph,30722,
2883,https://m.media-amazon.com/images/M/MV5BYmJhZm...,Drishyam,2015,UA,163 min,8.2,Desperate measures are taken by a man who trie...,50.0,Nishikant Kamat,70367,739478.0
