In [30]:
""""
This script uses a dataset from Kaggle. Various cleaning methods will be applied to the data.

Author: Chris Worden

"""


from pathlib import Path

import numpy as np
import pandas as pd

# Read the Kaggle movies dataset. movies.csv can be downloaded from
# "https://www.kaggle.com/datasets/bharatnatrayn/movies-dataset-for-feature-extracion-prediction?select=movies.csv"
# The file is expected in the current user's Downloads folder; the path is
# built from Path.home() so the notebook is not tied to one user's machine.
# Edit MOVIES_CSV if the file lives somewhere else.
MOVIES_CSV = Path.home() / "Downloads" / "movies.csv"

df = pd.read_csv(MOVIES_CSV)

df




Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,
9995,Arcane,(2021– ),"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,
9996,Heart of Invictus,(2022– ),"\nDocumentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,
9997,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,


The data above has a few things that I will change. For example, many of the strings in columns such as "GENRE" contain the newline character '\n'. The script will remove characters that provide
no value. Also, the "STARS" column contains the director as well as the actors. I am going to create a new column based solely on the director. This way we can separate the actors and directors.

In [31]:
# Remove the newline character '\n' from the text columns. Each cell is first
# checked to be a string: missing values are floats (NaN) and have no
# str.replace method, so they are passed through unchanged.
text_columns = ['GENRE', 'ONE-LINE', 'STARS']
df[text_columns] = df[text_columns].map(
    lambda value: value.replace('\n', '') if isinstance(value, str) else value
)

# Extract the director name(s) with a regular expression: everything after
# "Director:" (or the plural "Directors:") up to the "|" separator.
# Rows with no such label get NaN.
director = df['STARS'].str.extract(r'Directors?:\s*([^|]*)', expand=False)

# Trim surrounding whitespace and replace empty matches with a missing value.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on df['DIRECTOR']: that in-place call operates on
# an intermediate object and is not guaranteed to update df.
df['DIRECTOR'] = director.str.strip().replace('', pd.NA)

# Remove the literal "Stars:" / "Star:" labels from the 'STARS' column.
# regex=False makes the literal (non-pattern) match explicit.
df['STARS'] = (
    df['STARS']
    .str.replace('Stars:', '', regex=False)
    .str.replace('Star:', '', regex=False)
)
def extract_director(stars):
    """Return a STARS entry with the leading director segment removed.

    Despite its name, this function does not return the director: it drops
    everything up to and including the first "|" character and returns what
    follows it.

    Parameters
    ----------
    stars : str or any
        A single cell of the 'STARS' column. Non-string values (for example
        NaN for a missing cell) are returned unchanged instead of raising
        ``TypeError`` on the ``'|' in stars`` membership test.

    Returns
    -------
    str or any
        The text after the first "|" when one is present, otherwise the whole
        input; surrounding whitespace is stripped in both cases so the column
        is formatted consistently.
    """
    if not isinstance(stars, str):
        return stars

    # partition() splits on the first "|" only; `separator` is empty when
    # the entry has no "|" at all.
    _, separator, remainder = stars.partition('|')
    if separator:
        return remainder.strip()
    return stars.strip()

# Drop the leading director segment from every 'STARS' entry. A single pass
# is sufficient; running the same statement again would be redundant and
# would discard further text from any value that still contained a "|".
df['STARS'] = df['STARS'].apply(extract_director)

df


Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,DIRECTOR
0,Blood Red Sky,(2021),"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,"Peri Baumeister, Carl Anton Koch, Alexander Sc...",21062,121.0,,Peter Thorwarth
1,Masters of the Universe: Revelation,(2021– ),"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"Chris Wood, Sarah Michelle Gel...",17870,25.0,,
2,The Walking Dead,(2010–2022),"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Andrew Lincoln, Norman Reedus,...",885805,44.0,,
3,Rick and Morty,(2013– ),"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"Justin Roiland, Chris Parnell,...",414849,23.0,,
4,Army of Thieves,(2021),"Action, Crime, Horror",,"A prequel, set before the events of Army of th...","Matthias Schweighöfer, Nathalie Emmanuel, Ruby...",,,,Matthias Schweighöfer
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,(2021– ),"Adventure, Drama, Fantasy",,Add a Plot,"Morgan Taylor Campbell, Chris ...",,,,
9995,Arcane,(2021– ),"Animation, Action, Adventure",,Add a Plot,,,,,
9996,Heart of Invictus,(2022– ),"Documentary, Sport",,Add a Plot,Star:Prince Harry,,,,Orlando von Einsiedel
9997,The Imperfects,(2021– ),"Adventure, Drama, Fantasy",,Add a Plot,"Morgan Taylor Campbell, Iñaki Godoy, Rhianna J...",,,,Jovanka Vuckovic


The following shows a way to view the data ordered by the values of a column, for example the highest-rated titles.

In [32]:
# Keep only the rows that have a rating, then take the ten rows with the
# highest 'RATING' values.
top_10_movies = (
    df.loc[df['RATING'].notna()]
    .nlargest(n=10, columns='RATING')
)

print(top_10_movies)

# More views like this could be added, but they become redundant at a certain point.

                                   MOVIES         YEAR  \
7640                      BoJack Horseman  (2014–2020)   
8510           Avatar: The Last Airbender  (2005–2008)   
8509           Avatar: The Last Airbender  (2005–2008)   
9892                               Dexter  (2006–2013)   
7159           Avatar: The Last Airbender  (2005–2008)   
8541                  The Midnight Gospel     (2020– )   
8629   She-Ra and the Princesses of Power  (2018–2020)   
8637   She-Ra and the Princesses of Power  (2018–2020)   
8767                                 Dark  (2017–2020)   
6485           Avatar: The Last Airbender  (2005–2008)   

                                         GENRE  RATING  \
7640      Animation, Comedy, Drama                 9.9   
8510  Animation, Action, Adventure                 9.9   
8509  Animation, Action, Adventure                 9.8   
9892         Crime, Drama, Mystery                 9.8   
7159  Animation, Action, Adventure                 9.7   
8541  Animati