PANDAS Package

In [None]:
#  !pip install pandas

In Pandas, there are two main data structures - Series and Dataframes.

A Series is a one-dimensional array-like object that can hold any data type — be it integers, floats, strings, or even Python objects.
It is essentially a column in an Excel sheet or a field in a SQL table. Along with the data, a Series also includes an index.

In [1]:
import pandas as pd

In [7]:
s = pd.Series([1,2,3,4,5])
print(s)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [9]:
s2 = pd.Series([1,2,3,4,5], index = ['a', 'b', 'c', 'd', 'e'])
print(s2)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [11]:
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3})
print(s3)

a    1
b    2
c    3
dtype: int64


In [13]:
# access 1st in original Series s
print("1st element:", s[0])

1st element: 1


Dataframes

In [16]:
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Chris", "Dave", "Edgar", "Francis", "George"],
    "Age": [25, 30, 35, 28, 31, 38, 28],
    "Occupation": ["Engineer", "Doctor", "Artist", "Doctor", "Engineer", "Doctor", "Artist"]
})
print(df)

      Name  Age Occupation
0    Alice   25   Engineer
1      Bob   30     Doctor
2    Chris   35     Artist
3     Dave   28     Doctor
4    Edgar   31   Engineer
5  Francis   38     Doctor
6   George   28     Artist


In [18]:
# access a dataframe column using its column name
#in Pandas, print Name column:
df["Name"]

0      Alice
1        Bob
2      Chris
3       Dave
4      Edgar
5    Francis
6     George
Name: Name, dtype: object

In [20]:
df["Age"]

0    25
1    30
2    35
3    28
4    31
5    38
6    28
Name: Age, dtype: int64

In [22]:
#access row value by label
# use .loc[]
print(df.loc[0])

Name             Alice
Age                 25
Occupation    Engineer
Name: 0, dtype: object


In [24]:
#access row value by index
#use .iloc[]
print(df.iloc[2])

Name           Chris
Age               35
Occupation    Artist
Name: 2, dtype: object


In [26]:
#Filtering
#get rows where Age is greater than 28
filtered_df = df[df["Age"] > 28]
print(filtered_df)

      Name  Age Occupation
1      Bob   30     Doctor
2    Chris   35     Artist
4    Edgar   31   Engineer
5  Francis   38     Doctor


In [28]:
filtered_df2 = df[df["Occupation"] == "Doctor"]
print(filtered_df2)

      Name  Age Occupation
1      Bob   30     Doctor
3     Dave   28     Doctor
5  Francis   38     Doctor


In [30]:
#filter using AND operator &
fil_df = df[(df["Occupation"] == "Doctor") & (df["Age"] >= 30)]
print(fil_df)

      Name  Age Occupation
1      Bob   30     Doctor
5  Francis   38     Doctor


In [32]:
#filter using OR operator |
fil_df2 = df[(df["Occupation"] == "Doctor") | (df["Age"] >= 30)]
print(fil_df2)

      Name  Age Occupation
1      Bob   30     Doctor
2    Chris   35     Artist
3     Dave   28     Doctor
4    Edgar   31   Engineer
5  Francis   38     Doctor


In [34]:
#add a column to our dataframe
df["Salary"] = [90000, 70000, 80000, 75000, 95000, 100000, 75000]

In [36]:
df

Unnamed: 0,Name,Age,Occupation,Salary
0,Alice,25,Engineer,90000
1,Bob,30,Doctor,70000
2,Chris,35,Artist,80000
3,Dave,28,Doctor,75000
4,Edgar,31,Engineer,95000
5,Francis,38,Doctor,100000
6,George,28,Artist,75000


In [38]:
#drop a column from a dataframe
#axis = 1 will designate a column, versus axis = 0 specifies rows
df = df.drop("Salary", axis = 1)

In [40]:
df

Unnamed: 0,Name,Age,Occupation
0,Alice,25,Engineer
1,Bob,30,Doctor
2,Chris,35,Artist
3,Dave,28,Doctor
4,Edgar,31,Engineer
5,Francis,38,Doctor
6,George,28,Artist


In [42]:
#sorting df
sorted_df = df.sort_values(by = "Age", ascending = False)
print(sorted_df)

      Name  Age Occupation
5  Francis   38     Doctor
2    Chris   35     Artist
4    Edgar   31   Engineer
1      Bob   30     Doctor
3     Dave   28     Doctor
6   George   28     Artist
0    Alice   25   Engineer


In [44]:
# use group by for aggregation to find average age by occupation
grouped_df_age = df.groupby("Occupation")["Age"].mean()
print(grouped_df_age)

Occupation
Artist      31.5
Doctor      32.0
Engineer    28.0
Name: Age, dtype: float64


In [46]:
#Measures of Central Tendency
#average
mean_val = df["Age"].mean()
#median - center value in a list when ordered ascending
median_val = df["Age"].median()
#mode - the most common value occuring in our column
mode_val = df["Age"].mode()[0]

print(f"Mean: {mean_val}")
print(f"Median: {median_val}")
print(f"Mode: {mode_val}")

Mean: 30.714285714285715
Median: 30.0
Mode: 28


In [48]:
#variation 
#Variance measures variability from the average or mean
variance_val = df["Age"].var()
#Standard deviation - a statistical measure that quantifies the amount of variation or dispersion of a set of data values around its mean (average)
std_dev_val = df["Age"].std()
# Range is the difference between the highest and lowest values in a dataset.
range_val = df["Age"].max() - df["Age"].min()

#IQR
#The Interquartile Range (IQR) is a measure of statistical dispersion, representing the spread of the middle 50% of a dataset. 
# It's calculated as the difference between the third quartile (Q3) and the first quartile (Q1) of a dataset.

# Calculate the first quartile (Q1) -  value below which 25% of the data points fall when the data is arranged in ascending order
q1 = df["Age"].quantile(0.25)

# Calculate the third quartile (Q3) -  value below which 75% of the data points fall when the data is arranged in ascending order
q3 = df["Age"].quantile(0.75)

#Calculate the IQR
iqr = q3 - q1

print(f"Variance: {variance_val}")
print(f"Standard Deviation: {std_dev_val}")
print(f"Range: {range_val}")
print(f"IQR: {iqr}")

Variance: 19.904761904761905
Standard Deviation: 4.46147530585589
Range: 13
IQR: 5.0


Import dataset


In [53]:
#import csv
df = pd.read_csv("movies.csv", encoding = "utf-8")

In [55]:
df


Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...,...
995,995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.0,,37822,"A scientist finds a way of becoming invisible,..."


In [None]:
# import Excel file
!pip install openpyxl

In [54]:
df = pd.read_excel("movies.xlsx", engine = "openpyxl")

In [56]:
#see top 5 rows
df.head()

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...


In [58]:
#see top 10 rows
df.head(10)

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
5,5,The Lord of the Rings: The Return of the King,2003,201,9.0,94.0,377.85,1904166,Gandalf and Aragorn lead the World of Men agai...
6,6,The Godfather Part II,1974,202,9.0,90.0,57.3,1314609,The early life and career of Vito Corleone in ...
7,7,Spider-Man: Across the Spider-Verse,2023,140,8.9,86.0,15.0,198031,"Miles Morales catapults across the Multiverse,..."
8,8,Pulp Fiction,1994,154,8.9,95.0,107.93,2131189,"The lives of two mob hitmen, a boxer, a gangst..."
9,9,Inception,2010,148,8.8,74.0,292.58,2444816,A thief who steals corporate secrets through t...


In [60]:
#look at last 5 rows
df.tail()

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
995,995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fianÃ§ailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.0,,37822,"A scientist finds a way of becoming invisible,..."
999,999,Celda 211,2009,113,7.6,,,69464,The story of two men on different sides of a p...


In [62]:
#information about each column - number of rows with real values and type of data in each row
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1000 non-null   int64  
 1   Movie Name          1000 non-null   object 
 2   Year of Release     1000 non-null   object 
 3   Watch Time          1000 non-null   int64  
 4   Movie Rating        1000 non-null   float64
 5   Metascore of movie  845 non-null    float64
 6   Gross               838 non-null    object 
 7   Votes               1000 non-null   object 
 8   Description         1000 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 70.4+ KB


In [64]:
#get statistical information about the numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,Watch Time,Movie Rating,Metascore of movie
count,1000.0,1000.0,1000.0,845.0
mean,499.5,124.253,7.9702,79.011834
std,288.819436,28.800355,0.275732,11.9738
min,0.0,45.0,7.6,28.0
25%,249.75,103.0,7.8,71.0
50%,499.5,120.0,7.9,80.0
75%,749.25,139.0,8.1,88.0
max,999.0,321.0,9.3,100.0


In [66]:
#check for missing values in df
df.isnull()

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,True,False,False


In [68]:
#also use to find missing values:
df.isna()

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,True,False,False


In [70]:
#find the number of null values in each column
df.isnull().sum()

Unnamed: 0              0
Movie Name              0
Year of Release         0
Watch Time              0
Movie Rating            0
Metascore of movie    155
Gross                 162
Votes                   0
Description             0
dtype: int64

In [72]:
#duplicate rows in whole df
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [74]:
#count number of duplicate rows in whole df
df.duplicated().sum()

0

In [76]:
#count how many rows are duplicates in a single column
df["Year of Release"].duplicated().sum()

877

In [78]:
#see how often each value appears in a column.  For example, how many movies from each year?
df["Year of Release"].value_counts()

Year of Release
2014        28
2004        28
2013        26
2001        25
2019        25
            ..
1922         1
1930         1
1920         1
1926         1
III 2018     1
Name: count, Length: 123, dtype: int64

In [80]:
#to grab a single column
df["Movie Name"]

0              The Shawshank Redemption
1                         The Godfather
2                       The Dark Knight
3                      Schindler's List
4                          12 Angry Men
                     ...               
995                           Philomena
996    Un long dimanche de fianÃ§ailles
997                               Shine
998                   The Invisible Man
999                           Celda 211
Name: Movie Name, Length: 1000, dtype: object

In [82]:
#more than one column, such as 2 columns
df[["Movie Name", "Watch Time"]]

Unnamed: 0,Movie Name,Watch Time
0,The Shawshank Redemption,142
1,The Godfather,175
2,The Dark Knight,152
3,Schindler's List,195
4,12 Angry Men,96
...,...,...
995,Philomena,98
996,Un long dimanche de fianÃ§ailles,133
997,Shine,105
998,The Invisible Man,71


In [84]:
# drop Unnamed: 0 column
df = df.drop("Unnamed: 0", axis = 1)

In [86]:
df


Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...
995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,Un long dimanche de fianÃ§ailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,The Invisible Man,1933,71,7.6,87.0,,37822,"A scientist finds a way of becoming invisible,..."


In [90]:
#sort movies by title by alphabet descending
sort_desc_df = df.sort_values("Movie Rating", ascending = False)
print(sort_desc_df)

                   Movie Name Year of Release  Watch Time  Movie Rating  \
0    The Shawshank Redemption            1994         142           9.3   
1               The Godfather            1972         175           9.2   
2             The Dark Knight            2008         152           9.0   
3            Schindler's List            1993         195           9.0   
4                12 Angry Men            1957          96           9.0   
..                        ...             ...         ...           ...   
952           My Cousin Vinny            1992         120           7.6   
951             Kung Fu Panda            2008          92           7.6   
950        Hell or High Water         II 2016         102           7.6   
949           Minority Report            2002         145           7.6   
999                 Celda 211            2009         113           7.6   

     Metascore of movie   Gross      Votes  \
0                  82.0   28.34  27,77,378   
1      

In [7]:
#unique values of Movie Ratings
unique_ratings = df["Movie Rating"].unique()
unique_ratings

array([9.3, 9.2, 9. , 8.9, 8.8, 8.7, 8.6, 8.5, 8.4, 8.3, 8.2, 8.1, 8. ,
       7.9, 7.8, 7.7, 7.6])

In [9]:
#number of unique ratings
num_ratings = df["Movie Rating"].nunique()
num_ratings

17

In [None]:
#FILTERING

In [11]:
#filter rows where Watch Time column has values greater than 100
df1 = df[df["Watch Time"] > 100]
df1

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
5,5,The Lord of the Rings: The Return of the King,2003,201,9.0,94.0,377.85,1904166,Gandalf and Aragorn lead the World of Men agai...
...,...,...,...,...,...,...,...,...,...
993,993,The Taking of Pelham One Two Three,1974,104,7.6,68.0,2.49,33299,Four armed men hijack a New York City subway c...
994,994,Control,2007,122,7.6,78.0,0.87,67244,"A profile of Ian Curtis, the enigmatic singer ..."
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."


In [13]:
#filter rows where Watch Time column has values greater than 100  AND where the Movie Rating is less than 8
df2 = df[(df["Watch Time"] >100) & (df["Movie Rating"] < 8)]
df2

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
472,472,Puss in Boots: The Last Wish,2022,102,7.9,73.0,168.46,146066,When Puss in Boots discovers that his passion ...
473,473,Titanic,1997,194,7.9,75.0,659.33,1238632,A seventeen-year-old aristocrat falls in love ...
474,474,Zack Snyder's Justice League,2021,242,7.9,54.0,,420074,Determined to ensure that Superman's ultimate ...
475,475,Avatar,2009,162,7.9,83.0,760.51,1355689,A paraplegic Marine dispatched to the moon Pan...
476,476,Arrival,II 2016,116,7.9,81.0,100.55,728588,A linguist works with the military to communic...
...,...,...,...,...,...,...,...,...,...
993,993,The Taking of Pelham One Two Three,1974,104,7.6,68.0,2.49,33299,Four armed men hijack a New York City subway c...
994,994,Control,2007,122,7.6,78.0,0.87,67244,"A profile of Ian Curtis, the enigmatic singer ..."
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."


In [15]:
#filter rows where Watch Time column has values greater than 100  OR where the Movie Rating is less than 8
df3 = df[(df["Watch Time"] >100) | (df["Movie Rating"] < 8)]
df3

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
5,5,The Lord of the Rings: The Return of the King,2003,201,9.0,94.0,377.85,1904166,Gandalf and Aragorn lead the World of Men agai...
...,...,...,...,...,...,...,...,...,...
995,995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.0,,37822,"A scientist finds a way of becoming invisible,..."


In [25]:
#GROUPBY
#average of Movie ratings for each year
year_ratings = df.groupby("Year of Release")["Movie Rating"].median()
year_ratings

Year of Release
1920        8.00
1921        8.30
1922        7.90
1924        8.20
1925        8.00
            ... 
II 2016     7.75
II 2018     8.50
II 2022     7.70
III 2016    8.10
III 2018    7.60
Name: Movie Rating, Length: 123, dtype: float64

In [27]:
#remove rows with any missing values
df1 = df1.dropna()

In [29]:
df1

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
5,5,The Lord of the Rings: The Return of the King,2003,201,9.0,94.0,377.85,1904166,Gandalf and Aragorn lead the World of Men agai...
...,...,...,...,...,...,...,...,...,...
992,992,21 Grams,2003,124,7.6,70.0,16.29,241941,A freak accident brings together a critically ...
993,993,The Taking of Pelham One Two Three,1974,104,7.6,68.0,2.49,33299,Four armed men hijack a New York City subway c...
994,994,Control,2007,122,7.6,78.0,0.87,67244,"A profile of Ian Curtis, the enigmatic singer ..."
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...


In [31]:
#Remove columns that have missing values
df2.dropna(axis = 1)

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Votes,Description
472,472,Puss in Boots: The Last Wish,2022,102,7.9,146066,When Puss in Boots discovers that his passion ...
473,473,Titanic,1997,194,7.9,1238632,A seventeen-year-old aristocrat falls in love ...
474,474,Zack Snyder's Justice League,2021,242,7.9,420074,Determined to ensure that Superman's ultimate ...
475,475,Avatar,2009,162,7.9,1355689,A paraplegic Marine dispatched to the moon Pan...
476,476,Arrival,II 2016,116,7.9,728588,A linguist works with the military to communic...
...,...,...,...,...,...,...,...
993,993,The Taking of Pelham One Two Three,1974,104,7.6,33299,Four armed men hijack a New York City subway c...
994,994,Control,2007,122,7.6,67244,"A profile of Ian Curtis, the enigmatic singer ..."
996,996,Un long dimanche de fiançailles,2004,133,7.6,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,55589,"Pianist David Helfgott, driven by his father a..."


In [33]:
#Fill in missing values
df4 = df

In [35]:
#fill in all missing values in the dataframe with "Unknown"
df4.fillna("Unknown")

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...,...
995,995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.0,Unknown,37822,"A scientist finds a way of becoming invisible,..."


In [37]:
df5 = df

In [39]:
df5["Gross"] = df5["Gross"].fillna(0)
df5

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...,...
995,995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.0,0,37822,"A scientist finds a way of becoming invisible,..."


In [41]:
#Fill missing values in column with a calculated value:
df5["Metascore of movie"] = df5["Metascore of movie"].fillna(df5["Metascore of movie"].mean())
df5

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.000000,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.000000,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.000000,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.000000,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.000000,4.36,824211,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...,...
995,995,Philomena,2013,98,7.6,77.000000,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.000000,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.000000,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.000000,0,37822,"A scientist finds a way of becoming invisible,..."


In [61]:
df6 = df.copy()



In [63]:
#Fill missing values in column with the most frequent value (good for categorical coluns):
df6["Gross"] = df6["Gross"].fillna(df6["Gross"].mode()[0])
df6

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
0,0,The Shawshank Redemption,1994,142,9.3,82.0,28.34,2777378,"Over the course of several years, two convicts..."
1,1,The Godfather,1972,175,9.2,100.0,134.97,1933588,"Don Vito Corleone, head of a mafia family, dec..."
2,2,The Dark Knight,2008,152,9.0,84.0,534.86,2754087,When the menace known as the Joker wreaks havo...
3,3,Schindler's List,1993,195,9.0,95.0,96.9,1397886,"In German-occupied Poland during World War II,..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
...,...,...,...,...,...,...,...,...,...
995,995,Philomena,2013,98,7.6,77.0,37.71,102336,A world-weary political journalist picks up th...
996,996,Un long dimanche de fiançailles,2004,133,7.6,76.0,6.17,75004,Tells the story of a young woman's relentless ...
997,997,Shine,1996,105,7.6,87.0,35.81,55589,"Pianist David Helfgott, driven by his father a..."
998,998,The Invisible Man,1933,71,7.6,87.0,0.01,37822,"A scientist finds a way of becoming invisible,..."


In [71]:
#top 20 highest gross films
top_films = df.sort_values(by="Movie Name", ascending = False).head(20)
top_films

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Gross,Votes,Description
301,301,Ôkami kodomo no Ame to Yuki,2012,117,8.1,71.0,0.01,47386,After her werewolf lover unexpectedly dies in ...
882,882,À bout de souffle,1960,90,7.7,,0.34,85255,A small-time thief steals a car and impulsivel...
878,878,Zwartboek,2006,145,7.7,71.0,4.4,78822,In the Nazi-occupied Netherlands during World ...
869,869,Zulu,1964,138,7.7,77.0,0.01,41266,Outnumbered British soldiers do battle with Zu...
361,361,Zootopia,2016,108,8.0,78.0,341.27,522567,"In a city of anthropomorphic animals, a rookie..."
...,...,...,...,...,...,...,...,...,...
98,98,2001: A Space Odyssey,1968,149,8.3,84.0,56.95,692341,After uncovering a mysterious artifact buried ...
150,150,1917,2019,119,8.2,78.0,159.23,633431,"April 6th, 1917. As an infantry battalion asse..."
251,251,12 Years a Slave,2013,134,8.1,96.0,56.67,721010,"In the antebellum United States, Solomon North..."
4,4,12 Angry Men,1957,96,9.0,97.0,4.36,824211,The jury in a New York City murder trial is fr...
