In [1]:
import pandas as pd

In [3]:
data = pd.read_csv("Google Drive/My Drive/Dropbox/Courses/Pandas_in_action_Manning/pandas-in-action-master/chapter_01_introducing_pandas/movies.csv")

In [5]:
# Get the first five rows of the data
data.head(5)

Unnamed: 0,Rank,Title,Studio,Gross,Year
0,1,Avengers: Endgame,Buena Vista,"$2,796.30",2019
1,2,Avatar,Fox,"$2,789.70",2009
2,3,Titanic,Paramount,"$2,187.50",1997
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018


In [7]:
# Get the last five rows of the data
data.tail(5)

Unnamed: 0,Rank,Title,Studio,Gross,Year
777,778,Yogi Bear,Warner Brothers,$201.60,2010
778,779,Garfield: The Movie,Fox,$200.80,2004
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
780,781,The Hunt for Red October,Paramount,$200.50,1990
781,782,Valkyrie,MGM,$200.30,2008


In [15]:
# Get number of rows and columns
nrow, ncol = data.shape
print(nrow)
print(ncol)

782
5


In [22]:
# Another way to find the number of rows is the following
len(data)

782

In [11]:
# Get total number of entries
data.size

3910

In [29]:
# Find the data types for the columns
data.dtypes

Rank       int64
Studio    object
Gross     object
Year       int64
dtype: object

In [17]:
# Get a row with a certain number meaning the numeric position of the row
data.iloc[778] # This does not return 778 but 779 because unlike Julia or R, Python is zero-based

Rank                      779
Title     Garfield: The Movie
Studio                    Fox
Gross                $200.80 
Year                     2004
Name: 778, dtype: object

In [27]:
# Get a certain row based on the index - for this we need to specify the index column 
data = pd.read_csv("Google Drive/My Drive/Dropbox/Courses/Pandas_in_action_Manning/pandas-in-action-master/chapter_01_introducing_pandas/movies.csv", index_col="Title")
data.loc['Garfield: The Movie']

Rank           779
Studio         Fox
Gross     $200.80 
Year          2004
Name: Garfield: The Movie, dtype: object

In [25]:
# Get a certain column
data[['Title','Studio']].head()

Unnamed: 0,Title,Studio
0,Avengers: Endgame,Buena Vista
1,Avatar,Fox
2,Titanic,Paramount
3,Star Wars: The Force Awakens,Buena Vista
4,Avengers: Infinity War,Buena Vista


In [30]:
# Subset the dataframe, get the movies where the studio was Universal
universal_studio_movies = data[data["Studio"] == "Universal"]

In [31]:
universal_studio_movies.head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic World,6,Universal,"$1,671.70",2015
Furious 7,8,Universal,"$1,516.00",2015
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
The Fate of the Furious,17,Universal,"$1,236.00",2017
Minions,19,Universal,"$1,159.40",2015


In [41]:
universal_studio_movies.sort_values(by = 'Gross', ascending = True)

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic Park,33,Universal,"$1,029.50",1993
Despicable Me 3,32,Universal,"$1,034.80",2017
Minions,19,Universal,"$1,159.40",2015
The Fate of the Furious,17,Universal,"$1,236.00",2017
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
...,...,...,...,...
Sing,134,Universal,$634.20,2016
Fast & Furious 6,86,Universal,$788.70,2013
E.T.: The Extra-Terrestrial,84,Universal,$792.90,1982
The Secret Life of Pets,63,Universal,$875.50,2016


In [45]:
# What if we want to sort by more than one columns?
universal_studio_movies.sort_values(by = ['Gross', 'Title'])

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic Park,33,Universal,"$1,029.50",1993
Despicable Me 3,32,Universal,"$1,034.80",2017
Minions,19,Universal,"$1,159.40",2015
The Fate of the Furious,17,Universal,"$1,236.00",2017
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
...,...,...,...,...
Sing,134,Universal,$634.20,2016
Fast & Furious 6,86,Universal,$788.70,2013
E.T.: The Extra-Terrestrial,84,Universal,$792.90,1982
The Secret Life of Pets,63,Universal,$875.50,2016


In [49]:
# Now what if we want to subset the dataframe based on multiple conditions?
# (e.g., movies by Universal released after 1994)

universal_studio = data['Studio'] == 'Universal'
released_after_1994 = data['Year'] > 1994
data[universal_studio & released_after_1994].head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic World,6,Universal,"$1,671.70",2015
Furious 7,8,Universal,"$1,516.00",2015
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
The Fate of the Furious,17,Universal,"$1,236.00",2017
Minions,19,Universal,"$1,159.40",2015


In [50]:
# If we wanted instead to find movies either from Universal or released after 1994
movies_either_from_universal_or_after_1994 = data[universal_studio | released_after_1994]

In [51]:
movies_either_from_universal_or_after_1994.head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avengers: Endgame,1,Buena Vista,"$2,796.30",2019
Avatar,2,Fox,"$2,789.70",2009
Titanic,3,Paramount,"$2,187.50",1997
Star Wars: The Force Awakens,4,Buena Vista,"$2,068.20",2015
Avengers: Infinity War,5,Buena Vista,"$2,048.40",2018
