In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("/Volumes/GoogleDrive/My Drive/Dropbox/Courses/Pandas_in_action_Manning/pandas-in-action-master/chapter_01_introducing_pandas/movies.csv")

In [3]:
# Get the first five rows of the data
data.head(5)

Unnamed: 0,Rank,Title,Studio,Gross,Year
0,1,Avengers: Endgame,Buena Vista,"$2,796.30",2019
1,2,Avatar,Fox,"$2,789.70",2009
2,3,Titanic,Paramount,"$2,187.50",1997
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018


In [4]:
# Get the last five rows of the data
data.tail(5)

Unnamed: 0,Rank,Title,Studio,Gross,Year
777,778,Yogi Bear,Warner Brothers,$201.60,2010
778,779,Garfield: The Movie,Fox,$200.80,2004
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
780,781,The Hunt for Red October,Paramount,$200.50,1990
781,782,Valkyrie,MGM,$200.30,2008


In [5]:
# Get number of rows and columns
nrow, ncol = data.shape
print(nrow)
print(ncol)

782
5


In [6]:
# Another way to find the number of rows is the following
len(data)

782

In [7]:
# Get total number of entries
data.size

3910

In [8]:
# Find the data types for the columns
data.dtypes

Rank       int64
Title     object
Studio    object
Gross     object
Year       int64
dtype: object

In [9]:
# Get a row with a certain number meaning the numeric position of the row
data.iloc[778] # This does not return 778 but 779 because unlike Julia or R, Python is zero-based

Rank                      779
Title     Garfield: The Movie
Studio                    Fox
Gross                $200.80 
Year                     2004
Name: 778, dtype: object

In [10]:
# Get a certain row based on the index - for this we need to specify the index column 
data = pd.read_csv("/Volumes/GoogleDrive/My Drive/Dropbox/Courses/Pandas_in_action_Manning/pandas-in-action-master/chapter_01_introducing_pandas/movies.csv", index_col="Title")
data.loc['Garfield: The Movie']

Rank           779
Studio         Fox
Gross     $200.80 
Year          2004
Name: Garfield: The Movie, dtype: object

In [11]:
# Get a certain column
data.head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avengers: Endgame,1,Buena Vista,"$2,796.30",2019
Avatar,2,Fox,"$2,789.70",2009
Titanic,3,Paramount,"$2,187.50",1997
Star Wars: The Force Awakens,4,Buena Vista,"$2,068.20",2015
Avengers: Infinity War,5,Buena Vista,"$2,048.40",2018


In [12]:
# Subset the dataframe, get the movies where the studio was Universal
universal_studio_movies = data[data["Studio"] == "Universal"]

In [13]:
universal_studio_movies.head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic World,6,Universal,"$1,671.70",2015
Furious 7,8,Universal,"$1,516.00",2015
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
The Fate of the Furious,17,Universal,"$1,236.00",2017
Minions,19,Universal,"$1,159.40",2015


In [14]:
universal_studio_movies.sort_values(by = 'Gross', ascending = True)

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic Park,33,Universal,"$1,029.50",1993
Despicable Me 3,32,Universal,"$1,034.80",2017
Minions,19,Universal,"$1,159.40",2015
The Fate of the Furious,17,Universal,"$1,236.00",2017
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
...,...,...,...,...
Sing,134,Universal,$634.20,2016
Fast & Furious 6,86,Universal,$788.70,2013
E.T.: The Extra-Terrestrial,84,Universal,$792.90,1982
The Secret Life of Pets,63,Universal,$875.50,2016


In [15]:
# What if we want to sort by more than one columns?
universal_studio_movies.sort_values(by = ['Gross', 'Title'])

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic Park,33,Universal,"$1,029.50",1993
Despicable Me 3,32,Universal,"$1,034.80",2017
Minions,19,Universal,"$1,159.40",2015
The Fate of the Furious,17,Universal,"$1,236.00",2017
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
...,...,...,...,...
Sing,134,Universal,$634.20,2016
Fast & Furious 6,86,Universal,$788.70,2013
E.T.: The Extra-Terrestrial,84,Universal,$792.90,1982
The Secret Life of Pets,63,Universal,$875.50,2016


In [16]:
# Now what if we want to subset the dataframe based on multiple conditions?
# (e.g., movies by Universal released after 1994)

universal_studio = data['Studio'] == 'Universal'
released_after_1994 = data['Year'] > 1994
data[universal_studio & released_after_1994].head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jurassic World,6,Universal,"$1,671.70",2015
Furious 7,8,Universal,"$1,516.00",2015
Jurassic World: Fallen Kingdom,13,Universal,"$1,309.50",2018
The Fate of the Furious,17,Universal,"$1,236.00",2017
Minions,19,Universal,"$1,159.40",2015


In [17]:
# If we wanted instead to find movies either from Universal or released after 1994
movies_either_from_universal_or_after_1994 = data[universal_studio | released_after_1994]

In [18]:
movies_either_from_universal_or_after_1994.head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avengers: Endgame,1,Buena Vista,"$2,796.30",2019
Avatar,2,Fox,"$2,789.70",2009
Titanic,3,Paramount,"$2,187.50",1997
Star Wars: The Force Awakens,4,Buena Vista,"$2,068.20",2015
Avengers: Infinity War,5,Buena Vista,"$2,048.40",2018


In [19]:
# We can sort by index which happens to be the title in this case
data.sort_index().head()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"10,000 B.C.",536,Warner Brothers,$269.80,2008
101 Dalmatians,708,Buena Vista,$215.90,1961
101 Dalmatians,425,Buena Vista,$320.70,1996
2 Fast 2 Furious,632,Universal,$236.40,2003
2012,93,Sony,$769.70,2009


In [20]:
data.sort_index().tail()

Unnamed: 0_level_0,Rank,Studio,Gross,Year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yogi Bear,778,Warner Brothers,$201.60,2010
You've Got Mail,582,Warner Brothers,$250.80,1998
Your Name.,356,FUN,$358.00,2017
Zootopia,37,Buena Vista,"$1,023.80",2016
xXx: The Return of Xander Cage,385,Paramount,$346.10,2017


In [21]:
# Select values in a range (e.g. 1980-1989 for year)
eighties = data['Year'].between(1980, 1989)
data[eighties].head()
data[eighties]['Year'].unique()

array([1982, 1980, 1983, 1989, 1981, 1985, 1986, 1988, 1984, 1987])

## Series

In [22]:
# Import libraries
import pandas as pd
import numpy as np

In [23]:
# Create a Series with values from an array
ice_cream_flavor = ["Chocolate", "Banana", "Vanilla"]

In [24]:
type(ice_cream_flavor)

list

In [25]:
# Create a series with ice cream flavors
ice_cream_series = pd.Series(ice_cream_flavor)

In [26]:
type(ice_cream_series)

pandas.core.series.Series

In [27]:
ice_cream_series

0    Chocolate
1       Banana
2      Vanilla
dtype: object

In [28]:
# Subset series
ice_cream_series[0]

'Chocolate'

In [30]:
# Reverse the order of elements - same as in the case of an array
ice_cream_series[::-1]

2      Vanilla
1       Banana
0    Chocolate
dtype: object

In [32]:
numbers = pd.Series([1, 2, 3, 4, 5], dtype = "float")
numbers

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [36]:
# Non-available (missing) values in a Series
numbers = pd.Series([1, 2, 3, np.nan, 5])

In [37]:
numbers

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [38]:
# Create a series from a dictionary
calorie_info = {
             "Cereal": 125,
             "Chocolate Bar": 406,
             "Ice Cream Sundae": 342,
         }

In [39]:
calories = pd.Series(calorie_info)

In [40]:
calories

Cereal              125
Chocolate Bar       406
Ice Cream Sundae    342
dtype: int64

In [41]:
calories.values

array([125, 406, 342])

In [42]:
calories.index

Index(['Cereal', 'Chocolate Bar', 'Ice Cream Sundae'], dtype='object')

In [44]:
print(calories.size)
print(calories.shape)

3
(3,)


In [45]:
numbers_to_100 = pd.Series([i for i in range(1,101,1)])

In [46]:
numbers_to_100

0       1
1       2
2       3
3       4
4       5
     ... 
95     96
96     97
97     98
98     99
99    100
Length: 100, dtype: int64

In [47]:
np.sum(numbers_to_100)

5050

In [49]:
# Some simple series methods
print(numbers_to_100.head())
print(numbers_to_100.tail())

0    1
1    2
2    3
3    4
4    5
dtype: int64
95     96
96     97
97     98
98     99
99    100
dtype: int64


In [50]:
# Statistical operations on series
numbers_to_100.count()

100

In [51]:
numbers_to_100.sum()

5050

In [52]:
numbers_to_100.sum(skipna=False)

5050

In [57]:
# Calculate the product of numbers from 1 to 100
numbers_to_10 = pd.Series([i for i in range(1,11,1)])
print(numbers_to_10)
print(numbers_to_10.product())

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64
3628800


In [54]:
numbers_to_100

0       1
1       2
2       3
3       4
4       5
     ... 
95     96
96     97
97     98
98     99
99    100
Length: 100, dtype: int64

In [59]:
# Calculate the mean
numbers_to_10.mean()

5.5

In [60]:
# Calculate the median
numbers_to_10.median()

5.5

In [61]:
# Find the minimum value
numbers_to_10.min()

1

In [62]:
# Find the maximum value
numbers_to_10.max()

10

In [63]:
# To collectively describe the series
numbers_to_10.describe()

count    10.00000
mean      5.50000
std       3.02765
min       1.00000
25%       3.25000
50%       5.50000
75%       7.75000
max      10.00000
dtype: float64

In [64]:
# Find the unique values in the Series
numbers_to_10.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [65]:
# Find the number of unique values in the series
numbers_to_10.nunique()

10

In [67]:
# Return the number of rows
len(numbers_to_10)

10

In [68]:
# Convert the Series object into a list
list(numbers_to_10)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [69]:
# Convert the Series into a dictionary
dict(numbers_to_10)

{0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10}