# Chapter 1: Pandas Foundations

In [65]:
import pandas as pd
import numpy as np

In [66]:
# Limit how much of each DataFrame is rendered (8 columns, 10 rows).
# Fully qualified option names are used because the short forms 'max_columns' and
# 'max_rows' also match the 'styler.render.*' options in newer pandas releases,
# which makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)

In [67]:
# Load the movie data set from the local data directory and preview its first rows
movie = pd.read_csv('data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


# Accessing the main DataFrame components
## Index, columns, and values

In [14]:
# Pull out the three main components of the DataFrame:
# the column labels, the row labels (index), and the underlying data
columns = movie.columns
index = movie.index
data = movie.values

In [15]:
# Display the column labels
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [16]:
# Display the row labels
index

RangeIndex(start=0, stop=4916, step=1)

In [17]:
# Display the underlying data
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [18]:
# Python type of the column labels object
type(columns)

pandas.core.indexes.base.Index

In [19]:
# Python type of the row labels object
type(index)

pandas.core.indexes.range.RangeIndex

In [20]:
# Python type of the underlying data
type(data)

numpy.ndarray

In [21]:
# Let's look at what's stored in 'index' and 'columns'
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915], dtype=int64)

In [22]:
# The column labels as a plain array
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

# Understanding data types (of DataFrame columns)

In [23]:
# Use the .dtypes attribute of the movie DataFrame to see the data type of each column
# Note: dtypes is plural, and it is an attribute (no parentheses), not a method
movie.dtypes

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

In [24]:
# We can also get a summary of how many dtypes there are in the DataFrame
# and how many columns are of each dtype.

# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# counting the values of the .dtypes Series is the supported replacement.
# Note: value_counts() orders the result by count rather than by dtype name.

movie.dtypes.value_counts()

float64    13
int64       3
object     12
dtype: int64

# Creating a single series -- by selecting a column of dataFrame

In [25]:
# Selecting a single column of the DataFrame by column name
movie['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [26]:
# Selecting a single column of the DataFrame with attribute (dot) access

movie.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [27]:
# Check the type of the extracted column
type(movie['director_name'])

pandas.core.series.Series

In [28]:
# The extracted Series can be assigned to a variable, and the name of the
# original DataFrame column can be retrieved through its .name attribute

director = movie.director_name
director.name

'director_name'

In [29]:
# The extracted Series can also be converted to a single-column DataFrame
#    with .to_frame(); .head() is chained on to show only the first rows

director.to_frame().head()



Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


# Calling Series Methods

In [30]:
# DataFrames and Series each have over 400 attributes and methods.
# A large number of these attributes and methods (more than 370) are common to both.
# Use dir() to list them [e.g. dir(pd.Series) and dir(pd.DataFrame)].

director = movie['director_name']
actor_1_facebook_likes = movie['actor_1_facebook_likes']


In [31]:
# Preview the first rows of the director Series
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [32]:
# Preview the first rows of the actor_1_facebook_likes Series
actor_1_facebook_likes.head()


0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [33]:
# The value_counts() method gives the frequency (count) of each distinct value in a Series
# How useful it is depends on the nature of the variable -- it suits categorical variables
# Steven Spielberg directed 26 movies; Woody Allen, 22 (including the comedy 'Take the Money and Run')

director.value_counts()

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Spike Lee           16
                    ..
Jerry Belson         1
Alan Metter          1
Vic Armstrong        1
Ethan Maniquis       1
Ossie Davis          1
Name: director_name, Length: 2397, dtype: int64

In [34]:
# value_counts() may be less interesting for other variables, such as this numeric one

actor_1_facebook_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
362.0        1
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [35]:
# The number of elements in a Series is available through the size and shape attributes

director.size

4916

In [36]:
# shape reports the same element count as a one-element tuple
director.shape

(4916,)

In [37]:
# The number of values in a Series may also be obtained with the built-in len() function

len(director)

4916

In [38]:
# The count() method returns the number of non-missing values in a Series

director.count()

4814

In [39]:
# Number of non-missing values in actor_1_facebook_likes
actor_1_facebook_likes.count()

4909

In [40]:
# Basic summary statistics, collected into one tuple in this order:
# min, max, mean, median, standard deviation, sum
(
    actor_1_facebook_likes.min(),
    actor_1_facebook_likes.max(),
    actor_1_facebook_likes.mean(),
    actor_1_facebook_likes.median(),
    actor_1_facebook_likes.std(),
    actor_1_facebook_likes.sum(),
)

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)

In [41]:
# Alternatively, the .describe() method returns the descriptive statistics in one call
actor_1_facebook_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [42]:
# The quantile() method provides exact quantiles of numeric data

actor_1_facebook_likes.quantile(.2)

510.0

In [43]:
# Passing a list of quantile positions to quantile() returns a Series of quantiles

actor_1_facebook_likes.quantile([.1, .2, .25, .4, .5, .6, .75, .8, .9] )

0.10      240.0
0.20      510.0
0.25      607.0
0.40      854.0
0.50      982.0
0.60     1000.0
0.75    11000.0
0.80    13000.0
0.90    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [44]:
# Missing values can be identified with the isnull() method (True marks a missing value)

director.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [45]:
# Missing values in the Series can be filled with zeros using the .fillna() method;
# the non-missing count of the filled Series is then displayed

actor_1_facebook_likes_filled = actor_1_facebook_likes.fillna(0)
actor_1_facebook_likes_filled.count()

4916

In [46]:
# To remove the Series elements with missing values, use the .dropna() method
actor_1_facebook_likes_dropped = actor_1_facebook_likes.dropna()
actor_1_facebook_likes_dropped.count()

4909

## Getting relative frequencies --- using value_counts(normalize=True)

In [47]:
# normalize=True returns relative frequencies instead of raw counts
director.value_counts(normalize=True)

Steven Spielberg    0.005401
Woody Allen         0.004570
Clint Eastwood      0.004155
Martin Scorsese     0.004155
Spike Lee           0.003324
                      ...   
Jerry Belson        0.000208
Alan Metter         0.000208
Vic Armstrong       0.000208
Ethan Maniquis      0.000208
Ossie Davis         0.000208
Name: director_name, Length: 2397, dtype: float64

In [48]:
# Check whether a Series contains any missing values with the .hasnans attribute

director.hasnans

True

In [49]:
# Like the isnull() method, there is a notnull() method, which returns True for all non-missing values

director.notnull()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

# Working with operators on a Series

In [50]:
# Operators are not objects themselves; rather, they force an operation to occur on objects
# Consider the arithmetic operators
5 + 9       # returns 14
4 ** 2      # returns 16
a = 10      # assigns the value 10 to a


5 < 9      # returns True (only this last expression is displayed)


True

In [51]:
# Operators can work on any type of object, not just numerical data

'abcde' + 'fg'   # returns 'abcdefg'
not (5 < 9)      # returns False
7 in [1, 2, 6]   # returns False

False

In [52]:
# Intersection of two sets with the & operator: the elements common to both, i.e. {2, 3}

{1, 2, 3} & {2, 3, 4}

{2, 3}

In [55]:
# Not all operators are implemented for every object.
# Using an unsupported operator raises a TypeError. The error is caught and
# printed here so the demonstration does not stop a "Run All" of the notebook.

try:
    [1, 2, 3] - 3
except TypeError as exc:
    list_minus_int_error = 'TypeError: {}'.format(exc)
    print(list_minus_int_error)



TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [56]:
# Sets do not support indexing with [], so a[0] raises a TypeError.
# The error is caught and printed so the rest of the notebook can still run.
a = set([1,2,3])
try:
    a[0]
except TypeError as exc:
    set_index_error = 'TypeError: {}'.format(exc)
    print(set_index_error)

TypeError: 'set' object does not support indexing

# Applying operators on Series to get new Series

In [57]:
# Select imdb_score (a measure of user ratings of films) as a Series from the movie data
movie = pd.read_csv('data/movie.csv')
imdb_score = movie['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [58]:
# Use the plus operator to add 1 to every value in the Series
imdb_score + 1


0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [59]:
# Multiply every value in the Series by 2.5
imdb_score*2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [60]:
# Floor division
## The result is rounded down to the nearest whole number (the floor)
## For this float Series the result keeps the float64 dtype (e.g. 1.0, 0.0)
###  So if the answer (after division) is 2.33, floor division returns 2
###  On the other hand, if the answer is -2.33, floor division returns -3 (i.e. the floor)

imdb_score// 7


0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [61]:
# Comparison operators (>, <, >=, <=, ==)
## Each value in the Series is turned into True or False depending on the outcome of the comparison

imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

# Chaining the Series methods together

In [69]:
# Select two Series from the movie data set.
# A forward slash is used in the path: it is portable across operating systems
# and avoids the invalid '\m' escape sequence of the backslash form.

movie = pd.read_csv('data/movie.csv')
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [70]:
# Appending the head() method at the end of the chain shows only the first counts

director.value_counts().head()

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Spike Lee           16
Name: director_name, dtype: int64

In [73]:
# Count the missing values by chaining .sum() after .isnull()
actor_1_fb_likes.isnull().sum()


7

In [74]:
# We need to fill the missing values with zeros, but first note:
# although Facebook likes are integers, the missing values in the Series
# force its dtype to float64.
#
# After filling, the float64 values can be converted back to integers
# with the .astype(int) method.
actor_1_fb_likes.dtype


dtype('float64')

In [76]:
# Fill missing values with 0, then chain .astype(int) and .head() after fillna()
# (the recorded output below shows the resulting dtype as int32)

actor_1_fb_likes.fillna(0).astype(int).head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [78]:
# Ending the chain with mean() instead of sum() gives the fraction of missing values
# (a proportion between 0 and 1; multiply by 100 for a percentage)

actor_1_fb_likes.isnull().mean()

0.0014239218877135883

In [79]:
# Parentheses can be used instead of backslashes to spread a chain over several lines
# Open the parenthesis first, then put each chained method on its own line

(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32