# In this session, we learn how to use pandas DataFrames to perform relational algebra operations; in other words, how to manipulate tabular data with a DataFrame

## <font color="blue"> 1. Set Up For Our Lab</font>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Please download the "Movie.xlsx" Excel file to your working folder and read it
movie = pd.read_excel('Movie.xlsx')
movie

Unnamed: 0,MovieID,MovieName,Price,Genre
0,F1,Jurassic Park III,2.99,Action
1,F2,The Others,3.55,Horror
2,F3,Senseless,5.99,Comedy
3,F4,Dragon Heart,6.0,Drama
4,F5,7,5.0,Action
5,F6,America Sweetheart,0.59,Comedy


## <font color="blue"> 2. Selection </font>

In [4]:
# In relational algebra, we talk about how to apply certain selection criteria to select a subset of rows.
# This is done in pandas through "conditional selection" using bracket notation, very similar to numpy. 
# Example: Select all rows with a price higher than $3
# STEP 1: We need to perform the logical comparison first:
movie.Price > 3

0    False
1     True
2     True
3     True
4     True
5    False
Name: Price, dtype: bool

In [5]:
# STEP 2: We use this Boolean Series inside the brackets as the selection criterion
movie[movie.Price > 3]


Unnamed: 0,MovieID,MovieName,Price,Genre
1,F2,The Others,3.55,Horror
2,F3,Senseless,5.99,Comedy
3,F4,Dragon Heart,6.0,Drama
4,F5,7,5.0,Action


In [6]:
# Exercise 1: Select all movies with a price higher than $5 OR lower than $3
movie[(movie.Price < 3) | (movie.Price > 5)]

Unnamed: 0,MovieID,MovieName,Price,Genre
0,F1,Jurassic Park III,2.99,Action
2,F3,Senseless,5.99,Comedy
3,F4,Dragon Heart,6.0,Drama
5,F6,America Sweetheart,0.59,Comedy


In [7]:
# Exercise 2: Select all action movies
movie[(movie.Genre == "Action")]

Unnamed: 0,MovieID,MovieName,Price,Genre
0,F1,Jurassic Park III,2.99,Action
4,F5,7,5.0,Action


In [8]:
# Selecting rows based on multiple column conditions using the '&' operator.
# When we have multiple conditions, we combine them with the '&' logical operator:
# Select all action movies more expensive than $4
# 1. Action movies:
a = movie.Genre == 'Action'

In [9]:
#2. Greater than $4:
b = movie.Price > 4

In [10]:
#3. Now we need to merge the two with the "&" logical operator
a & b

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool

In [11]:
#4. Finally, return the row:
movie[a&b]

Unnamed: 0,MovieID,MovieName,Price,Genre
4,F5,7,5.0,Action


In [12]:
# Or, we can just do:
movie[(movie.Genre == 'Action') & (movie.Price > 4)]

Unnamed: 0,MovieID,MovieName,Price,Genre
4,F5,7,5.0,Action


## <font color="blue"> 3.  Projection </font>

In [13]:
# The "Projection" operator in relational algebra is simply choosing a column in Pandas
# Select the Price column:
movie.Price

0    2.99
1    3.55
2    5.99
3    6.00
4    5.00
5    0.59
Name: Price, dtype: float64

In [14]:
# What if I just want to select UNIQUE values? First, let me introduce a redundant value.
# Use label-based .loc so the assignment targets the Price column by name instead of
# by a positional column number that would silently hit another column if the
# column order ever changed. With the default integer index shown above, row label 4
# is the movie with MovieID F5.
movie.loc[4, 'Price'] = 5.99
movie

Unnamed: 0,MovieID,MovieName,Price,Genre
0,F1,Jurassic Park III,2.99,Action
1,F2,The Others,3.55,Horror
2,F3,Senseless,5.99,Comedy
3,F4,Dragon Heart,6.0,Drama
4,F5,7,5.99,Action
5,F6,America Sweetheart,0.59,Comedy


In [15]:
# Now we can use the "unique" method to choose unique values
movie.Price.unique()

array([2.99, 3.55, 5.99, 6.  , 0.59])

In [16]:
# Select "Price" and "Genre":
movie[['Price','Genre']]

Unnamed: 0,Price,Genre
0,2.99,Action
1,3.55,Horror
2,5.99,Comedy
3,6.0,Drama
4,5.99,Action
5,0.59,Comedy


In [None]:
# Of course, you can still use all the loc and iloc tricks we talked about before...

## <font color="blue"> 4.  Group By </font>

In [17]:
# The groupby method allows you to group rows of data together 
movie.groupby('Genre')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1616b8c70>

In [18]:
print(movie.groupby('Genre'))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16149c7f0>


In [19]:
# Notice that printing a GroupBy object only shows its repr, not the grouped rows --
# the object is the starting point for aggregation functions.
# Select the columns to aggregate explicitly: MovieName cannot be ordered by min/max
# (presumably because the title 7 is read as a number while the others are text), and
# text columns cannot be averaged. Older pandas versions silently dropped such columns
# with a warning; recent versions raise a TypeError instead.
# For example:
by_genre = movie.groupby('Genre')
print(by_genre[['MovieID', 'Price']].min())
print(by_genre[['MovieID', 'Price']].max())
print(by_genre.count())
print(by_genre[['Price']].mean())
print(by_genre[['Price']].std())

       MovieID  Price
Genre                
Action      F1   2.99
Comedy      F3   0.59
Drama       F4   6.00
Horror      F2   3.55
       MovieID  Price
Genre                
Action      F5   5.99
Comedy      F6   5.99
Drama       F4   6.00
Horror      F2   3.55
        MovieID  MovieName  Price
Genre                            
Action        2          2      2
Comedy        2          2      2
Drama         1          1      1
Horror        1          1      1
        Price
Genre        
Action   4.49
Comedy   3.29
Drama    6.00
Horror   3.55
           Price
Genre           
Action  2.121320
Comedy  3.818377
Drama        NaN
Horror       NaN


  print(movie.groupby('Genre').min())
  print(movie.groupby('Genre').max())


In [20]:
# Or, the do-it-all summary stats operator:
print(movie.groupby('Genre').describe())

       Price                                              
       count  mean       std   min   25%   50%   75%   max
Genre                                                     
Action   2.0  4.49  2.121320  2.99  3.74  4.49  5.24  5.99
Comedy   2.0  3.29  3.818377  0.59  1.94  3.29  4.64  5.99
Drama    1.0  6.00       NaN  6.00  6.00  6.00  6.00  6.00
Horror   1.0  3.55       NaN  3.55  3.55  3.55  3.55  3.55
