In [3]:
#==================================
#Relational Algebra
#==================================
# Operations
#==================================
# - Projection and restriction
# - Set operations (union, difference, intersection)
# - Cartesian product
# - Join
# - Aggregation
#==================================


import numpy as np
import pandas as pd
# A Series built from a plain list gets a default integer index (0, 1, 2, ...)
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
print("data looks like a numpy array: ", data)

# Passing `index` attaches our own labels to the same values
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print("data looks like a Python dict: ", data)

# Values can then be looked up by label, just like a dict key
print(data['b'])

# A Series can also be created straight from a dict: keys become the index
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

population = pd.Series(population_dict)
area = pd.Series(area_dict)
print(population)
# What do you think of this line?
# (Hint: the slice uses index labels, and the end label is included.)
print(population['California':'Florida'])


data looks like a numpy array:  0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
data looks like a Python dict:  a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64


In [9]:
# From a single Series: its index becomes the row labels and the
# values fill the one column we name here
df = pd.DataFrame(data=population, columns=['population'])
print(df)

# From a list of dicts: one row per dict, one column per key
data = [dict(a=i, b=2 * i + 1) for i in range(3)]
df = pd.DataFrame(data)
print("---------")
print(df)

# From several Series: each dict key names a column
df = pd.DataFrame(dict(population=population, area=area))
print(df)

# From a 2-dimensional NumPy array, with explicit row and column labels
df = pd.DataFrame(
    np.random.rand(3, 2),
    index=list('abc'),
    columns=['foo', 'bar'],
)
print(df)

# A function to easily generate DataFrames. It will be very 
# useful in the rest of this chapter.
def make_df(cols, ind):
    """Quickly create a DataFrame filled with labelled placeholder strings.

    Every cell holds the column label followed by the row label, so
    ``make_df('AB', range(2))`` produces the rows ``A0 B0`` and ``A1 B1``.

    Parameters
    ----------
    cols : iterable
        Column labels. Iterating a string yields one column per character.
    ind : iterable
        Row labels; also used as the index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``ind`` and one column per element of ``cols``.
    """
    # `ind` is walked once per column and once more for the index. A one-shot
    # iterator (e.g. a generator) would be exhausted after the first column,
    # so materialize it up front. Re-iterable inputs such as `range` or a list
    # are passed through untouched, keeping the resulting index type as before.
    if hasattr(ind, '__next__'):
        ind = list(ind)
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, index=ind)
    
# Example: three columns (A, B, C) by three rows labelled 0-2
make_df(cols='ABC', ind=range(3))

            population
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
---------
   a  b
0  0  1
1  1  3
2  2  5
            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
        foo       bar
a  0.064676  0.495723
b  0.646031  0.222895
c  0.932614  0.605797


Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [13]:
# Same helper, this time with four columns and four rows
make_df(cols='WXYZ', ind=range(4))
# Variation to try: make_df('ABCD', range(4))

Unnamed: 0,W,X,Y,Z
0,W0,X0,Y0,Z0
1,W1,X1,Y1,Z1
2,W2,X2,Y2,Z2
3,W3,X3,Y3,Z3
