# Installing and Using Pandas

In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
#import numpy as np
import pandas as pd
pd.__version__

'1.2.4'

In [3]:
data = pd.Series([.25,.5,.75,1])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
data.index


RangeIndex(start=0, stop=4, step=1)

In [6]:
data[2]

0.75

In [7]:
data[0:2]

0    0.25
1    0.50
dtype: float64

# Series as generalized NumPy array

In [8]:
data = pd.Series([.25,.5,.75,1],
                index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
data['b']

0.5

In [10]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [11]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [12]:
population['Texas':'Florida']

Texas       26448193
New York    19651127
Florida     19552860
dtype: int64

# The Pandas DataFrame Object

In [13]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
area=pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [14]:
states=pd.DataFrame({'population':population,
                    'area':area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [15]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [16]:
states.columns

Index(['population', 'area'], dtype='object')

In [17]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [18]:
#From a single Series object
pd.DataFrame(population,columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [19]:
#From a list of dicts
data = [{'a': i, 'b': 2 * i}  
for i in range(4)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4
3,3,6


In [20]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [21]:
# From a dictionary of Series objects
pd.DataFrame({'population': population, 'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [22]:
%who_ls

['area', 'area_dict', 'data', 'pd', 'population', 'population_dict', 'states']

In [23]:
# From a two-dimensional NumPy array
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

NameError: name 'np' is not defined

# Ufuncs: Operations Between DataFrame and Series


In [2]:
import numpy as np
import pandas as pd

In [None]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

In [None]:
A=rng.randint(10, size=(3, 4))
A

In [None]:
A-A[0]

In [None]:
df = pd.DataFrame(A,columns=list('QRST'))
df-df.iloc[0]

In [None]:
halfrow=df.loc[0,::2]
halfrow

In [None]:
df-halfrow

In [None]:
df

In [None]:
halfrow

# Handling Missing Data

In [None]:
vals1=np.array([1,None,3,4])
vals1

In [None]:
for dtype in ['object','int']:
    print("dtype=",dtype)
    %timeit np.arange(1E6,dtype=dtype).sum()
    print()

In [None]:
vals1.sum()

In [None]:
vals2=np.array([1,np.nan,3,4])
vals2.dtype

In [None]:
np.nansum(vals2),np.nanmin(vals2)

In [None]:
pd.Series([1,np.nan,2,None])

In [None]:
x=pd.Series(range(2),dtype=int)
x

In [None]:
data=pd.Series([1,np.nan,'hello',None])
data.isnull()

In [None]:
data=pd.Series(vals2)
data.dropna()

In [None]:
df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
df

In [None]:
df.dropna(axis='columns')

In [None]:
df[3]=np.nan
df

In [None]:
df.dropna(axis='columns',how='all')

In [None]:
df.dropna(axis='rows',thresh=2)

# Filling null values

In [None]:
data = pd.Series([1, np.nan, 2, None, 6,None], index=list('abcdef'))
data

In [None]:
data.fillna(100)

In [None]:
data.fillna(method='ffill')

In [None]:
data.fillna(method='bfill')

In [None]:
df

In [None]:
df.fillna(method='ffill',axis=1)

# Hierarchical Indexing

In [24]:
import pandas as pd
import numpy as np

In [25]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [26]:
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [27]:
pop[[i for i in pop.index if i[1]==2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [28]:
index=pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [29]:
pop=pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [30]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

## MultiIndex as extra dimension

In [31]:
pop_df=pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [32]:
pop_df.stack('')

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [33]:
pop_df=pd.DataFrame({'total':pop,'under18':[9267089, 9284094,4687374, 4318033,5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [36]:
f_u18=pop_df['under18']/pop_df['total']*100
f_u18.unstack()

Unnamed: 0,2000,2010
California,27.359428,24.921096
New York,24.700997,22.283054
Texas,28.32511,27.356773


# Combining Datasets: Concat and Append

In [1]:
import pandas as pd
import numpy as np

In [2]:
def make_df(cols, ind):
    """"""
    data={c:[str(c)+str(i) for i in ind]
         for c in cols}
    return pd.DataFrame(data,ind)

make_df('ABC',range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


# Recall: Concatenation of NumPy Arrays

In [9]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
type(x)
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

# Simple Concatenation with pd.concat

In [None]:
# Signature in Pandas v0.18
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)

In [13]:
ser1=pd.Series(['A','B','C'],index=[1,2,3])
ser2=pd.Series(['D','E','F'],index=[4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [15]:
df1=make_df('AB', [1,2])
df2=make_df('AB', [3,4])
print(df1);print(df2);print(pd.concat([df1,df2]))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [19]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3); print(df4); print(pd.concat([df3, df4], axis=1))

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
