# Pandas

In [20]:
import pandas as pd
import numpy as np

In [2]:
from pandas import Series, DataFrame

### Series

In [3]:
# A Series built from a bare list; no labels are given, so pandas supplies
# the default integer index.
obj = pd.Series(data=[4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# .values returns the Series data as a NumPy array.
obj.values

array([ 4,  7, -5,  3])

In [6]:
# No labels were supplied at construction, so the index is a RangeIndex.
obj.index


RangeIndex(start=0, stop=4, step=1)

Create Series from a dict

In [7]:
# State name -> number mapping used to build a labelled Series.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}

In [8]:
# The dict keys become the index labels and the dict values become the data.
obj2 = pd.Series(data=sdata)

In [9]:
obj2

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

### DataFrame

There are many ways to construct a DataFrame, though one of the most common is from a dict of equal-length lists or NumPy arrays.

In [10]:
# Column-oriented input: each key is a column name and each list holds that
# column's six values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

In [11]:
# Each dict key becomes a column; rows get the default integer index.
frame = pd.DataFrame(data=data)

In [12]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [13]:
# `columns` fixes the order in which the dict's columns appear.
pd.DataFrame(
    data,
    columns=["year", "state", "pop"],
)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [16]:
# "debt" is not a key of `data`, so that column is created holding only
# missing values; `index` supplies the row labels.
pd.DataFrame(
    data,
    columns=["year", "state", "pop", "debt"],
    index=["one", "two", "three", "four", "five", "six"],
)

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute.

In [17]:
# Dict-like notation: index the frame with the column name to get a Series.
frame["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [18]:
# Attribute-style access: the column is read like an attribute of the frame.
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

Columns can be modified by assignment

In [19]:
# A scalar assigned to a column is broadcast to every row; "debt" does not
# exist yet, so the assignment creates it.
frame["debt"] = 16.5
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,16.5
1,Ohio,2001,1.7,16.5
2,Ohio,2002,3.6,16.5
3,Nevada,2001,2.4,16.5
4,Nevada,2002,2.9,16.5
5,Nevada,2003,3.2,16.5


In [22]:
# Overwrite "debt" with one float per row: 0.0 through 5.0.
frame["debt"] = np.arange(6, dtype=float)
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,1.0
2,Ohio,2002,3.6,2.0
3,Nevada,2001,2.4,3.0
4,Nevada,2002,2.9,4.0
5,Nevada,2003,3.2,5.0


### Reindexing

An important method on pandas objects is reindex, which means to create a new object with the data conformed to a new index.

In [24]:
# String values on a sparse integer index (0, 2, 4); the gaps are what the
# reindex in the next cell fills.
obj3 = pd.Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
obj3

0      blue
2    purple
4    yellow
dtype: object

In [25]:
# Conform to labels 0..5; method="ffill" carries the last seen value forward
# into the labels that were missing (1, 3, 5).
obj3.reindex(index=range(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [30]:
# Column labels to conform to in the next cell: "Utah" is not a column of
# the frame and "Ohio" is left out.
states = ["Texas", "Utah", "California"]

# 3x3 frame of the integers 0..8 with non-contiguous row labels.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [32]:
# Only the requested columns are kept: "Utah" is new and comes back as
# missing values, while "Ohio" is not requested and is dropped.
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


### Arithmetic and Data Alignment

When you add objects together, labels that do not appear in both operands are still kept: the index of the result is the union of the two indexes, and any label present in only one operand gets a missing value (NaN).

In [34]:
# Two float Series whose labels only partly overlap ('a', 'c' and 'e' are
# shared; 'd' is only in s1, 'f' and 'g' only in s2).
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

In [35]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [36]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [38]:
# Values are added label by label; labels found in only one operand
# ('d', 'f', 'g') yield NaN.
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

### Arithmetic methods with fill values

In [39]:
# fill_value=0 stands in for a label missing from one operand, so 'd', 'f'
# and 'g' keep the value from the operand that has them instead of NaN.
s1.add(s2, fill_value=0)

a    5.2
c    1.1
d    3.4
e    0.0
f    4.0
g    3.1
dtype: float64

### Operations between DataFrame and Series

In [40]:
# 4x3 frame of the floats 0.0..11.0, one row per state.
frame = pd.DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    columns=['b', 'd', 'e'],
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

In [43]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [44]:
# First row selected by position; the resulting Series is indexed by the
# frame's column labels and named after the row label ("Utah").
series = frame.iloc[0]

In [46]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

### Function Application and Mapping

In [47]:
# Seed the global NumPy generator so that Restart & Run All reproduces the
# same frame. The stored outputs of this cell and of the cells derived from
# it came from an unseeded run and change once the cells are re-executed.
np.random.seed(0)
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
frame

Unnamed: 0,b,d,e
Utah,1.69441,1.253201,-1.805324
Ohio,-0.367105,2.743689,-0.710477
Texas,0.80849,0.970935,-0.021369
Oregon,-0.449928,0.048726,1.602714


In [48]:
# The NumPy ufunc is applied element-wise; the result is still a DataFrame
# carrying the same row and column labels.
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.69441,1.253201,1.805324
Ohio,0.367105,2.743689,0.710477
Texas,0.80849,0.970935,0.021369
Oregon,0.449928,0.048726,1.602714


In [49]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in ``x``.

    ``x`` is any object exposing ``max()`` and ``min()``, such as a Series.
    """
    return x.max() - x.min()

In [50]:
# f is called once per column here, so the result is a Series indexed by
# the column labels (b, d, e).
frame.apply(f)

b    2.144338
d    2.694963
e    3.408038
dtype: float64

### Sorting and Ranking

In [51]:
# Values 0..3 under deliberately unsorted labels, to demonstrate sort_index.
obj = pd.Series(data=range(4), index=list("dabc"))

In [52]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [53]:
# Orders the entries by index label (a, b, c, d); values travel with their labels.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [54]:
# 2x4 frame of the integers 0..7 with unsorted row and column labels.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=["three", "one"],
    columns=list("dabc"),
)

In [55]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [56]:
# With no arguments the rows are ordered by their labels ("one" before "three").
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [57]:
# axis=1 orders the column labels (a, b, c, d) and leaves the row order alone.
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


The data is sorted in ascending order by default, but can be sorted in descending order, too.

In [58]:
# ascending=False reverses the column order to d, c, b, a.
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


To sort a Series by its values, use its **sort_values** method.

In [60]:
# Unsorted integers on the default index, to demonstrate sort_values.
obj = pd.Series(data=[4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [61]:
# Orders the entries by value, ascending; each value keeps its original index label.
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

#### Ranking

In [62]:
# Contains two ties (7 twice, 4 twice) so that tie-breaking shows up in the ranks.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [63]:
# Default tie-breaking: tied values share the mean of their ranks
# (both 7s get 6.5, both 4s get 4.5).
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranks can also be assigned according to the order in which the values are observed in the data.

In [64]:
# method="first" breaks ties by order of appearance: the 7 at label 0 gets
# rank 6 and the 7 at label 2 gets rank 7.
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Here, rather than sharing the average rank 6.5, the tied entries at labels 0 and 2 are assigned ranks 6 and 7, because label 0 precedes label 2 in the data.

### Unique Values, Value Counts, and Membership

In [67]:
# Nine single-letter strings with repeats, used for unique / value_counts.
obj = pd.Series(data=list("cadaabbcc"))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [69]:
# unique() returns the distinct values as an array, in order of first
# appearance (c, a, d, b) rather than sorted.
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [70]:
# Tallies how often each distinct value occurs, most frequent first.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [71]:
# The top-level pd.value_counts function is deprecated (pandas 2.1+), so the
# raw array is wrapped in a Series and the method is called instead.
# sort=False skips the ordering by count.
pd.Series(obj.values).value_counts(sort=False)

c    3
d    1
b    2
a    3
dtype: int64