# Pandas - Series / DataFrame -

## Series in Pandas

In [4]:
import pandas as pd
obj = pd.Series([4, 7, -5, 3])

print(obj)
print(obj.values)
print(obj.index)

obj2 = pd.Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
obj2

0    4
1    7
2   -5
3    3
dtype: int64
[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


d    4
b    7
a   -5
c    3
dtype: int64

## Series: construction

In [5]:
sdata = {'Ohio': 35000, 'Texas':71000, 'Oregon':16000, 'Utah':500}
obj3 = pd.Series(sdata)
print(obj3)


print('-_-_-\n')

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index = states)
obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah        500
dtype: int64
-_-_-



California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

## Series: data retrieval

In [6]:
obj2 = pd.Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
print(obj2)

print(obj2['a'])
print(obj2[2])

obj2['d'] = 6
print(obj2)

d    4
b    7
a   -5
c    3
dtype: int64
-5
-5
d    6
b    7
a   -5
c    3
dtype: int64


In [7]:
print(obj2[['c', 'a', 'd']])
obj2[0:2]

c    3
a   -5
d    6
dtype: int64


d    6
b    7
dtype: int64

## Series: arithmetic oerations

In [8]:
obj2[obj2 >0]

d    6
b    7
c    3
dtype: int64

In [9]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [11]:
import numpy as np
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [12]:
print('b' in obj2)
print('e' in obj2)

True
False


In [14]:
print(obj3)
print(obj4)

print(obj3 + obj4)

Ohio      35000
Texas     71000
Oregon    16000
Utah        500
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


## Dataframe: construction

In [15]:
data = [[1.4, 2.5],
       [7.1, -4.5],
       [3.4, np.nan],
       [0.75, -1.3]]

df = pd.DataFrame(data,
                 index = ['a', 'b', 'c', 'd'],
                 columns = ['one', 'two'])

df

Unnamed: 0,one,two
a,1.4,2.5
b,7.1,-4.5
c,3.4,
d,0.75,-1.3


In [16]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year':[2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [17]:
frame2 = pd.DataFrame(data,
                     columns = ['year', 'state', 'pop', 'debt'],
                     index = ['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [20]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
      'Ohio': {2000:1.5, 2001: 1.7, 2002: 3.6}}

print(pop)
frame3 = pd.DataFrame(pop)
frame3

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}


Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [19]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


## Dataframe - data retrieval

In [36]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [44]:
print(frame3[['Nevada', 'Ohio']])
print('-_-\n')
print(frame3[frame3.index == 2000])
print('-_-\n')
print(frame3[:1])
print('-_-\n')

print(frame3[[True, False, True]])
print('-_-\n')
print(frame3['Ohio'][2000])
print()
print(frame3['Ohio'][0:2])

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
-_-

      Nevada  Ohio
2000     NaN   1.5
-_-

      Nevada  Ohio
2001     2.4   1.7
-_-

      Nevada  Ohio
2001     2.4   1.7
2000     NaN   1.5
-_-

1.5

2001    1.7
2002    3.6
Name: Ohio, dtype: float64


## DataFrame - column

In [47]:
print(frame2['state'])
print(frame2.state)

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object


In [48]:
frame2['debt']= 16.5
print(frame2)

frame2['debt'] = np.arange(5)
print(frame2)

frame2['eastern'] = frame2.state == 'Ohio'
frame2

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
       year   state  pop  debt
one    2000    Ohio  1.5     0
two    2001    Ohio  1.7     1
three  2002    Ohio  3.6     2
four   2001  Nevada  2.4     3
five   2002  Nevada  2.9     4


Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0,True
two,2001,Ohio,1.7,1,True
three,2002,Ohio,3.6,2,True
four,2001,Nevada,2.4,3,False
five,2002,Nevada,2.9,4,False


## DataFrame - row

In [52]:
print(frame2[:2])
print(frame2[frame2.index=='Ohio'])
print(frame2.index.isin(['year', 'state']))
print(frame2['year']>2000)
print(frame2[frame2['year']>2000])

     year state  pop  debt  eastern
one  2000  Ohio  1.5     0     True
two  2001  Ohio  1.7     1     True
Empty DataFrame
Columns: [year, state, pop, debt, eastern]
Index: []
[False False False False False]
one      False
two       True
three     True
four      True
five      True
Name: year, dtype: bool
       year   state  pop  debt  eastern
two    2001    Ohio  1.7     1     True
three  2002    Ohio  3.6     2     True
four   2001  Nevada  2.4     3    False
five   2002  Nevada  2.9     4    False


## Data Frame - filtering

In [53]:
frame2 <5

TypeError: '<' not supported between instances of 'str' and 'int'

## Essential Functionality: arithmetic and data alignment

In [67]:
from pandas import DataFrame, Series
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns = list('bcd'),
               index = ['Ohio', 'Texas', 'Colorado'])

df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns = list('bde'),
               index = ['Utah','Ohio', 'Texas', 'Oregon'])
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [58]:
arr = np.arange(12.).reshape((3,4))
arr_ser = arr[0]

arr - arr_ser

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [63]:
series3 = df2['d']
df2.sub(series3, axis = 0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## Essential Functionality: function application and mapping

In [64]:
np.abs(df2)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [66]:
f = lambda x: x.max() - x.min()

print(df2.apply(f))
df2.apply(f, axis = 1)

b    9.0
d    9.0
e    9.0
dtype: float64


Utah      2.0
Ohio      2.0
Texas     2.0
Oregon    2.0
dtype: float64

In [70]:
def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])
df2.apply(f)

Unnamed: 0,b,d,e
min,0.0,1.0,2.0
max,9.0,10.0,11.0


In [71]:
format = lambda x: '%.2f' % x
df1.applymap(format)

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [72]:
df1.apply(format)

TypeError: cannot convert the series to <class 'float'>

## Essential Functionality: sorting and ranking

In [73]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [74]:
df1.sort_index()

Unnamed: 0,b,c,d
Colorado,6.0,7.0,8.0
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0


In [75]:
df1.sort_index(axis = 1)

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [77]:
df1.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b
Ohio,2.0,1.0,0.0
Texas,5.0,4.0,3.0
Colorado,8.0,7.0,6.0


In [78]:
obj = Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [80]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [81]:
df1.sort_values(by = 'b')

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [82]:
df1.sort_values(by = ['b', 'c'])

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [86]:
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [85]:
obj.rank()

0    3.0
1    NaN
2    4.0
3    NaN
4    1.0
5    2.0
dtype: float64

In [87]:
obj.rank(method = 'min') # 같은 값이 있을 때 min/max로
obj.rank(method = 'max')
obj.rank(method = 'first') # 같은 값이 있을 때 처음나오는거 먼저
obj.rank(ascending = False)


0    2.0
1    NaN
2    1.0
3    NaN
4    4.0
5    3.0
dtype: float64

In [88]:
df1.rank()

Unnamed: 0,b,c,d
Ohio,1.0,1.0,1.0
Texas,2.0,2.0,2.0
Colorado,3.0,3.0,3.0


In [89]:
df1.rank(axis = 1)

Unnamed: 0,b,c,d
Ohio,1.0,2.0,3.0
Texas,1.0,2.0,3.0
Colorado,1.0,2.0,3.0


## Essential Functionality: Summarizing and Computing Descriptive Statistics

In [91]:
df1.sum()

b     9.0
c    12.0
d    15.0
dtype: float64

In [92]:
df1.sum(axis = 1)

Ohio         3.0
Texas       12.0
Colorado    21.0
dtype: float64

In [93]:
df1.mean(axis = 1, skipna = False)

Ohio        1.0
Texas       4.0
Colorado    7.0
dtype: float64

In [94]:
df1.idxmin()

b    Ohio
c    Ohio
d    Ohio
dtype: object

In [95]:
df1.idxmax()

b    Colorado
c    Colorado
d    Colorado
dtype: object

In [96]:
df1.cumsum()

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,5.0,7.0
Colorado,9.0,12.0,15.0


In [97]:
df1.cumsum(axis=1)

Unnamed: 0,b,c,d
Ohio,0.0,1.0,3.0
Texas,3.0,7.0,12.0
Colorado,6.0,13.0,21.0


In [98]:
df1.describe()

Unnamed: 0,b,c,d
count,3.0,3.0,3.0
mean,3.0,4.0,5.0
std,3.0,3.0,3.0
min,0.0,1.0,2.0
25%,1.5,2.5,3.5
50%,3.0,4.0,5.0
75%,4.5,5.5,6.5
max,6.0,7.0,8.0


In [99]:
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [100]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [101]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
obj.value_counts()

['c' 'a' 'd' 'b']


a    4
c    3
b    2
d    1
dtype: int64

## Summarizing and Computing Descriptive Statistcis -Unique Values, Value Counts, and Membership

In [103]:
data = DataFrame({'Qu1':[1,3,4,3,4],
                 'Qu2':[2,3,1,2,3],
                 'Qu3':[1,5,2,4,4]})

data.apply(pd.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [105]:
print(obj)
obj.isin(['b', 'c'])

0    c
1    a
2    d
3    a
4    a
5    a
6    b
7    b
8    c
9    c
dtype: object


0     True
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool

In [106]:
data.describe()

Unnamed: 0,Qu1,Qu2,Qu3
count,5.0,5.0,5.0
mean,3.0,2.2,3.2
std,1.224745,0.83666,1.643168
min,1.0,1.0,1.0
25%,3.0,2.0,2.0
50%,3.0,2.0,4.0
75%,4.0,3.0,4.0
max,4.0,3.0,5.0
