## Pandas -- Series and Data Frames

In [1003]:
import pandas as pd
import numpy  as np

### Series
- A one-dimensional array-like object containing a sequence of values
- Associated array of data labels, called its index

In [1004]:
# Seed the legacy global generator so the six scores below are reproducible.
np.random.seed(123)
# Six integer scores drawn from the half-open range [60, 90).
scores = np.random.randint(low=60, high=90, size=6)
print(type(scores))
# Wrapping the array in a Series attaches a default 0..n-1 index.
a = pd.Series(data=scores)
print(a)

<class 'numpy.ndarray'>
0    73
1    62
2    88
3    62
4    66
5    77
dtype: int64


In [1005]:
a.values

array([73, 62, 88, 62, 66, 77])

In [1006]:
a.index

RangeIndex(start=0, stop=6, step=1)

In [1007]:
a[1]

62

In [1008]:
type(a[1])

numpy.int64

In [1009]:
# Indexing
a[[1, 4]]

1    62
4    66
dtype: int64

In [1010]:
# Slicing
a[::-2]

5    77
3    62
1    62
dtype: int64

# Create a pandas Series with a custom index

In [1011]:
b = pd.Series(scores, index = ['Alice', 'Bob', 'Charlie', 'Dave', 'Ed', 'Fred'])
b

Alice      73
Bob        62
Charlie    88
Dave       62
Ed         66
Fred       77
dtype: int64

In [1012]:
b['Bob']

62

In [1013]:
b[['Bob', 'Ed']]

Bob    62
Ed     66
dtype: int64

In [1014]:
b[::-2]

Fred    77
Dave    62
Bob     62
dtype: int64

In [1015]:
b[b > 70]

Alice      73
Charlie    88
Fred       77
dtype: int64

In [1016]:
b + 10

Alice      83
Bob        72
Charlie    98
Dave       72
Ed         76
Fred       87
dtype: int64

In [1017]:
b

Alice      73
Bob        62
Charlie    88
Dave       62
Ed         66
Fred       77
dtype: int64

# Cumulative sum of the elements along a given axis
`np.cumsum()`

In [1018]:
# uncomment and read more about this function from the help
# help(np.cumsum)

In [1019]:
np.cumsum(b)

Alice       73
Bob        135
Charlie    223
Dave       285
Ed         351
Fred       428
dtype: int64

In [1020]:
np.average(b)

71.33333333333333

In [1021]:
# uncomment and read more about this function from the help

# help(pd.Series.describe)

In [1022]:
b.describe()

count     6.000000
mean     71.333333
std      10.152175
min      62.000000
25%      63.000000
50%      69.500000
75%      76.000000
max      88.000000
dtype: float64

In [1023]:
'Charlie' in b

True

In [1024]:
'Robert' in b

False

In [1025]:
b.index.name = 'First Name'
b

First Name
Alice      73
Bob        62
Charlie    88
Dave       62
Ed         66
Fred       77
dtype: int64

## Create pd.Series from python dictionary data

In [1026]:
# Map each language to a score. The mapping is deliberately NOT named `dict`:
# rebinding that name would hide the built-in type for every later cell.
language_scores = {'R': 60, 'Python': 75, 'Java': 50}
# A Series built from a dict uses the keys as its index labels.
c = pd.Series(language_scores)
c

R         60
Python    75
Java      50
dtype: int64

In [1027]:
# Requesting a label that is absent from the mapping ('C++') yields NaN for it,
# which is why the resulting Series is float64 rather than int64.
language_order = ['Java', 'Python', 'R', 'C++']
d = pd.Series({'R': 60, 'Python': 75, 'Java': 50}, index=language_order)
d

Java      50.0
Python    75.0
R         60.0
C++        NaN
dtype: float64

In [1028]:
pd.isnull(d)

Java      False
Python    False
R         False
C++        True
dtype: bool

In [1029]:
pd.notnull(d)

Java       True
Python     True
R          True
C++       False
dtype: bool

In [1030]:
c + d

C++         NaN
Java      100.0
Python    150.0
R         120.0
dtype: float64

In [1031]:
(c + d).dropna()

Java      100.0
Python    150.0
R         120.0
dtype: float64

# Pandas DataFrame
- It represents a rectangular table of data 
- It contains an ordered collection of columns 
- Each column can hold a different value type (numeric, string, boolean, etc.)
- It has both a row and column index

# Create Pandas Dataframes from Python Dict

In [1032]:
# One (state, year, population) record per observation; the records are then
# transposed into the column-oriented dict that pd.DataFrame accepts.
population_rows = [
    ('Ohio', 2000, 1.5),
    ('Ohio', 2001, 1.7),
    ('Ohio', 2002, 3.6),
    ('Nevada', 2001, 2.4),
    ('Nevada', 2002, 2.9),
    ('Nevada', 2003, 3.2),
]
states, years, populations = (list(column) for column in zip(*population_rows))
data = {'state': states, 'year': years, 'pop': populations}

In [1033]:
type(data)

dict

In [1034]:
# Create a pd dataframe 
df1 = pd.DataFrame(data)
df1

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [1035]:
# Read more about Dataframe on the main documentation here 
# help(pd.DataFrame)

### Reorder Columns

In [1036]:
df1 = pd.DataFrame(data, columns = ['year', 'state', 'pop'])
df1

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [1037]:
# Access the head of a dataframe
df1.head()

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [1038]:
# access the tail of a dataframe
df1.tail(n=3)

Unnamed: 0,year,state,pop
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


# Add another column to Dataframe

In [1039]:
df2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])
df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [1040]:
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [1041]:
df2.index

RangeIndex(start=0, stop=6, step=1)

#### Access Dataframe Columns

In [1042]:
df2['year']

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [1043]:
# Check the type of the returned object: selecting a single column by its
# label gives back a Series, not a DataFrame.
x = df2['year']
type(x)

pandas.core.series.Series

In [1044]:
df2.year # attribute

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [1045]:
df2[['year', 'state']]

Unnamed: 0,year,state
0,2000,Ohio
1,2001,Ohio
2,2002,Ohio
3,2001,Nevada
4,2002,Nevada
5,2003,Nevada


#### Retrieve rows

In [1046]:
df2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])
df2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [1047]:
# uncomment and read more about this function from the help

# help(pd.DataFrame.iloc)

In [1048]:
# print(df2)
# access rows
df2.iloc[2]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

In [1049]:
type(df2.iloc[2])

pandas.core.series.Series

In [1050]:
df2.iloc[[2]]

Unnamed: 0,year,state,pop,debt
2,2002,Ohio,3.6,


In [1051]:
type(df2.iloc[[2]])

pandas.core.frame.DataFrame

In [1052]:
df2.iloc[[2,5]]

Unnamed: 0,year,state,pop,debt
2,2002,Ohio,3.6,
5,2003,Nevada,3.2,


In [1053]:
df2.index = ['one', 'two', 'three', 'four', 'five', 'six']
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [1054]:
df2.loc['two']

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: two, dtype: object

In [1055]:
df2.loc[['two','five']]

Unnamed: 0,year,state,pop,debt
two,2001,Ohio,1.7,
five,2002,Nevada,2.9,


In [1056]:
df2['debt'] = 20.5
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,20.5
two,2001,Ohio,1.7,20.5
three,2002,Ohio,3.6,20.5
four,2001,Nevada,2.4,20.5
five,2002,Nevada,2.9,20.5
six,2003,Nevada,3.2,20.5


In [1057]:
df2['debt'] = np.arange(df2.shape[0])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [1058]:
# adding a column

df2['east'] = (df2.state == 'Ohio')
df2

Unnamed: 0,year,state,pop,debt,east
one,2000,Ohio,1.5,0,True
two,2001,Ohio,1.7,1,True
three,2002,Ohio,3.6,2,True
four,2001,Nevada,2.4,3,False
five,2002,Nevada,2.9,4,False
six,2003,Nevada,3.2,5,False


In [1059]:
# deleting a column
del df2['east']
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [1060]:
# Access Transpose of the Table
df2.T

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,0,1,2,3,4,5


## Create Dataframe from nested dictionaries

In [1061]:
# Outer keys become columns; inner keys (years) become the row index.
# A year missing for a state shows up as NaN in that state's column.
nevada_by_year = {2001: 2.4, 2002: 2.9}
ohio_by_year = {2000: 1.5, 2001: 1.7, 2002: 3.6}
pop = {'Nevada': nevada_by_year, 'Ohio': ohio_by_year}

df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


### Pandas Series Reindexing

In [1062]:
df1 = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
df1

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [1063]:
df2 = df1.reindex(['a', 'b', 'c', 'd', 'e'])
df2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [1064]:
# Read more about it here
# help(pd.Series.reindex)

In [1065]:
df3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
df3

0      blue
2    purple
4    yellow
dtype: object

In [1066]:
df3.reindex(np.arange(6))

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [1067]:
# forward fill missing values
df3.reindex(np.arange(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [1068]:
# backward fill missing values

df3.reindex(np.arange(6), method='bfill')

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [1069]:

      
# 3x3 grid of 0..8 with non-contiguous row labels ('b' is intentionally absent
# so the reindex examples that follow have a gap to fill).
grid = np.arange(9).reshape(3, 3)
df4 = pd.DataFrame(grid,
                   index=list('acd'),
                   columns=['Ohio', 'Texas', 'California'])
df4

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [1070]:
df4.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [1071]:
# for reindexing columns

df4.reindex(columns = ['Utah', 'Ohio', 'Texas'])

Unnamed: 0,Utah,Ohio,Texas
a,,0,1
c,,3,4
d,,6,7


### Dropping Entries from an Axis

In [1072]:
# For Series
df1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
df1

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [1073]:
df1.drop('b')

a    0
c    2
d    3
e    4
dtype: int64

In [1074]:
df1.drop(['a', 'c'])

b    1
d    3
e    4
dtype: int64

In [1075]:
# DataFrame version: a 4x4 table of 0..15 labelled by state (rows)
# and by ordinal words (columns).
state_labels = ['Ohio', 'Colorado', 'Utah', 'New York']
ordinal_columns = ['one', 'two', 'three', 'four']
df2 = pd.DataFrame(np.arange(16).reshape(4, 4),
                   index=state_labels,
                   columns=ordinal_columns)
df2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## Drop Operation

In [1076]:
# Default axis is rows (0)
df2.drop('Ohio')

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [1077]:
df2.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [1078]:
# For dropping columns
df2.drop('two', axis='columns') # or axis = 1

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [1079]:
df2.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [1080]:
# Rebind df2 to the trimmed copy rather than mutating it with inplace=True:
# the displayed result is the same two-column frame, and the call stays chainable.
df2 = df2.drop(columns=['two', 'four'])
df2

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


### Indexing, Selection, and Filtering

In [1081]:
df1 = pd.Series(np.arange(10,14), index=['a', 'b', 'c', 'd'])
df1

a    10
b    11
c    12
d    13
dtype: int64

In [1082]:
df1['c']

12

In [1083]:
# Select by position explicitly. On a string-labelled Series, plain [] with an
# integer relies on a deprecated positional fallback; .iloc states the intent.
df1.iloc[2]

12

In [1084]:
df1[1:3]

b    11
c    12
dtype: int64

In [1085]:
# inclusive end-point

df1['b':'d']

b    11
c    12
d    13
dtype: int64

In [1086]:
# Positional selection of several elements: use .iloc so the integers are not
# interpreted as labels (the index here holds strings, not integers).
df1.iloc[[3, 1]]

d    13
b    11
dtype: int64

In [1087]:
df1[['d', 'b']]

d    13
b    11
dtype: int64

In [1088]:
df1[df1 < 12]

a    10
b    11
dtype: int64

In [1089]:
df1['b':'d'] = 50
df1

a    10
b    50
c    50
d    50
dtype: int64

In [1090]:
# Dataframe Indexing, Selection, and Filtering

df2 = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
df2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [1091]:
df2['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [1092]:
df2[['two']]

Unnamed: 0,two
Ohio,1
Colorado,5
Utah,9
New York,13


In [1093]:
df2[['two', 'one']]

Unnamed: 0,two,one
Ohio,1,0
Colorado,5,4
Utah,9,8
New York,13,12


In [1094]:
# Special cases

df2[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [1095]:
df2['three'] < 10

Ohio         True
Colorado     True
Utah        False
New York    False
Name: three, dtype: bool

In [1096]:
df2[df2['three'] < 10]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [1097]:
df2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [1098]:
df2[df2 < 10] = -1
df2

Unnamed: 0,one,two,three,four
Ohio,-1,-1,-1,-1
Colorado,-1,-1,-1,-1
Utah,-1,-1,10,11
New York,12,13,14,15


## Selecting with loc and iloc
- Select DataFrame rows (and optionally columns) by label or by position
- loc (using axis labels)
- iloc (using integer positions)

In [1099]:
df2 = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
df2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [1100]:
df2.loc['Colorado']

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [1101]:
df2.loc['Colorado', ['two', 'four']]

two     5
four    7
Name: Colorado, dtype: int64

In [1102]:
df2.iloc[1]

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [1103]:
df2.iloc[1, [1, 3]]

two     5
four    7
Name: Colorado, dtype: int64

In [1104]:
df2.iloc[[1, 2]]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [1105]:
df2.iloc[[1, 2], [1, 3]]

Unnamed: 0,two,four
Colorado,5,7
Utah,9,11


In [1106]:
df2.loc[:'Utah']

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


In [1107]:
df2.loc[:'Utah', ['two', 'three']]

Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10


In [1108]:
df2.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [1109]:
# Filter rows and choose columns in one .loc call instead of chaining two
# indexers: keep rows where 'three' > 5 and only the first three columns.
df2.loc[df2['three'] > 5, df2.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


## Apply Function and do mapping

- You can apply any user-defined functions to a dataframe to run your computations. 
- You can map the content of a data frame using a function to new values

In [1110]:
df1 = pd.DataFrame(np.random.randn(4, 3), columns=list('abc'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

Unnamed: 0,a,b,c
Utah,-0.5786,1.651437,-2.426679
Ohio,-0.428913,1.265936,-0.86674
Texas,-0.678886,-0.094709,1.49139
Oregon,-0.638902,-0.443982,-0.434351


In [1111]:
np.abs(df1)

Unnamed: 0,a,b,c
Utah,0.5786,1.651437,2.426679
Ohio,0.428913,1.265936,0.86674
Texas,0.678886,0.094709,1.49139
Oregon,0.638902,0.443982,0.434351


### Apply a function on 1-D arrays to each column or row

In [1112]:
# Read more on the documentation
# help(pd.Series.apply)

In [1113]:
print(df1)

df_test= df1.apply(lambda x: x.clip(0,1))
df_test

               a         b         c
Utah   -0.578600  1.651437 -2.426679
Ohio   -0.428913  1.265936 -0.866740
Texas  -0.678886 -0.094709  1.491390
Oregon -0.638902 -0.443982 -0.434351


Unnamed: 0,a,b,c
Utah,0.0,1.0,0.0
Ohio,0.0,1.0,0.0
Texas,0.0,0.0,1.0
Oregon,0.0,0.0,0.0


In [1114]:
# Default axis is 0 (also spelled 'index' or 'rows'): the lambda receives each
# column as a Series, so the result has one max-min range per column.

df1.apply(lambda x: x.max() - x.min())

a    0.249974
b    2.095418
c    3.918069
dtype: float64

In [1115]:
# Invoke once per row

df1.apply(lambda x: x.max() - x.min(), axis = 'columns')

Utah      4.078116
Ohio      2.132677
Texas     2.170276
Oregon    0.204551
dtype: float64

In [1116]:
df1

Unnamed: 0,a,b,c
Utah,-0.5786,1.651437,-2.426679
Ohio,-0.428913,1.265936,-0.86674
Texas,-0.678886,-0.094709,1.49139
Oregon,-0.638902,-0.443982,-0.434351


In [1117]:
# function returning multiple values

df1.apply(lambda x: pd.Series([x.min(), x.max()], index = ['min', 'max']))

Unnamed: 0,a,b,c
min,-0.678886,-0.443982,-2.426679
max,-0.428913,1.651437,1.49139


In [1118]:
df1.apply(lambda x: pd.Series([x.min(), x.max()], index = ['min', 'max']), 
          axis='columns')

Unnamed: 0,min,max
Utah,-2.426679,1.651437
Ohio,-0.86674,1.265936
Texas,-0.678886,1.49139
Oregon,-0.638902,-0.434351


### Sorting

 - Sort lexicographically by row or column index

In [1119]:
# Series

df1 = pd.Series(np.arange(10,14), index=['d', 'a', 'b', 'c'])
df1

d    10
a    11
b    12
c    13
dtype: int64

In [1120]:
df2 = df1.sort_index()
df2

a    11
b    12
c    13
d    10
dtype: int64

In [1121]:
df2.sort_values()

d    10
a    11
b    12
c    13
dtype: int64

In [1122]:
# DataFrame

df1 = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
df1

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [1123]:
df1.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [1124]:
df1.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [1125]:
df1

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [1126]:
df1.sort_values(by ='b', ascending = False)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [1127]:
df1.sort_values(by ='one', axis = 1, ascending = False)

Unnamed: 0,c,b,a,d
three,3,2,1,0
one,7,6,5,4


### Axis indices with duplicate labels

In [1128]:
# Series

df1 = pd.Series(np.arange(10,15), index=['a', 'a', 'b', 'b', 'c'])
df1

a    10
a    11
b    12
b    13
c    14
dtype: int64

In [1129]:
df1['b']

b    12
b    13
dtype: int64

In [1130]:
df1.index.is_unique

False

In [1131]:
# DataFrame

df2 = pd.DataFrame(np.random.randint(60, 90, (4, 3)), index=['a', 'a', 'b', 'b'])
df2

Unnamed: 0,0,1,2
a,67,62,80
a,75,84,89
b,76,67,69
b,63,88,88


In [1132]:
df2.loc['b']

Unnamed: 0,0,1,2
b,76,67,69
b,63,88,88


### Descriptive Statistics

In [1133]:
# Small frame with missing values, written column by column:
# row 'c' is entirely NaN and column 'two' is also missing in row 'a'.
df1 = pd.DataFrame(
    {
        'one': [1.5, 7.5, np.nan, 1.0],
        'two': [np.nan, -5.5, np.nan, -4.5],
    },
    index=list('abcd'),
)
df1

Unnamed: 0,one,two
a,1.5,
b,7.5,-5.5
c,,
d,1.0,-4.5


In [1134]:
df1.sum() # Add all rows for each column

one    10.0
two   -10.0
dtype: float64

In [1135]:
df1.sum(axis=0) # Add all rows for each column

one    10.0
two   -10.0
dtype: float64

In [1136]:
df1.sum(axis='rows') # Add all rows for each column

one    10.0
two   -10.0
dtype: float64

In [1137]:
df1.sum(axis=1) # Add all columns for each row

a    1.5
b    2.0
c    0.0
d   -3.5
dtype: float64

In [1138]:
df1.sum(axis='columns') # Add all columns for each row

a    1.5
b    2.0
c    0.0
d   -3.5
dtype: float64

In [1139]:
df1

Unnamed: 0,one,two
a,1.5,
b,7.5,-5.5
c,,
d,1.0,-4.5


#### idxmax, idxmin 
- index labels of maximum and minimum values

#### argmax, argmin  (Series)
 - index locations of maximum and minimum values for a Series

In [1140]:
print(df1)

df1.idxmax() # index of max for each column

   one  two
a  1.5  NaN
b  7.5 -5.5
c  NaN  NaN
d  1.0 -4.5


one    b
two    d
dtype: object

In [1141]:
# Row 'c' contains only NaN. Calling idxmax on an all-NaN row is deprecated in
# recent pandas and slated to raise, so compute it on the rows that have data
# and reindex afterwards: all-NaN rows come back as NaN, as before.
df1.dropna(how='all').idxmax(axis='columns').reindex(df1.index)

a    one
b    one
c    NaN
d    one
dtype: object

#### accumulations
 - cumsum, cumprod, cummin, cummax

In [1142]:
print(df1)

df1.cumsum()

   one  two
a  1.5  NaN
b  7.5 -5.5
c  NaN  NaN
d  1.0 -4.5


Unnamed: 0,one,two
a,1.5,
b,9.0,-5.5
c,,
d,10.0,-10.0


In [1143]:
df1.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.333333,-5.0
std,3.617089,0.707107
min,1.0,-5.5
25%,1.25,-5.25
50%,1.5,-5.0
75%,4.5,-4.75
max,7.5,-4.5


In [1144]:
df1

Unnamed: 0,one,two
a,1.5,
b,7.5,-5.5
c,,
d,1.0,-4.5


In [1145]:
# Re-seed so this 4x3 table of scores in [60, 90) is reproducible on its own.
np.random.seed(123)
random_scores = np.random.randint(low=60, high=90, size=(4, 3))
df2 = pd.DataFrame(random_scores,
                   index=list('abcd'),
                   columns=['one', 'two', 'three'])
df2

Unnamed: 0,one,two,three
a,73,62,88
b,62,66,77
c,79,70,87
d,85,82,61


In [1146]:
df2.diff()

Unnamed: 0,one,two,three
a,,,
b,-11.0,4.0,-11.0
c,17.0,4.0,10.0
d,6.0,12.0,-26.0


In [1147]:
df2.diff(axis='columns')

Unnamed: 0,one,two,three
a,,-11,26
b,,4,11
c,,-9,17
d,,-3,-21


In [1148]:
df2

Unnamed: 0,one,two,three
a,73,62,88
b,62,66,77
c,79,70,87
d,85,82,61


In [1149]:
df2.pct_change() # percent change

Unnamed: 0,one,two,three
a,,,
b,-0.150685,0.064516,-0.125
c,0.274194,0.060606,0.12987
d,0.075949,0.171429,-0.298851


In [1150]:
df2['one'].cov(df2['two'])

62.666666666666664

In [1151]:
df2['one'].corr(df2['two'])

0.7392185280134137

In [1152]:
df2.cov()

Unnamed: 0,one,two,three
one,96.25,62.666667,-46.916667
two,62.666667,74.666667,-93.333333
three,-46.916667,-93.333333,156.916667


In [1153]:
df2.corr()

Unnamed: 0,one,two,three
one,1.0,0.739219,-0.381762
two,0.739219,1.0,-0.862261
three,-0.381762,-0.862261,1.0


### Unique values and value counts

In [1154]:
# Draw ten reproducible scores from [60, 70); the narrow range guarantees
# repeated values for the unique / value_counts examples.
np.random.seed(123)
scores = np.random.randint(low=60, high=70, size=10)

a = pd.Series(data=scores)
a

0    62
1    62
2    66
3    61
4    63
5    69
6    66
7    61
8    60
9    61
dtype: int64

In [1155]:
a.unique()

array([62, 66, 61, 63, 69, 60])

In [1156]:
a.value_counts()

61    3
62    2
66    2
63    1
69    1
60    1
Name: count, dtype: int64

In [1157]:
a.values

array([62, 62, 66, 61, 63, 69, 66, 61, 60, 61])

In [1158]:
# The top-level pd.value_counts function is deprecated; call the Series method.
a.value_counts()

61    3
62    2
66    2
63    1
69    1
60    1
Name: count, dtype: int64

In [1159]:
# Counting a raw NumPy array: wrap it in a Series first, because the
# top-level pd.value_counts function is deprecated.
pd.Series(a.values).value_counts()

61    3
62    2
66    2
63    1
69    1
60    1
Name: count, dtype: int64

In [1160]:
# sort=False keeps the values in order of first appearance instead of
# ordering by frequency. Uses the Series method (pd.value_counts is deprecated).
pd.Series(a.values).value_counts(sort=False)

62    2
66    2
61    3
63    1
69    1
60    1
Name: count, dtype: int64

In [1161]:
a.unique()

array([62, 66, 61, 63, 69, 60])

In [1162]:
pd.Index(a.unique()).get_indexer(a)

array([0, 0, 1, 2, 3, 4, 1, 2, 5, 2])

In [1163]:
# Ten rows of reproducible scores in [60, 70), one column per quarter.
np.random.seed(321)
quarter_names = ['Q1', 'Q2', 'Q3', 'Q4']
df2 = pd.DataFrame(np.random.randint(low=60, high=70, size=(10, 4)),
                   columns=quarter_names)
df2

Unnamed: 0,Q1,Q2,Q3,Q4
0,64,69,68,61
1,68,68,64,65
2,68,63,65,61
3,64,66,65,67
4,67,62,62,63
5,69,62,61,62
6,61,61,60,64
7,64,63,60,63
8,67,64,65,67
9,60,68,67,61


In [1164]:
# Count occurrences of each score per column. pd.Series.value_counts is applied
# to every column; the top-level pd.value_counts function is deprecated.
df2.apply(pd.Series.value_counts)

Unnamed: 0,Q1,Q2,Q3,Q4
60,1.0,,2.0,
61,1.0,1.0,1.0,3.0
62,,2.0,1.0,1.0
63,,2.0,,2.0
64,3.0,1.0,1.0,1.0
65,,,3.0,1.0
66,,1.0,,
67,2.0,,1.0,2.0
68,2.0,2.0,1.0,
69,1.0,1.0,,


In [1165]:
# Keep only the scores that occur in every column (rows without any NaN).
# Uses pd.Series.value_counts because pd.value_counts is deprecated.
df2.apply(pd.Series.value_counts).dropna()

Unnamed: 0,Q1,Q2,Q3,Q4
61,1.0,1.0,1.0,3.0
64,3.0,1.0,1.0,1.0


In [1166]:
# A score that never occurs in a column has count NaN; report it as 0 instead.
# Uses pd.Series.value_counts because pd.value_counts is deprecated.
df2.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Q1,Q2,Q3,Q4
60,1.0,0.0,2.0,0.0
61,1.0,1.0,1.0,3.0
62,0.0,2.0,1.0,1.0
63,0.0,2.0,0.0,2.0
64,3.0,1.0,1.0,1.0
65,0.0,0.0,3.0,1.0
66,0.0,1.0,0.0,0.0
67,2.0,0.0,1.0,2.0
68,2.0,2.0,1.0,0.0
69,1.0,1.0,0.0,0.0
