In [2]:
import pandas as pd
import numpy as np
from IPython.display import display as d

## Series and DataFrame creation

In [6]:
print('Series with axis labels:')
d(pd.Series(np.arange(5, dtype=np.int)))

print('Series from dictionary:')
future_series = {0: 'A', 1: 'B', 2: 'C'}
d(pd.Series(future_series))

print('Basic dataframe with indexes and column names:')
dates = pd.date_range("20180101", periods=6)
data = np.random.random((6,3))
column_names = ['Column1', 'Column2', 'Column3']
main_df = pd.DataFrame(data, index=dates, columns=column_names)
d(main_df)

print('DataFrame from dictionary:')
basic_dict = {'Normal': ['A', 'B', 'C'], 'Reverse': ['Z', 'Y', 'X']}
d(pd.DataFrame(basic_dict))

print('DataFrame from list of dictionary:')
basic_dict = [
    {'Normal': 'A', 'Reverse': 'Z'},
    {'Normal': 'B', 'Reverse': 'Y'},
    {'Normal': 'C', 'Reverse': 'X'},
]
d(pd.DataFrame(basic_dict))

Series with axis labels:


0    0
1    1
2    2
3    3
4    4
dtype: int64

Series from dictionary:


0    A
1    B
2    C
dtype: object

Basic dataframe with indexes and column names:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.601124,0.119096,0.725737
2018-01-02,0.365944,0.460533,0.699236
2018-01-03,0.633792,0.133734,0.874896
2018-01-04,0.56042,0.36603,0.245381
2018-01-05,0.90412,0.965134,0.153758
2018-01-06,0.131619,0.35176,0.945545


DataFrame from dictionary:


Unnamed: 0,Normal,Reverse
0,A,Z
1,B,Y
2,C,X


DataFrame from list of dictionary:


Unnamed: 0,Normal,Reverse
0,A,Z
1,B,Y
2,C,X


## Indexing

In [3]:
print('Index first column:')
d(main_df['Column1'])

print('Index the first two rows:')
d(main_df[0:2])

print('Index the rows by name:')
d(main_df['20180101':'20180104'])

print('Index rows and columns by names:')
main_df.loc['20180101':'20180103',['Column1','Column3']]

print('Index rows and columns by numbers:')
d(main_df.iloc[3:5, 0:2])

print('View first two rows:')
d(main_df.head(2))

print('View last two rows:')
d(main_df.tail(2))

Index first column:


2018-01-01    0.149823
2018-01-02    0.778445
2018-01-03    0.136496
2018-01-04    0.351464
2018-01-05    0.811370
2018-01-06    0.576280
Freq: D, Name: Column1, dtype: float64

Index the first two rows:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.149823,0.632892,0.029213
2018-01-02,0.778445,0.025646,0.305317


Index the rows by name:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.149823,0.632892,0.029213
2018-01-02,0.778445,0.025646,0.305317
2018-01-03,0.136496,0.542492,0.360394
2018-01-04,0.351464,0.187041,0.624442


Index rows and columns by names:
Index rows and columns by numbers:


Unnamed: 0,Column1,Column2
2018-01-04,0.351464,0.187041
2018-01-05,0.81137,0.568316


View first two rows:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.149823,0.632892,0.029213
2018-01-02,0.778445,0.025646,0.305317


View last two rows:


Unnamed: 0,Column1,Column2,Column3
2018-01-05,0.81137,0.568316,0.215886
2018-01-06,0.57628,0.215526,0.360219


## Add new Series to existing DataFrame

In [4]:
print('Add a new column')
main_df['Result'] = pd.Series([True, False, False, True, False, False], index=dates)
d(main_df)

Add a new column


Unnamed: 0,Column1,Column2,Column3,Result
2018-01-01,0.149823,0.632892,0.029213,True
2018-01-02,0.778445,0.025646,0.305317,False
2018-01-03,0.136496,0.542492,0.360394,False
2018-01-04,0.351464,0.187041,0.624442,True
2018-01-05,0.81137,0.568316,0.215886,False
2018-01-06,0.57628,0.215526,0.360219,False


## Statistics

In [5]:
main_df.describe()

Unnamed: 0,Column1,Column2,Column3
count,6.0,6.0,6.0
mean,0.467313,0.361985,0.315912
std,0.300238,0.25049,0.195676
min,0.136496,0.025646,0.029213
25%,0.200233,0.194162,0.238244
50%,0.463872,0.379009,0.332768
75%,0.727904,0.56186,0.360351
max,0.81137,0.632892,0.624442


## Sorting

In [6]:
print('Sort using the index:')
d(main_df.sort_index(axis=0, ascending=False))

print('Sort on a column:')
d(main_df.sort_values(by='Column2'))


Sort using the index:


Unnamed: 0,Column1,Column2,Column3,Result
2018-01-06,0.57628,0.215526,0.360219,False
2018-01-05,0.81137,0.568316,0.215886,False
2018-01-04,0.351464,0.187041,0.624442,True
2018-01-03,0.136496,0.542492,0.360394,False
2018-01-02,0.778445,0.025646,0.305317,False
2018-01-01,0.149823,0.632892,0.029213,True


Sort on a column:


Unnamed: 0,Column1,Column2,Column3,Result
2018-01-02,0.778445,0.025646,0.305317,False
2018-01-04,0.351464,0.187041,0.624442,True
2018-01-06,0.57628,0.215526,0.360219,False
2018-01-03,0.136496,0.542492,0.360394,False
2018-01-05,0.81137,0.568316,0.215886,False
2018-01-01,0.149823,0.632892,0.029213,True


## Join

If you want to join on a column other than the index, check out the `merge` method.

In [7]:
dates2 = pd.date_range("20180101", periods=7)
data2 = np.random.random((7,2))
column_names2 = ['Column4', 'Column5']

main_df2 = pd.DataFrame(data2, index=dates2, columns=column_names2)

print('Join two DataFrames:')
d(main_df.join(main_df2))

Join two DataFrames:


Unnamed: 0,Column1,Column2,Column3,Result,Column4,Column5
2018-01-01,0.149823,0.632892,0.029213,True,0.339986,0.202224
2018-01-02,0.778445,0.025646,0.305317,False,0.653846,0.16623
2018-01-03,0.136496,0.542492,0.360394,False,0.867343,0.050329
2018-01-04,0.351464,0.187041,0.624442,True,0.694528,0.793377
2018-01-05,0.81137,0.568316,0.215886,False,0.65816,0.473299
2018-01-06,0.57628,0.215526,0.360219,False,0.140099,0.939157


## Groupby

In [8]:
main_df.groupby('Result').mean()

Unnamed: 0_level_0,Column1,Column2,Column3
Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.575648,0.337995,0.310454
True,0.250643,0.409967,0.326828


## Accessing to DataFrame attributes

In [9]:
print('Index:')
d(main_df.index)
print('Values:')
d(main_df.values)
print('Columns:')
d(main_df.columns)

Index:


DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

Values:


array([[0.14982252661361184, 0.6328924370662871, 0.02921317650215083, True],
       [0.7784450986865776, 0.02564550739722582, 0.30531739218227394, False],
       [0.13649588499092258, 0.542491623236581, 0.36039426758991144, False],
       [0.35146442943896694, 0.1870411903111191, 0.6244421196618678, True],
       [0.8113702213197799, 0.5683163740535471, 0.21588613154342595, False],
       [0.5762802987097136, 0.21552564547944753, 0.36021921459246664, False]], dtype=object)

Columns:


Index(['Column1', 'Column2', 'Column3', 'Result'], dtype='object')