In [1]:
import pandas as pd
import numpy as np
from IPython.display import display as d

## Series and DataFrame creation

In [2]:
print('Series with axis labels:')
d(pd.Series(np.arange(5, dtype=np.int)))

print('Series from dictionary:')
future_series = {0: 'A', 1: 'B', 2: 'C'}
d(pd.Series(future_series))

print('Basic dataframe with indexes and column names:')
dates = pd.date_range("20180101", periods=6)
data = np.random.random((6,3))
column_names = ['Column1', 'Column2', 'Column3']
main_df = pd.DataFrame(data, index=dates, columns=column_names)
d(main_df)

print('DataFrame from dictionary:')
basic_dict = {'Normal': ['A', 'B', 'C'], 'Reverse': ['Z', 'Y', 'X']}
d(pd.DataFrame(basic_dict))

print('DataFrame from list of dictionary:')
basic_dict = [
    {'Normal': 'A', 'Reverse': 'Z'},
    {'Normal': 'B', 'Reverse': 'Y'},
    {'Normal': 'C', 'Reverse': 'X'},
]
d(pd.DataFrame(basic_dict))

Series with axis labels:


0    0
1    1
2    2
3    3
4    4
dtype: int64

Series from dictionary:


0    A
1    B
2    C
dtype: object

Basic dataframe with indexes and column names:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.865842,0.207336,0.287298
2018-01-02,0.910107,0.857314,0.224263
2018-01-03,0.497532,0.453515,0.985448
2018-01-04,0.433764,0.600994,0.31925
2018-01-05,0.779757,0.935117,0.238559
2018-01-06,0.867593,0.023837,0.579003


DataFrame from dictionary:


Unnamed: 0,Normal,Reverse
0,A,Z
1,B,Y
2,C,X


DataFrame from list of dictionary:


Unnamed: 0,Normal,Reverse
0,A,Z
1,B,Y
2,C,X


## Indexing

In [3]:
print('Index first column:')
d(main_df['Column1'])

print('Index the first two rows:')
d(main_df[0:2])

print('Index the rows by name:')
d(main_df['20180101':'20180104'])

print('Index rows and columns by names:')
main_df.loc['20180101':'20180103',['Column1','Column3']]

print('Index rows and columns by numbers:')
d(main_df.iloc[3:5, 0:2])

print('View first two rows:')
d(main_df.head(2))

print('View last two rows:')
d(main_df.tail(2))

Index first column:


2018-01-01    0.865842
2018-01-02    0.910107
2018-01-03    0.497532
2018-01-04    0.433764
2018-01-05    0.779757
2018-01-06    0.867593
Freq: D, Name: Column1, dtype: float64

Index the first two rows:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.865842,0.207336,0.287298
2018-01-02,0.910107,0.857314,0.224263


Index the rows by name:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.865842,0.207336,0.287298
2018-01-02,0.910107,0.857314,0.224263
2018-01-03,0.497532,0.453515,0.985448
2018-01-04,0.433764,0.600994,0.31925


Index rows and columns by names:
Index rows and columns by numbers:


Unnamed: 0,Column1,Column2
2018-01-04,0.433764,0.600994
2018-01-05,0.779757,0.935117


View first two rows:


Unnamed: 0,Column1,Column2,Column3
2018-01-01,0.865842,0.207336,0.287298
2018-01-02,0.910107,0.857314,0.224263


View last two rows:


Unnamed: 0,Column1,Column2,Column3
2018-01-05,0.779757,0.935117,0.238559
2018-01-06,0.867593,0.023837,0.579003


## Add new Series to existing DataFrame

In [4]:
print('Add a new column')
main_df['Result'] = pd.Series([True, False, False, True, False, False], index=dates)
d(main_df)

Add a new column


Unnamed: 0,Column1,Column2,Column3,Result
2018-01-01,0.865842,0.207336,0.287298,True
2018-01-02,0.910107,0.857314,0.224263,False
2018-01-03,0.497532,0.453515,0.985448,False
2018-01-04,0.433764,0.600994,0.31925,True
2018-01-05,0.779757,0.935117,0.238559,False
2018-01-06,0.867593,0.023837,0.579003,False


## Statistics

In [5]:
main_df.describe()

Unnamed: 0,Column1,Column2,Column3
count,6.0,6.0,6.0
mean,0.725766,0.513019,0.43897
std,0.206876,0.357974,0.29727
min,0.433764,0.023837,0.224263
25%,0.568088,0.268881,0.250744
50%,0.822799,0.527254,0.303274
75%,0.867155,0.793234,0.514065
max,0.910107,0.935117,0.985448


## Sorting

In [6]:
print('Sort using the index:')
d(main_df.sort_index(axis=0, ascending=False))

print('Sort on a column:')
d(main_df.sort_values(by='Column2'))


Sort using the index:


Unnamed: 0,Column1,Column2,Column3,Result
2018-01-06,0.867593,0.023837,0.579003,False
2018-01-05,0.779757,0.935117,0.238559,False
2018-01-04,0.433764,0.600994,0.31925,True
2018-01-03,0.497532,0.453515,0.985448,False
2018-01-02,0.910107,0.857314,0.224263,False
2018-01-01,0.865842,0.207336,0.287298,True


Sort on a column:


Unnamed: 0,Column1,Column2,Column3,Result
2018-01-06,0.867593,0.023837,0.579003,False
2018-01-01,0.865842,0.207336,0.287298,True
2018-01-03,0.497532,0.453515,0.985448,False
2018-01-04,0.433764,0.600994,0.31925,True
2018-01-02,0.910107,0.857314,0.224263,False
2018-01-05,0.779757,0.935117,0.238559,False


## Join

If you want to join on a column other than the index, check out the `merge` method.

In [7]:
dates2 = pd.date_range("20180101", periods=7)
data2 = np.random.random((7,2))
column_names2 = ['Column4', 'Column5']

main_df2 = pd.DataFrame(data2, index=dates2, columns=column_names2)

print('Join two DataFrames:')
d(main_df.join(main_df2))

Join two DataFrames:


Unnamed: 0,Column1,Column2,Column3,Result,Column4,Column5
2018-01-01,0.865842,0.207336,0.287298,True,0.96955,0.337664
2018-01-02,0.910107,0.857314,0.224263,False,0.002377,0.944277
2018-01-03,0.497532,0.453515,0.985448,False,0.276111,0.580262
2018-01-04,0.433764,0.600994,0.31925,True,0.053493,0.641503
2018-01-05,0.779757,0.935117,0.238559,False,0.83265,0.391235
2018-01-06,0.867593,0.023837,0.579003,False,0.958471,0.400906


## Groupby

In [8]:
main_df.groupby('Result').mean()

Unnamed: 0_level_0,Column1,Column2,Column3
Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.763747,0.567446,0.506818
True,0.649803,0.404165,0.303274


## Accessing to DataFrame attributes

In [9]:
print('Index:')
d(main_df.index)
print('Values:')
d(main_df.values)
print('Columns:')
d(main_df.columns)

Index:


DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

Values:


array([[0.8658417055107381, 0.20733597572399842, 0.2872983523689635, True],
       [0.9101066960058164, 0.8573144690479608, 0.22426301179056174, False],
       [0.4975319079062607, 0.45351483301616224, 0.9854477497360844, False],
       [0.43376417681248913, 0.6009937058560123, 0.31925013590324003, True],
       [0.7797567728099666, 0.9351167125270917, 0.23855911775647587, False],
       [0.8675925937779295, 0.023837261678446686, 0.5790031224544956, False]], dtype=object)

Columns:


Index(['Column1', 'Column2', 'Column3', 'Result'], dtype='object')

## Transformation

In [13]:
main_df

Unnamed: 0,Column1,Column2,Column3,Result
2018-01-01,0.865842,0.207336,0.287298,True
2018-01-02,0.910107,0.857314,0.224263,False
2018-01-03,0.497532,0.453515,0.985448,False
2018-01-04,0.433764,0.600994,0.31925,True
2018-01-05,0.779757,0.935117,0.238559,False
2018-01-06,0.867593,0.023837,0.579003,False


In [14]:
# Increment Column1 by one
main_df['Column1'] = main_df['Column1'].apply(lambda x: x + 1)
main_df

Unnamed: 0,Column1,Column2,Column3,Result
2018-01-01,1.865842,0.207336,0.287298,True
2018-01-02,1.910107,0.857314,0.224263,False
2018-01-03,1.497532,0.453515,0.985448,False
2018-01-04,1.433764,0.600994,0.31925,True
2018-01-05,1.779757,0.935117,0.238559,False
2018-01-06,1.867593,0.023837,0.579003,False


In [15]:
# Examples of date transformations:
# df[‘MonthYear’] = pd.to_datetime(df[‘MonthYear’])
# df[‘MonthYear’] = df[‘MonthYear’].apply(lambda x: x.date())