In [2]:
import pandas as pd
import numpy as np

In [3]:
# Pandas Series: a one-dimensional array of values with an index.
# No index is given here, so pandas assigns the default integer labels 0..4.
series = pd.Series(list(range(1, 6)))
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [4]:
# Build a Series with explicit string labels.
# Note that the label 'c' appears twice: index labels need not be unique.
series = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list('abcdc'),
)
print(series)

a    1
b    2
c    3
d    4
c    5
dtype: int64


In [5]:
# Accessing elements in a Series by position.
# Plain brackets with an integer (series[2]) only worked through a positional
# fallback on this string-labelled index; that fallback is deprecated in
# pandas 2.x, and later versions look the integer up as a label instead.
# .iloc (general positional indexer) and .iat (scalar-only positional
# accessor) state the positional intent explicitly on every version.
print(series.iloc[2])
print(series.iat[2])

3
3


In [6]:
# Label-based access: plain brackets and .loc both look up the label 'd'.
print(series['d'])
print(series.loc['d'])

4
4


In [7]:
# The label 'c' occurs twice in the index, so the lookup returns a Series
# holding both matching entries rather than a single scalar.
print(series['c'])

c    3
c    5
dtype: int64


In [8]:
# Slice from position 2 through the end; on this string-labelled index the
# plain-bracket integer slice and .iloc produce the same result.
print(series[2:])
print(series.iloc[2:])

c    3
d    4
c    5
dtype: int64
c    3
d    4
c    5
dtype: int64


In [9]:
# Build a DatetimeIndex to use as the index of a Series:
# 12 consecutive timestamps starting 2019-05-25 (the default step is one day).
dates1 = pd.date_range(start='20190525', periods=12)
print(dates1)

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03', '2019-06-04', '2019-06-05'],
              dtype='datetime64[ns]', freq='D')


In [10]:
# Label the values 1..12 with the daily dates held in dates1,
# passing the index at construction time.
series = pd.Series(list(range(1, 13)), index=dates1)
print(series)

2019-05-25     1
2019-05-26     2
2019-05-27     3
2019-05-28     4
2019-05-29     5
2019-05-30     6
2019-05-31     7
2019-06-01     8
2019-06-02     9
2019-06-03    10
2019-06-04    11
2019-06-05    12
Freq: D, dtype: int64


In [11]:
# Change the frequency to month-end to get a DatetimeIndex of monthly intervals.
# The start date 2019-05-01 is rolled forward to the first month end (2019-05-31).
#
# An offset object is used instead of the string alias because the alias was
# renamed: 'M' is deprecated since pandas 2.2 in favour of 'ME', while older
# versions only understand 'M'. pd.offsets.MonthEnd() works on every version.
dates2 = pd.date_range('2019-05-01', periods=12, freq=pd.offsets.MonthEnd())
print(dates2)

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')


In [12]:
# The start date parameter doesn't require hyphens: '20190501' is parsed the
# same way as '2019-05-01'.
# pd.offsets.MonthEnd() stands in for the month-end string alias, which is 'M'
# on older pandas and 'ME' from pandas 2.2 on ('M' is deprecated there).
dates_test = pd.date_range('20190501', periods=12, freq=pd.offsets.MonthEnd())
dates_test

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')

In [13]:
# 'MS' is the month-start frequency: every timestamp is the first day of a month.
dates2 = pd.date_range(
    start='20190501',
    periods=12,
    freq='MS',
)
print(dates2)

DatetimeIndex(['2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
              dtype='datetime64[ns]', freq='MS')


In [14]:
# The start can also carry a time of day; here the step is one hour,
# giving 8 timestamps from 09:00 through 16:00 on 2019-05-17.
#
# pd.offsets.Hour() is used instead of the string alias because the upper-case
# 'H' is deprecated since pandas 2.2 in favour of 'h', while older versions
# expect 'H'. The offset object works on every version.
dates3 = pd.date_range('20190517 09:00:00', periods=8, freq=pd.offsets.Hour())
dates3

DatetimeIndex(['2019-05-17 09:00:00', '2019-05-17 10:00:00',
               '2019-05-17 11:00:00', '2019-05-17 12:00:00',
               '2019-05-17 13:00:00', '2019-05-17 14:00:00',
               '2019-05-17 15:00:00', '2019-05-17 16:00:00'],
              dtype='datetime64[ns]', freq='H')

In [15]:
# Pandas DataFrame with dimension 10x4 (10 rows, 4 columns) filled with
# standard-normal draws and columns labelled 'A'..'D'.
# A dedicated, seeded generator makes the values identical on every
# Restart & Run All without touching NumPy's global random state.
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.randn(10, 4),
                  columns=list('ABCD'))
print(df)

          A         B         C         D
0  0.415294 -0.454378 -1.043012 -0.307626
1 -1.126547 -0.279780  0.202844 -0.838109
2 -0.162302 -1.297906  0.132521 -1.000656
3 -0.624140 -1.297406  0.324193  1.377802
4 -1.194187  0.403221 -1.021294  0.050534
5 -0.727682  0.058076 -0.642380 -0.434955
6 -0.293408 -1.436055  0.532461 -0.941544
7 -1.224564 -0.635031 -1.434794  0.734527
8  0.138088 -0.016465  0.359906 -1.724983
9  1.247254 -1.378253  0.690674 -1.294409


In [16]:
# Load the data from data.csv (path is relative to the working directory).
# This rebinds df, replacing the random frame built in the previous cell.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,A,B,C,D
0,0.187497,1.12215,-0.988277,-1.985934
1,0.360803,-0.562243,-0.340693,-0.986988
2,-0.040627,0.067333,-0.452978,0.686223
3,-0.279572,-0.702492,0.252265,0.958977
4,0.537438,-1.737568,0.714727,-0.939288


In [19]:
# Specifying the index in a df: replace the default integer index with
# consecutive daily dates starting 2019-05-25.
# The number of dates is taken from the frame itself, because assigning an
# index whose length differs from the number of rows raises a ValueError.

days = pd.date_range('20190525', periods=len(df))
df.index = days
df

Unnamed: 0,A,B,C,D
2019-05-25,0.187497,1.12215,-0.988277,-1.985934
2019-05-26,0.360803,-0.562243,-0.340693,-0.986988
2019-05-27,-0.040627,0.067333,-0.452978,0.686223
2019-05-28,-0.279572,-0.702492,0.252265,0.958977
2019-05-29,0.537438,-1.737568,0.714727,-0.939288
2019-05-30,0.070011,-0.516443,-1.655689,0.246721
2019-05-31,0.001268,0.951517,2.10736,-0.108726
2019-06-01,-0.185258,0.85652,-0.686285,1.104195
2019-06-02,0.387023,1.706336,-2.452653,0.260466
2019-06-03,-1.054974,0.556775,-0.945219,-0.030295


In [20]:
# Inspect the row labels of df.
print(df.index)

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03'],
              dtype='datetime64[ns]', freq='D')


In [21]:
# The underlying data as a plain 2-D NumPy array, without the row and
# column labels.
print(df.to_numpy())

[[ 1.874970e-01  1.122150e+00 -9.882770e-01 -1.985934e+00]
 [ 3.608030e-01 -5.622430e-01 -3.406930e-01 -9.869880e-01]
 [-4.062700e-02  6.733300e-02 -4.529780e-01  6.862230e-01]
 [-2.795720e-01 -7.024920e-01  2.522650e-01  9.589770e-01]
 [ 5.374380e-01 -1.737568e+00  7.147270e-01 -9.392880e-01]
 [ 7.001100e-02 -5.164430e-01 -1.655689e+00  2.467210e-01]
 [ 1.268000e-03  9.515170e-01  2.107360e+00 -1.087260e-01]
 [-1.852580e-01  8.565200e-01 -6.862850e-01  1.104195e+00]
 [ 3.870230e-01  1.706336e+00 -2.452653e+00  2.604660e-01]
 [-1.054974e+00  5.567750e-01 -9.452190e-01 -3.029500e-02]]


In [25]:
# Descriptive statistics
print(df.describe())     # count, mean, std, min, quartiles and max per column
print(df.mean(axis=0))   # axis=0: one mean per column
print(df.mean(axis=1))   # axis=1: one mean per row


               A          B          C          D
count  10.000000  10.000000  10.000000  10.000000
mean   -0.001639   0.174188  -0.444744  -0.079465
std     0.451656   1.049677   1.267397   0.971164
min    -1.054974  -1.737568  -2.452653  -1.985934
25%    -0.149100  -0.550793  -0.977513  -0.731648
50%     0.035640   0.312054  -0.569632   0.108213
75%     0.317476   0.927768   0.104025   0.579784
max     0.537438   1.706336   2.107360   1.104195
A   -0.001639
B    0.174188
C   -0.444744
D   -0.079465
dtype: float64
2019-05-25   -0.416141
2019-05-26   -0.382280
2019-05-27    0.064988
2019-05-28    0.057294
2019-05-29   -0.356173
2019-05-30   -0.463850
2019-05-31    0.737855
2019-06-01    0.272293
2019-06-02   -0.024707
2019-06-03   -0.368428
Freq: D, dtype: float64


### Extracting from DataFrames
`.head()` and `.tail()` return the first and last rows of a DataFrame (five rows by default).

In [26]:
# First five rows (n=5 is also the default).
print(df.head(n=5))

                   A         B         C         D
2019-05-25  0.187497  1.122150 -0.988277 -1.985934
2019-05-26  0.360803 -0.562243 -0.340693 -0.986988
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
2019-05-29  0.537438 -1.737568  0.714727 -0.939288


In [27]:
# Last five rows (n=5 is also the default).
print(df.tail(n=5))

                   A         B         C         D
2019-05-30  0.070011 -0.516443 -1.655689  0.246721
2019-05-31  0.001268  0.951517  2.107360 -0.108726
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-06-02  0.387023  1.706336 -2.452653  0.260466
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295


In [34]:
# Selecting a single column returns a Series.
# Bracket lookup and attribute access give the same result here.
print(df['A'])
print(df.A)

2019-05-25    0.187497
2019-05-26    0.360803
2019-05-27   -0.040627
2019-05-28   -0.279572
2019-05-29    0.537438
2019-05-30    0.070011
2019-05-31    0.001268
2019-06-01   -0.185258
2019-06-02    0.387023
2019-06-03   -1.054974
Freq: D, Name: A, dtype: float64
2019-05-25    0.187497
2019-05-26    0.360803
2019-05-27   -0.040627
2019-05-28   -0.279572
2019-05-29    0.537438
2019-05-30    0.070011
2019-05-31    0.001268
2019-06-01   -0.185258
2019-06-02    0.387023
2019-06-03   -1.054974
Freq: D, Name: A, dtype: float64


In [35]:
# Double brackets because we pass in a LIST of column labels.
# Instead of a Series, you get a DataFrame containing just those columns.

print(df[['A','B']])

                   A         B
2019-05-25  0.187497  1.122150
2019-05-26  0.360803 -0.562243
2019-05-27 -0.040627  0.067333
2019-05-28 -0.279572 -0.702492
2019-05-29  0.537438 -1.737568
2019-05-30  0.070011 -0.516443
2019-05-31  0.001268  0.951517
2019-06-01 -0.185258  0.856520
2019-06-02  0.387023  1.706336
2019-06-03 -1.054974  0.556775


In [41]:
# Slicing based on row position: selects positions 2 up to, but not
# including, 4 (i.e. the rows at positions 2 and 3).
print(df[2:4])
print(df.iloc[2:4])
# A list of positions returns only those rows (positions 2 and 4):
print(df.iloc[[2,4]])
# A single position returns that row (position 2) as a Series:
print(df.iloc[2])

                   A         B         C         D
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
                   A         B         C         D
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
                   A         B         C         D
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-29  0.537438 -1.737568  0.714727 -0.939288
A   -0.040627
B    0.067333
C   -0.452978
D    0.686223
Name: 2019-05-27 00:00:00, dtype: float64
