# Pandas Recap

In [2]:
import numpy as np
import pandas as pd # common way to import pandas

In [3]:
# Daily DatetimeIndex with six consecutive entries: 2020-01-01 through 2020-01-06.
dates = pd.date_range(start="2020-01-01", periods=6, freq="D")
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of standard-normal samples: one row per entry of `dates`, columns A-D.
# NOTE(review): no random seed is set in the visible cells, so the values differ
# on every fresh run and will not match the output stored below.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2020-01-01,-0.026799,-0.4651,-0.138908,-1.20547
2020-01-02,-0.298531,-0.23448,0.482808,-0.229483
2020-01-03,0.117675,0.987121,-0.724561,-1.143854
2020-01-04,1.484587,0.502214,-0.893326,0.292985
2020-01-05,-0.5067,-0.703708,-0.048217,0.172001
2020-01-06,-0.159846,-0.421431,-1.129189,0.886991


## Selecting

Plain brackets (`df[...]`) behave in two different ways: a column label (or a list of column labels) selects columns, whereas a slice selects rows.

In [5]:
# Plain brackets with a single column label return that column as a Series.
df["A"]

2020-01-01   -0.026799
2020-01-02   -0.298531
2020-01-03    0.117675
2020-01-04    1.484587
2020-01-05   -0.506700
2020-01-06   -0.159846
Freq: D, Name: A, dtype: float64

In [6]:
# A list of column labels inside [] returns a DataFrame with just those columns.
df[["A","B"]]

Unnamed: 0,A,B
2020-01-01,-0.026799,-0.4651
2020-01-02,-0.298531,-0.23448
2020-01-03,0.117675,0.987121
2020-01-04,1.484587,0.502214
2020-01-05,-0.5067,-0.703708
2020-01-06,-0.159846,-0.421431


In [10]:
# A slice inside plain brackets selects ROWS (not columns) by index label;
# both end points of the label slice are included in the result.
df["2020-01-01":"2020-01-02"]

Unnamed: 0,A,B,C,D
2020-01-01,-0.026799,-0.4651,-0.138908,-1.20547
2020-01-02,-0.298531,-0.23448,0.482808,-0.229483


In [11]:
# Left commented out, presumably because it fails: a slice inside plain brackets
# is applied to the ROW index, so this does not select columns A..B. "A" and "B"
# are not valid labels for the datetime index, so the line raises an error.
# To slice a range of columns by label, use df.loc[:, "A":"B"] instead.
#df["A":"B"]

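With `.loc[]` we select by index *label* (here: the dates and the column names), using NumPy-style `[rows, columns]` notation. Unlike ordinary Python slicing, a label slice includes its end point.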

In [13]:
# .loc slices by label, so the end label ("2020-01-03") IS part of the result.
df.loc["2020-01-01":"2020-01-03", :]

Unnamed: 0,A,B,C,D
2020-01-01,-0.026799,-0.4651,-0.138908,-1.20547
2020-01-02,-0.298531,-0.23448,0.482808,-0.229483
2020-01-03,0.117675,0.987121,-0.724561,-1.143854


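`.iloc[]` does the same, but by zero-based integer position instead of by label; as with ordinary Python slicing, the end position is excluded.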

In [15]:
# .iloc slices by integer position, so the end position (3) is NOT part of the
# result: the rows at positions 0, 1 and 2 are returned.
df.iloc[0:3,:]

Unnamed: 0,A,B,C,D
2020-01-01,-0.026799,-0.4651,-0.138908,-1.20547
2020-01-02,-0.298531,-0.23448,0.482808,-0.229483
2020-01-03,0.117675,0.987121,-0.724561,-1.143854


In [19]:
# .at reads one scalar, addressed by a row label (a Timestamp) and a column label.
df.at[dates[0],"B"]

-0.46509990082993

In [26]:
# The row labels are pandas Timestamp objects, not plain strings.
dates[0]

Timestamp('2020-01-01 00:00:00', freq='D')

In [20]:
# df.index[0] is the first row label; .at accepts exactly one row label and one
# column label and returns a single scalar value.
df.at[df.index[0], "A"]

-0.026799499382003374

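`.at[]` (square brackets, not parentheses) is a label-based accessor for exactly one value: it takes a single row label and a single column label.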

In [22]:
# Left commented out because it fails: .iat is purely positional and accepts
# integer positions only, so passing the column label "A" raises a ValueError.
#df.iat[0, "A"]

In [23]:
# .iat reads one scalar by zero-based integer position: row 2, column 2
# (the value of column C on 2020-01-03).
df.iat[2,2]

-0.7245605878257391

In [24]:
# Read worksheet 'Sheet1' of the example workbook. No column is promoted to the
# index, and cells holding the text 'NA' are treated as missing values.
df_excel = pd.read_excel(
    'excel_example.xlsx',
    sheet_name='Sheet1',
    index_col=None,
    na_values=['NA'],
)

In [25]:
# .iat is positional, so it does not depend on the frame's labels: this reads
# the value at row position 2, column position 2 of the Excel data.
df_excel.iat[2,2]

'Gent'

In [29]:
# Count how many rows carry each distinct value of the "Country" column.
df_excel["Country"].value_counts()

United States    480
Great Britain    280
France           240
Name: Country, dtype: int64