# Pandas Basic Functionality

In [136]:
import numpy as np
np.random.seed(0)
import pandas as pd

In [137]:
def series_info(series: "pd.Series | pd.Index") -> None:
    """Print a short structural summary of a one-dimensional pandas object.

    The annotation covers ``pd.Index`` as well as ``pd.Series`` because this
    notebook also calls the helper with a ``DatetimeIndex``. It is written as
    a string so it does not require ``X | Y`` union support at definition time.

    Args:
        series: Object exposing ``ndim``, ``shape``, ``size`` and ``dtype``,
            e.g. a ``pd.Series`` or a ``pd.Index``.

    Returns:
        None. The summary is written to standard output.
    """
    print(f"ndim: {series.ndim}")
    print(f"shape: {series.shape}")
    print(f"size: {series.size}")
    print(f"dtype: {series.dtype}")
    # The trailing "\n" leaves a blank line after the printed values.
    print(f"values:\n{series}\n")

In [138]:
def df_info(df: pd.DataFrame) -> None:
    """Print a short structural summary of a DataFrame.

    Writes the number of dimensions, shape, element count, per-column dtypes
    and the frame itself to standard output.

    Args:
        df: The DataFrame to describe.

    Returns:
        None.
    """
    report = (
        f"ndim: {df.ndim}",
        f"shape: {df.shape}",
        f"size: {df.size}",
        f"dtype: {df.dtypes}",
        # The trailing "\n" leaves a blank line after the printed frame.
        f"values:\n{df}\n",
    )
    print("\n".join(report))

In [139]:
s = pd.Series(
    [0, 0, 1, 2, 3],
    index=["a", "b", "c", "d", "e"]
)

series_info(s)

ndim: 1
shape: (5,)
size: 5
dtype: int64
values:
a    0
b    0
c    1
d    2
e    3
dtype: int64



In [140]:
print(s.values)

[0 0 1 2 3]


In [141]:
num_samples = 10
num_features = 3

In [142]:
# Use an ISO 8601 date (YYYY-MM-DD): it is unambiguous. A day-first string
# such as "21/4/2021" only resolves to 21 April because 21 cannot be a month;
# a value like "3/4/2021" would be read month-first instead.
index = pd.date_range(
    "2021-04-21",
    periods=num_samples
)

series_info(index)

ndim: 1
shape: (10,)
size: 10
dtype: datetime64[ns]
values:
DatetimeIndex(['2021-04-21', '2021-04-22', '2021-04-23', '2021-04-24',
               '2021-04-25', '2021-04-26', '2021-04-27', '2021-04-28',
               '2021-04-29', '2021-04-30'],
              dtype='datetime64[ns]', freq='D')



In [143]:
df = pd.DataFrame(
    np.random.randn(num_samples, num_features),
    index=index,
    columns=["Weight", "Temp", "Speed"]
)

df_info(df)

ndim: 2
shape: (10, 3)
size: 30
dtype: Weight    float64
Temp      float64
Speed     float64
dtype: object
values:
              Weight      Temp     Speed
2021-04-21  1.764052  0.400157  0.978738
2021-04-22  2.240893  1.867558 -0.977278
2021-04-23  0.950088 -0.151357 -0.103219
2021-04-24  0.410599  0.144044  1.454274
2021-04-25  0.761038  0.121675  0.443863
2021-04-26  0.333674  1.494079 -0.205158
2021-04-27  0.313068 -0.854096 -2.552990
2021-04-28  0.653619  0.864436 -0.742165
2021-04-29  2.269755 -1.454366  0.045759
2021-04-30 -0.187184  1.532779  1.469359



In [144]:
series_info(df['Weight'])

ndim: 1
shape: (10,)
size: 10
dtype: float64
values:
2021-04-21    1.764052
2021-04-22    2.240893
2021-04-23    0.950088
2021-04-24    0.410599
2021-04-25    0.761038
2021-04-26    0.333674
2021-04-27    0.313068
2021-04-28    0.653619
2021-04-29    2.269755
2021-04-30   -0.187184
Freq: D, Name: Weight, dtype: float64



In [145]:
print(type(df['Weight']))
print(np.sum(df['Weight']))  # np.sum can be called directly on a Series; no manual conversion to a NumPy array is needed

print(type(df['Weight'].values))  # explicit conversion to a NumPy array via .values
print(np.sum(df['Weight'].values))

print(type((df['Weight'].array)))  # explicit conversion to the pandas-wrapped NumPy array via .array
print(np.sum(df['Weight'].array))

<class 'pandas.core.series.Series'>
9.50960158820738
<class 'numpy.ndarray'>
9.50960158820738
<class 'pandas.core.arrays.numpy_.PandasArray'>
9.50960158820738


### Data Information

In [146]:
df.head(3)

Unnamed: 0,Weight,Temp,Speed
2021-04-21,1.764052,0.400157,0.978738
2021-04-22,2.240893,1.867558,-0.977278
2021-04-23,0.950088,-0.151357,-0.103219


In [147]:
df.tail(3)

Unnamed: 0,Weight,Temp,Speed
2021-04-28,0.653619,0.864436,-0.742165
2021-04-29,2.269755,-1.454366,0.045759
2021-04-30,-0.187184,1.532779,1.469359


### Attributes

In [148]:
print(df.columns)
# Lower-case every column label in one vectorised step via the Index .str accessor.
df.columns = df.columns.str.lower()
print(df.columns)

Index(['Weight', 'Temp', 'Speed'], dtype='object')
Index(['weight', 'temp', 'speed'], dtype='object')


In [149]:
print(type(s.array))  #returns a pandas wrapped numpy array
print(s.array)

<class 'pandas.core.arrays.numpy_.PandasArray'>
<PandasArray>
[0, 0, 1, 2, 3]
Length: 5, dtype: int64


In [150]:
print(type(s.to_numpy()))  #same as .values but with options e.g. copy=True to force a copy
print(s.to_numpy())

<class 'numpy.ndarray'>
[0 0 1 2 3]


### Iterating over Series

In [151]:
print(df['weight'])

2021-04-21    1.764052
2021-04-22    2.240893
2021-04-23    0.950088
2021-04-24    0.410599
2021-04-25    0.761038
2021-04-26    0.333674
2021-04-27    0.313068
2021-04-28    0.653619
2021-04-29    2.269755
2021-04-30   -0.187184
Freq: D, Name: weight, dtype: float64


In [152]:
for weight in df['weight']:
    print(weight)

1.764052345967664
2.240893199201458
0.9500884175255894
0.41059850193837233
0.7610377251469934
0.33367432737426683
0.31306770165090136
0.6536185954403606
2.2697546239876076
-0.1871838500258336


In [153]:
s2 = df['weight']

In [154]:
# Series.items() lazily yields (index, value) tuples. It replaces
# Series.iteritems(), which was deprecated in pandas 1.5 and removed in 2.0.
for timestamp, value in s2.items():
    print(timestamp, value)

2021-04-21 00:00:00 1.764052345967664
2021-04-22 00:00:00 2.240893199201458
2021-04-23 00:00:00 0.9500884175255894
2021-04-24 00:00:00 0.41059850193837233
2021-04-25 00:00:00 0.7610377251469934
2021-04-26 00:00:00 0.33367432737426683
2021-04-27 00:00:00 0.31306770165090136
2021-04-28 00:00:00 0.6536185954403606
2021-04-29 00:00:00 2.2697546239876076
2021-04-30 00:00:00 -0.1871838500258336


### Iterating over Data Frames

In [155]:
print(df)

              weight      temp     speed
2021-04-21  1.764052  0.400157  0.978738
2021-04-22  2.240893  1.867558 -0.977278
2021-04-23  0.950088 -0.151357 -0.103219
2021-04-24  0.410599  0.144044  1.454274
2021-04-25  0.761038  0.121675  0.443863
2021-04-26  0.333674  1.494079 -0.205158
2021-04-27  0.313068 -0.854096 -2.552990
2021-04-28  0.653619  0.864436 -0.742165
2021-04-29  2.269755 -1.454366  0.045759
2021-04-30 -0.187184  1.532779  1.469359


In [156]:
for v in df:  # iterating a DataFrame yields its column labels, not its rows (unlike NumPy, which iterates row by row)
    print(v)

weight
temp
speed


In [157]:
for key, val in df.iterrows():  #returns a (index, series) tuple per row
    print(type(key))
    print(type(val))
    print(key, val, "\n")

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas.core.series.Series'>
2021-04-21 00:00:00 weight    1.764052
temp      0.400157
speed     0.978738
Name: 2021-04-21 00:00:00, dtype: float64 

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas.core.series.Series'>
2021-04-22 00:00:00 weight    2.240893
temp      1.867558
speed    -0.977278
Name: 2021-04-22 00:00:00, dtype: float64 

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas.core.series.Series'>
2021-04-23 00:00:00 weight    0.950088
temp     -0.151357
speed    -0.103219
Name: 2021-04-23 00:00:00, dtype: float64 

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas.core.series.Series'>
2021-04-24 00:00:00 weight    0.410599
temp      0.144044
speed     1.454274
Name: 2021-04-24 00:00:00, dtype: float64 

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas.core.series.Series'>
2021-04-25 00:00:00 weight    0.761038
temp      0.121675
speed     0.443863
Name

#### Indexing

``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found.

``.iloc`` is primarily integer-position based (from 0 to length-1 of the axis), but may also be used with a boolean array. ``.iloc`` will raise ``IndexError`` if a requested indexer is out of bounds.

| Object Type | Indexers |
|-|-|
| Series | s.loc\[indexer\] |
| DataFrame | df.loc\[row_indexer,column_indexer\] |

##### Standard Indexing

| Object Type | Selection | Return Value Type |
|-|-|-|
| Series | series\[label\] | scalar value |
| DataFrame | frame\[colname\] | Series corresponding to colname |


In [158]:
s = df['weight']  # create a series from dataframe

print(s)

2021-04-21    1.764052
2021-04-22    2.240893
2021-04-23    0.950088
2021-04-24    0.410599
2021-04-25    0.761038
2021-04-26    0.333674
2021-04-27    0.313068
2021-04-28    0.653619
2021-04-29    2.269755
2021-04-30   -0.187184
Freq: D, Name: weight, dtype: float64


In [159]:
# A Series is one-dimensional, so a label can be looked up directly with []
# or explicitly through the label-based .loc indexer.
print(s['2021-04-21'])
print(s.loc['2021-04-21'])

# Positional access goes through .iloc (or .iat for a single scalar).
# Plain s[0] is avoided: on a non-integer index it relies on a positional
# fallback that is deprecated since pandas 2.1.
print(s.iloc[0])
print(s.iat[0])

1.764052345967664
1.764052345967664
1.764052345967664
1.764052345967664


In [160]:
print(df.head(3))

# A DataFrame is 2-D: plain [] selects whole columns (not rows).
# Assigning the two columns in reversed order swaps their values in df itself,
# so re-running this cell swaps them back again.
df[["weight", "temp"]] = df[["temp", "weight"]]

print(df.head(3))

              weight      temp     speed
2021-04-21  1.764052  0.400157  0.978738
2021-04-22  2.240893  1.867558 -0.977278
2021-04-23  0.950088 -0.151357 -0.103219
              weight      temp     speed
2021-04-21  0.400157  1.764052  0.978738
2021-04-22  1.867558  2.240893 -0.977278
2021-04-23 -0.151357  0.950088 -0.103219


In [161]:
print(df.loc['2021-04-22':'2021-04-25'])  #only row indexer given, returns all columns for each row

              weight      temp     speed
2021-04-22  1.867558  2.240893 -0.977278
2021-04-23 -0.151357  0.950088 -0.103219
2021-04-24  0.144044  0.410599  1.454274
2021-04-25  0.121675  0.761038  0.443863


In [162]:
print(df.loc['2021-04-22':'2021-04-25', 'weight']) #row and column indexer given

2021-04-22    1.867558
2021-04-23   -0.151357
2021-04-24    0.144044
2021-04-25    0.121675
Freq: D, Name: weight, dtype: float64


In [163]:
print(df.loc['2021-04-22':'2021-04-25']['weight'])  # same result as above but inefficient: it first builds a temporary frame with all columns, then selects the column from it

2021-04-22    1.867558
2021-04-23   -0.151357
2021-04-24    0.144044
2021-04-25    0.121675
Freq: D, Name: weight, dtype: float64


In [164]:
print(df.iloc[1:5])  # with .iloc the upper bound of a slice is exclusive, whereas with .loc it is inclusive

              weight      temp     speed
2021-04-22  1.867558  2.240893 -0.977278
2021-04-23 -0.151357  0.950088 -0.103219
2021-04-24  0.144044  0.410599  1.454274
2021-04-25  0.121675  0.761038  0.443863


In [165]:
print(df.iloc[1:5, 0])

2021-04-22    1.867558
2021-04-23   -0.151357
2021-04-24    0.144044
2021-04-25    0.121675
Freq: D, Name: weight, dtype: float64
