In [1]:
import pandas as pd
import numpy as np

### Working with Series

In [2]:
# Build a Series from explicit values paired with matching string labels.
# Note the labels skip 'e' and the value 4 appears twice (under 'd' and 'g').
my_series = pd.Series(
    data=[1, 2, 3, 4, 6, 4],
    index=list('abcdfg'),
)
my_series

a    1
b    2
c    3
d    4
f    6
g    4
dtype: int64

In [3]:
# A dict converts directly into a Series: its keys become the index labels
# and its values become the data, in insertion order.
temp_dict = dict(a=3, d=4, e=5, f=6, g=7, h=8)
dict_series = pd.Series(temp_dict)
dict_series

a    3
d    4
e    5
f    6
g    7
h    8
dtype: int64

In [4]:
# Label-based lookup: returns the value stored under index label 'f'.
my_series['f']

6

In [5]:
# Adding two Series aligns them on their index labels (this is element-wise
# addition, not a merge): values under labels present in both Series are
# summed, while labels found in only one Series produce NaN. Because of the
# NaNs the result is float64 rather than int64.
my_series + dict_series

a     4.0
b     NaN
c     NaN
d     8.0
e     NaN
f    12.0
g    11.0
h     NaN
dtype: float64

### Working with DataFrames

In [6]:
# Draw the sample from a seeded generator so the frame is identical on every
# Restart & Run All instead of changing with each execution.
# NOTE: outputs saved in this notebook came from an earlier, unseeded run, so
# the numbers displayed will differ until the notebook is re-executed.
rng = np.random.default_rng(seed=42)
my_array = rng.standard_normal((3, 4))  # 3 rows x 4 columns of standard-normal draws

# Wrap the raw array in a DataFrame with explicit row and column labels.
my_array_df = pd.DataFrame(
    my_array,
    index=['R1', 'R2', 'R3'],
    columns=['C1', 'C2', 'C3', 'C4'],
)
my_array_df

Unnamed: 0,C1,C2,C3,C4
R1,-0.236246,-0.489684,0.570755,-0.424358
R2,0.336214,-0.753016,-2.565319,-0.203383
R3,-0.534497,-0.873508,-1.616074,-0.14325


In [41]:
# Build a DataFrame from a dict of columns. List/array columns determine the
# number of rows and must all share the same length; scalar entries are
# repeated on every row.
new_df = pd.DataFrame(data={
    # list of four values -> four rows
    'A': [1, 2, 3, 4],
    # single value gets repeated
    'B': pd.Timestamp('20230201'),
    # array must be the same length as the other array-like columns
    'C': np.array([5, 6, 7, 8]),
    # single value gets repeated
    'D': 'Test',
})
new_df

Unnamed: 0,A,B,C,D
0,1,2023-02-01,5,Test
1,2,2023-02-01,6,Test
2,3,2023-02-01,7,Test
3,4,2023-02-01,8,Test


### Selecting rows and columns in DataFrames

In [7]:
# Bracket indexing with a single column label returns that column as a Series
# (indexed by the row labels and named after the column).
my_array_df['C1']

R1   -0.236246
R2    0.336214
R3   -0.534497
Name: C1, dtype: float64

In [39]:
# Attribute-style access returns the same Series as my_array_df['C1'].
# It is only usable when the column name is a valid Python identifier that
# does not clash with an existing DataFrame attribute or method.
my_array_df.C1

R1   -0.236246
R2    0.336214
R3   -0.534497
Name: C1, dtype: float64

In [8]:
# .loc selects by index label; a single row label returns that row as a
# Series indexed by the column names.
my_array_df.loc['R1']

C1   -0.236246
C2   -0.489684
C3    0.570755
C4   -0.424358
Name: R1, dtype: float64

In [9]:
# .iloc selects by integer position; position 0 is the first row (R1), so
# this returns the same Series as my_array_df.loc['R1'].
my_array_df.iloc[0]

C1   -0.236246
C2   -0.489684
C3    0.570755
C4   -0.424358
Name: R1, dtype: float64

### Creating sub-DataFrames

In [10]:
# Indexing with a list of column labels returns a DataFrame containing only
# those columns (all rows kept).
my_array_df[['C2','C3']]

Unnamed: 0,C2,C3
R1,-0.489684,0.570755
R2,-0.753016,-2.565319
R3,-0.873508,-1.616074


In [11]:
# .loc[row_labels, column_labels] selects a rectangular sub-frame by label:
# here rows R1-R2 crossed with columns C1-C2.
my_array_df.loc[['R1', 'R2'], ['C1','C2']]

Unnamed: 0,C1,C2
R1,-0.236246,-0.489684
R2,0.336214,-0.753016


In [12]:
# Same label-based selection with a different column list: rows R1-R2,
# columns C2-C3.
my_array_df.loc[['R1', 'R2'], ['C2','C3']]

Unnamed: 0,C2,C3
R1,-0.489684,0.570755
R2,-0.753016,-2.565319


In [13]:
# Same label-based selection with a different row list: rows R2-R3,
# columns C1-C2.
my_array_df.loc[['R2', 'R3'], ['C1','C2']]

Unnamed: 0,C1,C2
R2,0.336214,-0.753016
R3,-0.534497,-0.873508


In [14]:
# Rows R2-R3 crossed with columns C2-C3.
my_array_df.loc[['R2', 'R3'], ['C2','C3']]

Unnamed: 0,C2,C3
R2,-0.753016,-2.565319
R3,-0.873508,-1.616074


### Adding new columns to a DataFrame

In [15]:
# Assigning to a column label that does not exist yet adds that column to
# my_array_df in place; here C5 is the element-wise product of C1 and C3.
my_array_df['C5'] = my_array_df['C1'] * my_array_df['C3']
my_array_df 

Unnamed: 0,C1,C2,C3,C4,C5
R1,-0.236246,-0.489684,0.570755,-0.424358,-0.134839
R2,0.336214,-0.753016,-2.565319,-0.203383,-0.862495
R3,-0.534497,-0.873508,-1.616074,-0.14325,0.863786


### Dropping existing columns from a DataFrame

In [16]:
# axis=1 makes drop() look for 'C2' among the column labels; the default
# axis=0 would look for it among the row labels instead.
# drop() returns a new DataFrame and leaves my_array_df untouched - assign the
# result to a name (or pass inplace=True) if the drop should persist.
my_array_df.drop('C2', axis=1)

Unnamed: 0,C1,C3,C4,C5
R1,-0.236246,0.570755,-0.424358,-0.134839
R2,0.336214,-2.565319,-0.203383,-0.862495
R3,-0.534497,-1.616074,-0.14325,0.863786


In [17]:
# The original frame still contains C2: the drop() call above returned a new
# DataFrame instead of modifying my_array_df.
my_array_df

Unnamed: 0,C1,C2,C3,C4,C5
R1,-0.236246,-0.489684,0.570755,-0.424358,-0.134839
R2,0.336214,-0.753016,-2.565319,-0.203383,-0.862495
R3,-0.534497,-0.873508,-1.616074,-0.14325,0.863786


### Conditional Selections

In [18]:
# Comparing a DataFrame with a scalar is element-wise and yields a boolean
# DataFrame with the same row and column labels.
my_array_df > 0

Unnamed: 0,C1,C2,C3,C4,C5
R1,False,False,True,False,False
R2,True,False,False,False,False
R3,False,False,False,False,True


In [19]:
# Indexing with a boolean DataFrame keeps the shape: values where the mask is
# True are retained and every other cell becomes NaN.
my_array_df[my_array_df > 0]

Unnamed: 0,C1,C2,C3,C4,C5
R1,,,0.570755,,
R2,0.336214,,,,
R3,,,,,0.863786


In [20]:
# Indexing with a boolean Series filters rows: every row whose C3 value
# satisfies the condition is returned in full (all columns).
my_array_df[my_array_df['C3'] > 0]

Unnamed: 0,C1,C2,C3,C4,C5
R1,-0.236246,-0.489684,0.570755,-0.424358,-0.134839


In [28]:
# Combine row conditions with | (element-wise OR): keeps rows where C3 > 0 or
# C5 > 0. Each comparison needs its own parentheses because | binds more
# tightly than > in Python.
my_array_df[(my_array_df['C3'] > 0) | (my_array_df['C5'] > 0)]

Unnamed: 0,C1,C2,C3,C4,C5
R1,-0.236246,-0.489684,0.570755,-0.424358,-0.134839
R3,-0.534497,-0.873508,-1.616074,-0.14325,0.863786


### Setting a new index on a DataFrame

In [31]:
# Attach the desired labels as an ordinary column first; the list needs one
# entry per row of my_array_df. Note this adds the NewIndex column to
# my_array_df in place.
my_new_index = ['Row1', 'Row2', 'Row3']
my_array_df['NewIndex'] = my_new_index

In [37]:
# set_index() returns a new DataFrame that uses the NewIndex column as its row
# labels (the column is moved into the index); my_array_df is not modified.
my_temp_df = my_array_df.set_index('NewIndex')
my_temp_df

Unnamed: 0_level_0,C1,C2,C3,C4,C5
NewIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Row1,-0.236246,-0.489684,0.570755,-0.424358,-0.134839
Row2,0.336214,-0.753016,-2.565319,-0.203383,-0.862495
Row3,-0.534497,-0.873508,-1.616074,-0.14325,0.863786


In [38]:
# The original frame keeps its R1-R3 index and still has NewIndex as a regular
# column, confirming that set_index() did not change my_array_df.
my_array_df

Unnamed: 0,C1,C2,C3,C4,C5,NewIndex
R1,-0.236246,-0.489684,0.570755,-0.424358,-0.134839,Row1
R2,0.336214,-0.753016,-2.565319,-0.203383,-0.862495,Row2
R3,-0.534497,-0.873508,-1.616074,-0.14325,0.863786,Row3


### Converting a DataFrame to a NumPy array

In [40]:
# to_numpy() returns the values as one NumPy array without the row/column
# labels. At this point the frame mixes float columns with the string column
# NewIndex, so the array falls back to dtype=object; select only the numeric
# columns first if a float array is wanted.
my_array_df.to_numpy()

array([[-0.23624580598559836, -0.4896835115854099, 0.5707554559806883,
        -0.4243577012014257, -0.13483858271883542, 'Row1'],
       [0.3362135238310809, -0.7530155439287666, -2.565318570907581,
        -0.20338326510122043, -0.8624947964741504, 'Row2'],
       [-0.5344965035084948, -0.8735078513742108, -1.6160740077667615,
        -0.14324979331880902, 0.863785906562294, 'Row3']], dtype=object)