In [1]:
import pandas as pd
import numpy as np

### Working with Series

In [2]:
# Build a Series from a list of values, labelling each value with an explicit
# string index (note that the labels skip 'e').
my_series = pd.Series(
    [1, 2, 3, 4, 6, 4],
    index=['a', 'b', 'c', 'd', 'f', 'g'],
)
my_series

a    1
b    2
c    3
d    4
f    6
g    4
dtype: int64

In [3]:
# Build a Series straight from a dict: the keys become the index labels and
# the values become the data.
temp_dict = dict(a=3, d=4, e=5, f=6, g=7, h=8)
dict_series = pd.Series(temp_dict)
dict_series

a    3
d    4
e    5
f    6
g    7
h    8
dtype: int64

In [4]:
# Look up a single value by its index label.
my_series['f']

6

In [5]:
# Element-wise addition aligned on index labels (not a merge): labels present in
# both Series are summed, labels found in only one of them produce NaN, and the
# presence of NaN is why the result is float64 rather than int64.
my_series + dict_series

a     4.0
b     NaN
c     NaN
d     8.0
e     NaN
f    12.0
g    11.0
h     NaN
dtype: float64

### Working with DataFrames

In [6]:
# Fill a 3x4 array with samples drawn from the standard normal distribution.
# NOTE(review): no random seed is set before this call, so these values (and
# every output derived from them below) change on each fresh run. Consider
# seeding the generator if reproducible output is wanted.
my_array = np.random.randn(3,4)
# Wrap the array in a DataFrame, labelling the rows R1-R3 and the columns C1-C4.
my_array_df = pd.DataFrame(my_array, index=['R1', 'R2', 'R3'], columns=['C1', 'C2', 'C3', 'C4'])
my_array_df

Unnamed: 0,C1,C2,C3,C4
R1,0.678249,-0.793067,-0.233596,0.699667
R2,-0.736123,-0.023971,0.495662,-0.597212
R3,-1.013731,-1.044321,0.613034,0.621653


In [7]:
# Build a DataFrame column by column; each keyword becomes a column name.
new_df = pd.DataFrame(dict(
    A=[1, 2, 3, 4],
    B=pd.Timestamp('20230201'),   # scalar: repeated for every row
    C=np.array([5, 6, 7, 8]),     # list/array columns must all share one length
    D='Test',                     # scalar string: repeated for every row as well
))
new_df

Unnamed: 0,A,B,C,D
0,1,2023-02-01,5,Test
1,2,2023-02-01,6,Test
2,3,2023-02-01,7,Test
3,4,2023-02-01,8,Test


### Selecting rows and columns in DataFrames

In [8]:
# Bracket access with a single column label returns that column as a Series.
my_array_df['C1']

R1    0.678249
R2   -0.736123
R3   -1.013731
Name: C1, dtype: float64

In [9]:
# Attribute-style access returns the same column as my_array_df['C1']. It only
# works when the column name is a valid Python identifier that does not clash
# with an existing DataFrame attribute or method.
my_array_df.C1

R1    0.678249
R2   -0.736123
R3   -1.013731
Name: C1, dtype: float64

In [10]:
# .loc selects by label: the row labelled 'R1' comes back as a Series indexed
# by the column names.
my_array_df.loc['R1']

C1    0.678249
C2   -0.793067
C3   -0.233596
C4    0.699667
Name: R1, dtype: float64

In [11]:
# .iloc selects by integer position: position 0 is the first row, i.e. R1.
my_array_df.iloc[0]

C1    0.678249
C2   -0.793067
C3   -0.233596
C4    0.699667
Name: R1, dtype: float64

### Creating sub-DataFrames

In [12]:
# Passing a list of column labels returns a DataFrame with just those columns.
my_array_df[['C2','C3']]

Unnamed: 0,C2,C3
R1,-0.793067,-0.233596
R2,-0.023971,0.495662
R3,-1.044321,0.613034


In [13]:
# .loc with two lists selects by label on both axes: rows R1-R2, columns C1-C2.
my_array_df.loc[['R1', 'R2'], ['C1','C2']]

Unnamed: 0,C1,C2
R1,0.678249,-0.793067
R2,-0.736123,-0.023971


In [14]:
# Label-based selection on both axes: rows R1-R2, columns C2-C3.
my_array_df.loc[['R1', 'R2'], ['C2','C3']]

Unnamed: 0,C2,C3
R1,-0.793067,-0.233596
R2,-0.023971,0.495662


In [15]:
# Label-based selection on both axes: rows R2-R3, columns C1-C2.
my_array_df.loc[['R2', 'R3'], ['C1','C2']]

Unnamed: 0,C1,C2
R2,-0.736123,-0.023971
R3,-1.013731,-1.044321


In [16]:
# Label-based selection on both axes: rows R2-R3, columns C2-C3.
my_array_df.loc[['R2', 'R3'], ['C2','C3']]

Unnamed: 0,C2,C3
R2,-0.023971,0.495662
R3,-1.044321,0.613034


### Adding new columns to the DataFrame

In [17]:
# Add column C5 holding the element-wise product of C1 and C3. Assigning to a
# new column label modifies my_array_df in place.
my_array_df['C5'] = my_array_df['C1'] * my_array_df['C3']
my_array_df

Unnamed: 0,C1,C2,C3,C4,C5
R1,0.678249,-0.793067,-0.233596,0.699667,-0.158436
R2,-0.736123,-0.023971,0.495662,-0.597212,-0.364868
R3,-1.013731,-1.044321,0.613034,0.621653,-0.621452


### Dropping existing columns from a DataFrame

In [18]:
# Drop column C2. `columns='C2'` is equivalent to `drop('C2', axis=1)`; with
# neither given, drop() defaults to axis=0 and looks for the label 'C2' among
# the row labels instead.
# drop() returns a new DataFrame and leaves my_array_df untouched; assign the
# result to a new name if the reduced frame should be kept.
my_array_df.drop(columns='C2')

Unnamed: 0,C1,C3,C4,C5
R1,0.678249,-0.233596,0.699667,-0.158436
R2,-0.736123,0.495662,-0.597212,-0.364868
R3,-1.013731,0.613034,0.621653,-0.621452


In [19]:
# The result of the drop above was never assigned, so my_array_df still has C2.
my_array_df

Unnamed: 0,C1,C2,C3,C4,C5
R1,0.678249,-0.793067,-0.233596,0.699667,-0.158436
R2,-0.736123,-0.023971,0.495662,-0.597212,-0.364868
R3,-1.013731,-1.044321,0.613034,0.621653,-0.621452


### Conditional Selections

In [20]:
# Comparing the whole frame to a scalar is element-wise and yields a boolean
# DataFrame with the same rows and columns.
my_array_df > 0

Unnamed: 0,C1,C2,C3,C4,C5
R1,True,False,False,True,False
R2,False,False,True,False,False
R3,False,False,True,True,False


In [21]:
# Indexing with a boolean DataFrame keeps the values where the mask is True and
# leaves NaN everywhere else; the shape is unchanged.
my_array_df[my_array_df > 0]

Unnamed: 0,C1,C2,C3,C4,C5
R1,0.678249,,,0.699667,
R2,,,0.495662,,
R3,,,0.613034,0.621653,


In [22]:
# Filter rows with a boolean Series: keeps every row (with all its columns)
# where C3 is greater than 0.
my_array_df[my_array_df['C3'] > 0]

Unnamed: 0,C1,C2,C3,C4,C5
R2,-0.736123,-0.023971,0.495662,-0.597212,-0.364868
R3,-1.013731,-1.044321,0.613034,0.621653,-0.621452


In [23]:
# Combine row conditions with | (element-wise OR): keep rows where C3 > 0 or C5 > 0.
# Each comparison needs its own parentheses because | binds tighter than >.
my_array_df[(my_array_df['C3'] > 0) | (my_array_df['C5'] > 0)]

Unnamed: 0,C1,C2,C3,C4,C5
R2,-0.736123,-0.023971,0.495662,-0.597212,-0.364868
R3,-1.013731,-1.044321,0.613034,0.621653,-0.621452


### Setting a new index for the DataFrame

In [24]:
# One new label for each of the three existing rows of my_array_df.
my_new_index = ['Row1', 'Row2', 'Row3']
# Store the labels as a regular column; this modifies my_array_df in place.
my_array_df['NewIndex'] = my_new_index

In [25]:
# set_index returns a new DataFrame that uses the NewIndex column as its row
# labels; my_array_df itself is not modified.
my_temp_df = my_array_df.set_index('NewIndex')
my_temp_df

Unnamed: 0_level_0,C1,C2,C3,C4,C5
NewIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Row1,0.678249,-0.793067,-0.233596,0.699667,-0.158436
Row2,-0.736123,-0.023971,0.495662,-0.597212,-0.364868
Row3,-1.013731,-1.044321,0.613034,0.621653,-0.621452


In [26]:
# The original frame is unchanged by set_index: it keeps the R1-R3 row labels
# and still carries NewIndex as an ordinary column.
my_array_df

Unnamed: 0,C1,C2,C3,C4,C5,NewIndex
R1,0.678249,-0.793067,-0.233596,0.699667,-0.158436,Row1
R2,-0.736123,-0.023971,0.495662,-0.597212,-0.364868,Row2
R3,-1.013731,-1.044321,0.613034,0.621653,-0.621452,Row3


### DataFrame to NumPy

In [27]:
# Convert the frame to a NumPy array. Because the NewIndex column holds strings
# alongside the float columns, the resulting array has dtype=object.
my_array_df.to_numpy()

array([[0.6782485966901538, -0.7930672212593117, -0.23359592926913325,
        0.6996669716119309, -0.15843611121932205, 'Row1'],
       [-0.7361233261529367, -0.023971464890581377, 0.4956621609736444,
        -0.5972116028543167, -0.36486847858407145, 'Row2'],
       [-1.0137306513741446, -1.044320861045783, 0.6130341515626815,
        0.6216530013263842, -0.6214515097782332, 'Row3']], dtype=object)