Pandas Operations: A high-level overview of options
---

In [1]:
import pandas as pd

In [3]:
# Small demo frame: two integer columns and one string column, four rows.
df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4],
        col2=[444, 555, 666, 444],
        col3=['abc', 'def', 'ghi', 'xyz'],
    )
)
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [7]:
# Distinct values of col2, returned as an array in order of first appearance.
# The parentheses matter: `df['col2'].unique` without them only references the
# bound method -- nothing is computed, so that form is not used here.

df['col2'].unique()

array([444, 555, 666])

In [6]:
# Number of distinct values in col2 (missing values are not counted).
df['col2'].nunique(dropna=True)

3

In [10]:
# Frequency of each col2 value, most frequent first.
df['col2'].value_counts(sort=True, ascending=False)

444    2
555    1
666    1
Name: col2, dtype: int64

Select Data using conditionals across multiple columns

In [15]:
# Keep the rows where col1 > 2 OR col2 == 444.
# `|` is the element-wise OR for boolean masks; each comparison needs its own
# parentheses because `|` binds tighter than `>` and `==`.

newdf_or = df.loc[(df['col1'] > 2) | (df['col2'] == 444)]
newdf_or

Unnamed: 0,col1,col2,col3
0,1,444,abc
2,3,666,ghi
3,4,444,xyz


In [16]:
# Keep the rows where col1 > 2 AND col2 == 444 (`&` is the element-wise AND).
newdf_and = df.loc[(df['col1'] > 2) & (df['col2'] == 444)]
newdf_and

Unnamed: 0,col1,col2,col3
3,4,444,xyz


In [22]:
# A simple element-wise helper used with Series.apply below

def times_two(number):
    """Return ``number`` multiplied by two."""
    doubled = number * 2
    return doubled

In [29]:
# A second element-wise helper, used with Series.map below

def map_two(number):
    """Return ``number`` raised to the power of two.

    Despite the name, this squares its argument; it does not double it.
    """
    return pow(number, 2)

In [41]:
# Apply times_two to every element of the 'col1' column (a single Series),
# not to every column of the DataFrame.

df_apply = df['col1'].apply(times_two)

In [42]:
# Series.map calls map_two on each element of 'col1', producing the squared values.
df_map = df['col1'].map(map_two)

In [43]:
# Attach the apply() result as a new 'apply' column (this mutates df in place).
df['apply'] =  df_apply

In [44]:
# Attach the map() result as a new 'map' column (this mutates df in place).
df['map'] = df_map

In [46]:
# df now carries two extra columns: 'apply' (built with Series.apply) and
# 'map' (built with Series.map).
df # Here we added two new columns to the data frame the first with apply and the second with map

Unnamed: 0,col1,col2,col3,apply,map
0,1,444,abc,2,1
1,2,555,def,4,4
2,3,666,ghi,6,9
3,4,444,xyz,8,16


In [49]:
# Remove the demo 'apply' column from df in place.
del df['apply']

In [50]:
# Display df to confirm the 'apply' column is gone.
df

Unnamed: 0,col1,col2,col3,map
0,1,444,abc,1
1,2,555,def,4
2,3,666,ghi,9
3,4,444,xyz,16


In [51]:
# Remove the demo 'map' column from df in place.
del df['map']

In [52]:
# Display df to confirm the 'map' column is gone.
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [58]:
df.columns  # columns is an attribute, not a method, so no parentheses

Index(['col1', 'col2', 'col3'], dtype='object')

In [60]:
df.index  # index is likewise an attribute, not a method: no parentheses

RangeIndex(start=0, stop=4, step=1)

In [57]:
# Prints each column's dtype and non-null count plus the frame's memory usage:
# a structural summary, whereas describe() reports summary statistics.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    4 non-null      int64 
 1   col2    4 non-null      int64 
 2   col3    4 non-null      object
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


In [61]:
# Rows ordered by col2, smallest first; returns a new frame, df is unchanged.
df.sort_values('col2', ascending=True)

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


In [67]:
# Order the columns by label in descending order (col3, col2, col1).
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,col3,col2,col1
0,abc,444,1
1,def,555,2
2,ghi,666,3
3,xyz,444,4
