# DataFrames in Pandas

Adding columns, deleting columns, and slicing data in Pandas

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Raw values for each column, held as plain Python lists.
col1 = [100, 200]
col2 = ['M', 'F']
col3 = ['John', 'Mary']

# Map temporary column keys to their value lists.
data = {
    'c1': col1,
    'c2': col2,
    'c3': col3,
}

# Build the DataFrame, then relabel the columns c1..c3 -> col1..col3.
df = pd.DataFrame(data).rename(
    columns={'c1': 'col1', 'c2': 'col2', 'c3': 'col3'}
)

df

Unnamed: 0,col1,col2,col3
0,100,M,John
1,200,F,Mary


Apply a function to each value in a column and store the results in a new column.

In [5]:
def myfunction(x):
    """Expand a one-letter gender code into its full label.

    Parameters
    ----------
    x : object
        A single cell value, normally 'M' or 'F'.

    Returns
    -------
    object
        'Male' for 'M', 'Female' for 'F'. Any other value (missing data,
        typos, unexpected codes) is returned unchanged so that it is not
        silently mislabelled as 'Female'.
    """
    if x == 'M':
        return 'Male'
    if x == 'F':
        return 'Female'
    # Unrecognised code: pass it through rather than guessing a label.
    return x
    
# Run myfunction on every value of 'col2' and keep the results as a new column.
df['col2_full'] = df['col2'].apply(myfunction)

df

Unnamed: 0,col1,col2,col3,col2_full
0,100,M,John,Male
1,200,F,Mary,Female


Delete a column from a DataFrame.

In [6]:
# `del` removes the column from the DataFrame in place.
del df['col2_full']

df

Unnamed: 0,col1,col2,col3
0,100,M,John
1,200,F,Mary


Add a new column to a DataFrame. Assigning a single value fills every row with that value.

In [7]:
# Assigning a scalar to a new column label creates the column with 6 in every row.
df["NewCol"] = 6

df

Unnamed: 0,col1,col2,col3,NewCol
0,100,M,John,6
1,200,F,Mary,6


In [8]:
# Remove the demonstration column again (in place).
del df["NewCol"]

df

Unnamed: 0,col1,col2,col3
0,100,M,John
1,200,F,Mary


Display the index of a DataFrame. Our DataFrame only has two rows, and the two indices are 0 and 1.

In [9]:
# The row labels of the DataFrame.
df.index

RangeIndex(start=0, stop=2, step=1)

The `columns` and `index` labels can be changed by assigning new lists, as shown:

In [11]:
# Replace the row labels 0, 1 with 'a', 'b'.
df.index = ["a", "b"]
# Replace the column labels with descriptive names.
df.columns = ["Credit", "Gender", "Name"]

df

Unnamed: 0,Credit,Gender,Name
a,100,M,John
b,200,F,Mary


In [None]:
# iloc selects by integer position rather than by label.
# A single integer picks one row: here the first row (position 0).
df.iloc[0]

In [None]:
# One cell: row position 0, column position 1.
df.iloc[0, 1]

In [None]:
# Row slice: positions 0 and 1 (the stop position 2 is excluded).
df.iloc[0:2]

In [None]:
# Same row slice with an explicit "all columns" slice.
df.iloc[0:2, :]

In [None]:
# Rows at positions 0-1 and columns at positions 1-2.
df.iloc[0:2, 1:3]

In [None]:
# Slice bounds may run past the end of the data: iloc returns whatever exists.
df.iloc[0:5, 1:5]

In [None]:
# loc selects by label (iloc selects by integer position).
# A label slice includes both endpoints, so this returns rows 'a' and 'b'.
df.loc["a":"b"]

In [None]:
# Label slice whose stop label 'd' is not among the row labels 'a', 'b'.
df.loc["a":"d"]

In [None]:
# Select several columns by name; the result is a DataFrame.
df.loc[:, ["Credit", "Gender"]]

In [None]:
# First rows of the DataFrame (up to 5, the default).
df.head(n=5)

In [None]:
# Last rows of the DataFrame (up to 5, the default).
df.tail(n=5)

Select rows

In [None]:
# Small example data set; Tom's mark is deliberately missing (NaN).
students = dict(
    name=['Jennifer', 'Lawrence', 'Harry', 'Tom'],
    age=[10, 10, 11, 12],
    marks=[70, 80, 80, np.nan],
    level=['1', '1', '2', '2'],
)

# From here on, `df` refers to the students table.
df = pd.DataFrame(data=students)

df

In [None]:
# Boolean Series: True for each row whose 'age' is greater than 10.
df['age'].gt(10)

In [None]:
# Keep rows whose age is below 11 or above 11; | combines the two masks with OR.
df[df['age'].lt(11) | df['age'].gt(11)]

In [None]:
# loc takes a row mask and a column list together:
# rows with age above 10, keeping only the 'name' and 'age' columns.
df_old = df.loc[df['age'].gt(10), ['name', 'age']]

df_old

In [None]:
# isnull() flags missing values: True where 'marks' is NaN.
df["marks"].isnull()

In [None]:
# Use the isnull() mask to keep only the rows with no mark recorded.
df_missing_marks = df[df["marks"].isnull()]

df_missing_marks

In [None]:
# notna() is the opposite of isnull(): True where a value is present.
# Use it as a mask to keep only the rows that have a mark.
df_new = df[df["marks"].notna()]

df_new