# The Pandas DataFrame: Make Working With Data Delightful

Article link: [https://realpython.com/pandas-dataframe/](https://realpython.com/pandas-dataframe/)  

In this tutorial, you’ll learn:

* What a Pandas DataFrame is and how to create one
* How to access, modify, add, sort, filter, and delete data
* How to handle missing values
* How to work with time-series data
* How to quickly visualize data

In [10]:
import pandas as pd
import numpy as np

## Intro

In [11]:
# Create a sample DataFrame from a dictionary: each key becomes a column
# and each list holds that column's values, one entry per row.
row_labels = list('abcde')  # custom row labels, one per row

user_data = dict(
    name=['Ananth', 'John', 'Wong', 'Souza', 'Jane'],
    age=[21, 27, 24, 33, 19],
    skill=['data science', 'machine learning', 'front end designing', 'UI and UX', '3D modelling'],
    country=['Sri Lanka', 'United Kingdom', 'China', 'Netherlands', 'USA'],
    disabled=[True, False, False, False, True],
)

df0 = pd.DataFrame(data=user_data, index=row_labels)
display(df0)

Unnamed: 0,name,age,skill,country,disabled
a,Ananth,21,data science,Sri Lanka,True
b,John,27,machine learning,United Kingdom,False
c,Wong,24,front end designing,China,False
d,Souza,33,UI and UX,Netherlands,False
e,Jane,19,3D modelling,USA,True


In [12]:
# if the column name is a valid Python identifier, the column can be accessed using dot notation
# (here `df0.skill` selects the 'skill' column and returns it as a Series)
display(df0.skill)

a           data science
b       machine learning
c    front end designing
d              UI and UX
e           3D modelling
Name: skill, dtype: object

In [13]:
# a column can be indexed by row label, like a dictionary key
display(df0.skill['c'])

# this won't work: square brackets on the DataFrame itself look up a column label,
# and 'b' is a row label, not a column
# display(df0['b'])

# instead, loc can be used to select a whole row by its label
df0.loc['c']

'front end designing'

name                       Wong
age                          24
skill       front end designing
country                   China
disabled                  False
Name: c, dtype: object

In [14]:
# Same data again, this time without explicit row labels, so pandas numbers the rows itself
df = pd.DataFrame(data=user_data)

## Creating a DataFrame

In [15]:
# From a dictionary: keys become the column names. A value may be a list,
# a NumPy array, or a single scalar that pandas repeats for every row.
dict1 = dict(
    x=[1, 2, 3],
    y=np.array([2, 4, 6]),
    z=100,
)

df1 = pd.DataFrame(data=dict1)
display(df1)


Unnamed: 0,x,y,z
0,1,2,100
1,2,4,100
2,3,6,100


In [16]:
# from a list of dictionaries: each dictionary is one row and its keys are the column names
list2 = [
    {'x': 1, 'y': 2, 'z': 3},
    {'x': 4, 'y': 5, 'z': 6},
    {'x': 7, 'y': 8, 'z': 9}
]
df2 = pd.DataFrame(list2)
display(df2)


# this one won't work properly: repeating a key inside one dict literal keeps only
# the last value, so the rows are really {'x': 3}, {'y': 6} and {'z': 9}.
# Every row is then missing two of the three columns, which show up as NaN.
list3 = [
    {'x': 1, 'x': 2, 'x': 3},
    {'y': 4, 'y': 5, 'y': 6},
    {'z': 7, 'z': 8, 'z': 9}
]
df3 = pd.DataFrame(list3)
display(df3)


# from a list of lists: each inner list is one row
list4 = [
    [11, 22, 33],
    [14, 25, 36],
    [17, 30, 39]
]
# column names have to be passed in explicitly, otherwise they would be
# auto-named using integers, like the index
df4 = pd.DataFrame(list4, columns=['x', 'y', 'z'])
display(df4)

Unnamed: 0,x,y,z
0,1,2,3
1,4,5,6
2,7,8,9


Unnamed: 0,x,y,z
0,3.0,,
1,,6.0,
2,,,9.0


Unnamed: 0,x,y,z
0,11,22,33
1,14,25,36
2,17,30,39


In [17]:
# From a NumPy array.
# Only a 2-dimensional array works here; an array with more dimensions makes
# the DataFrame constructor raise a ValueError.
rows = [
    [1, 10, 100],
    [2, 20, 200],
    [3, 30, 300],
]
arr5 = np.array(rows)

# Counter-example (deliberately never passed to pd.DataFrame): three stacked
# copies of the same rows give a 3-dimensional array. Passing it to the
# DataFrame constructor function would fail with
# ValueError: Must pass 2-d input. shape=(3, 3, 3)
arr_err1 = np.array([rows, rows, rows])

df5 = pd.DataFrame(data=arr5, columns=['x', 'y', 'z'])
display(df5)

# NOTE: depending on the pandas version, pd.DataFrame() may reuse the array's
# memory instead of copying it, in which case modifying arr5 afterwards also
# changes the values seen through df5. Passing `copy=True` always gives the
# DataFrame its own independent copy.

Unnamed: 0,x,y,z
0,1,10,100
1,2,20,200
2,3,30,300


In [18]:
# Round-trip through disk: write df0 out as CSV, then load it back into a new frame.
csv_path = './user_data.csv'
df0.to_csv(csv_path)

# to_csv wrote the row labels as the first column; index_col=0 turns that
# column back into the index instead of treating it as ordinary data
dfuser = pd.read_csv(csv_path, index_col=0)
display(dfuser)

FileNotFoundError: [Errno 2] No such file or directory: '.\\user_data.csv'

## Retrieving Labels and Data

In [None]:
# get index (the row labels)
print(dfuser.index)

# get column names
print(dfuser.columns)

# set index: assigning a new sequence replaces every row label at once.
# The sequence needs exactly one label per row, so its length is taken from the
# frame itself instead of hard-coding the row count (labels run 1..len(dfuser)).
dfuser.index = np.arange(1, len(dfuser) + 1)
print(dfuser.index)

# a single element of df.index or df.columns cannot be edited in place; trying to do so raises a TypeError

In [None]:
# retrieving the data as a numpy array
# a pandas Series or DataFrame can be changed into a numpy array with the method .to_numpy() or the attribute .values
# however .to_numpy() is more flexible because it accepts options such as dtype and copy.
# pass copy=True if you want the values to be copied (the default, copy=False, may return a view of the underlying data)

print(dfuser.to_numpy())

print(type(dfuser.to_numpy()), type(dfuser.to_numpy()) is type(dfuser.values))

In [None]:
# get column datatypes
# (display() is needed because only a cell's last expression is shown automatically)
display(dfuser.dtypes)

# change one/more column's dtype
# astype() requires the target dtype: pass a single dtype to convert every column,
# or a {column: dtype} mapping to convert only the listed columns.
# It returns a new DataFrame and leaves dfuser itself unchanged.
dfuser.astype({'age': 'float64'}).dtypes