# Basic data structures in pandas

In [16]:
import pandas as pd
import numpy as np

Pandas provides two types of classes for handling data:

# Series: 
    A one-dimensional labeled array holding data of any type such as integers, strings, Python objects etc.

# DataFrame:
    A two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

# Object Creation


Creating a Series by passing a list of values, letting pandas create a default RangeIndex.

In [17]:
s = pd.Series([1,3,5,np.nan,6,8])

In [11]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [18]:
dates = pd.date_range("20240824",periods=8)

In [19]:
dates

DatetimeIndex(['2024-08-24', '2024-08-25', '2024-08-26', '2024-08-27',
               '2024-08-28', '2024-08-29', '2024-08-30', '2024-08-31'],
              dtype='datetime64[ns]', freq='D')

In [20]:
    df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))

ValueError: Shape of passed values is (6, 4), indices imply (8, 4)

In [21]:
    df = pd.DataFrame(np.random.randn(8,4), index=dates, columns=list("ABCD"))

In [22]:
df

Unnamed: 0,A,B,C,D
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336
2024-08-28,1.813231,-0.503493,0.765582,-1.735425
2024-08-29,1.476087,1.068679,-0.855993,1.176333
2024-08-30,0.641927,-0.505017,0.978035,-0.221271
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704


Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [23]:
df2 = pd.DataFrame({
    "A":1.0,
})

ValueError: If using all scalar values, you must pass an index

In [24]:
df2 = pd.DataFrame({
    "A":1.0,
    "B":pd.TimeStamp("20240822"),
})

AttributeError: module 'pandas' has no attribute 'TimeStamp'

In [25]:
df2 = pd.DataFrame({
    "A":1.0,
    "B":pd.Timestamp("20240822"),
    "C":pd.Series(1,index=list(range(4),dtype="float32"),
    "D":np.array([3]*4, dtype="int32"),
    "F":"Foo"
})

SyntaxError: closing parenthesis '}' does not match opening parenthesis '(' on line 4 (3630114223.py, line 7)

In [26]:
df2 = pd.DataFrame({
    "A":1.0,
    "B":pd.Timestamp("20240822"),
    "C":pd.Series(1,index=list(range(4)),dtype="float32"),
    "D":np.array([3]*4, dtype="int32"),
    "F":"Foo"
})

In [27]:
df2

Unnamed: 0,A,B,C,D,F
0,1.0,2024-08-22,1.0,3,Foo
1,1.0,2024-08-22,1.0,3,Foo
2,1.0,2024-08-22,1.0,3,Foo
3,1.0,2024-08-22,1.0,3,Foo


In [28]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
F           object
dtype: object

# Viewing data

# Use DataFrame.head()  to view the top rows of frame

In [29]:
df.head

<bound method NDFrame.head of                    A         B         C         D
2024-08-24 -0.271736 -0.489213 -1.905769  0.604402
2024-08-25 -0.417909  0.153747  0.816140  2.077354
2024-08-26 -0.707825 -0.101105  0.107873 -0.582570
2024-08-27 -1.323593 -0.341142 -1.228217  0.246336
2024-08-28  1.813231 -0.503493  0.765582 -1.735425
2024-08-29  1.476087  1.068679 -0.855993  1.176333
2024-08-30  0.641927 -0.505017  0.978035 -0.221271
2024-08-31 -0.620606 -1.555799 -0.358539  1.095704>

In [30]:
df.head(3)

Unnamed: 0,A,B,C,D
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257


# Use  DataFrame.tail() to view the botoom rows of frame

In [31]:
df.tail

<bound method NDFrame.tail of                    A         B         C         D
2024-08-24 -0.271736 -0.489213 -1.905769  0.604402
2024-08-25 -0.417909  0.153747  0.816140  2.077354
2024-08-26 -0.707825 -0.101105  0.107873 -0.582570
2024-08-27 -1.323593 -0.341142 -1.228217  0.246336
2024-08-28  1.813231 -0.503493  0.765582 -1.735425
2024-08-29  1.476087  1.068679 -0.855993  1.176333
2024-08-30  0.641927 -0.505017  0.978035 -0.221271
2024-08-31 -0.620606 -1.555799 -0.358539  1.095704>

In [32]:
df.tail(4)

Unnamed: 0,A,B,C,D
2024-08-28,1.813231,-0.503493,0.765582,-1.735425
2024-08-29,1.476087,1.068679,-0.855993,1.176333
2024-08-30,0.641927,-0.505017,0.978035,-0.221271
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704


# To Display the DataFrame.index

In [33]:
df.index

DatetimeIndex(['2024-08-24', '2024-08-25', '2024-08-26', '2024-08-27',
               '2024-08-28', '2024-08-29', '2024-08-30', '2024-08-31'],
              dtype='datetime64[ns]', freq='D')

# To Display the DataFrame.columns

In [34]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

# Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels:()

In [35]:
df.to_numpy()

array([[-0.27173625, -0.48921347, -1.9057692 ,  0.60440215],
       [-0.41790881,  0.15374669,  0.81614015,  2.07735363],
       [-0.70782541, -0.10110512,  0.10787273, -0.58257   ],
       [-1.32359279, -0.34114155, -1.22821687,  0.24633594],
       [ 1.81323126, -0.50349287,  0.7655825 , -1.73542493],
       [ 1.4760875 ,  1.06867912, -0.85599286,  1.17633343],
       [ 0.64192674, -0.50501681,  0.97803454, -0.22127124],
       [-0.62060561, -1.55579859, -0.35853857,  1.09570366]])

# NumPy arrays have one dtype for the entire array while pandas DataFrames have one dtype per column
 When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. If the common data type is object, DataFrame.to_numpy() will require copying data.


In [36]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
F           object
dtype: object

# describe() shows a quick statistic summary of your data

In [37]:
df.describe()

Unnamed: 0,A,B,C,D
count,8.0,8.0,8.0,8.0
mean,0.073697,-0.284168,-0.210111,0.332608
std,1.116232,0.738211,1.060254,1.185672
min,-1.323593,-1.555799,-1.905769,-1.735425
25%,-0.642411,-0.503874,-0.949049,-0.311596
50%,-0.344823,-0.415178,-0.125333,0.425369
75%,0.850467,-0.037392,0.778222,1.115861
max,1.813231,1.068679,0.978035,2.077354


# Transposing your data:

In [38]:
df.T

Unnamed: 0,2024-08-24,2024-08-25,2024-08-26,2024-08-27,2024-08-28,2024-08-29,2024-08-30,2024-08-31
A,-0.271736,-0.417909,-0.707825,-1.323593,1.813231,1.476087,0.641927,-0.620606
B,-0.489213,0.153747,-0.101105,-0.341142,-0.503493,1.068679,-0.505017,-1.555799
C,-1.905769,0.81614,0.107873,-1.228217,0.765582,-0.855993,0.978035,-0.358539
D,0.604402,2.077354,-0.58257,0.246336,-1.735425,1.176333,-0.221271,1.095704


# DataFrame.sort_index() sorts by an axis:

In [39]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2024-08-24,0.604402,-1.905769,-0.489213,-0.271736
2024-08-25,2.077354,0.81614,0.153747,-0.417909
2024-08-26,-0.58257,0.107873,-0.101105,-0.707825
2024-08-27,0.246336,-1.228217,-0.341142,-1.323593
2024-08-28,-1.735425,0.765582,-0.503493,1.813231
2024-08-29,1.176333,-0.855993,1.068679,1.476087
2024-08-30,-0.221271,0.978035,-0.505017,0.641927
2024-08-31,1.095704,-0.358539,-1.555799,-0.620606


# Dataframe.sort_values() sorts by values:

In [41]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704
2024-08-30,0.641927,-0.505017,0.978035,-0.221271
2024-08-28,1.813231,-0.503493,0.765582,-1.735425
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-29,1.476087,1.068679,-0.855993,1.176333


# Selection

# Note
     standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, DataFrame.at(), DataFrame.iat(), DataFrame.loc() and DataFrame.iloc().

for more use See the indexing documentation Indexing and Selecting Data and MultiIndex / Advanced Indexing.

# Getitem ([])

For a DataFrame, passing a single label selects a columns and yields a Series equivalent to df.A:

In [48]:
df["A"]

2024-08-24   -0.271736
2024-08-25   -0.417909
2024-08-26   -0.707825
2024-08-27   -1.323593
2024-08-28    1.813231
2024-08-29    1.476087
2024-08-30    0.641927
2024-08-31   -0.620606
Freq: D, Name: A, dtype: float64

# For a DataFrame, passing a slice : selects matching rows:

In [49]:
df[0:3]

Unnamed: 0,A,B,C,D
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257


In [50]:
df["2024-08-25":"2024-08-31"]

Unnamed: 0,A,B,C,D
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336
2024-08-28,1.813231,-0.503493,0.765582,-1.735425
2024-08-29,1.476087,1.068679,-0.855993,1.176333
2024-08-30,0.641927,-0.505017,0.978035,-0.221271
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704


# Selection by label

See more in Selection by Label using DataFrame.loc() or DataFrame.at().

# Selecting a row matching a label

In [54]:
df.loc(date[0])

NameError: name 'date' is not defined

In [55]:
df.loc(dates[0])

ValueError: No axis named 2024-08-24 00:00:00 for object type DataFrame

In [56]:
df.loc[dates[0]]

A   -0.271736
B   -0.489213
C   -1.905769
D    0.604402
Name: 2024-08-24 00:00:00, dtype: float64

# Selecting all rows (:) with a select column labels:

In [57]:
df.loc[:, ["A","B"]]

Unnamed: 0,A,B
2024-08-24,-0.271736,-0.489213
2024-08-25,-0.417909,0.153747
2024-08-26,-0.707825,-0.101105
2024-08-27,-1.323593,-0.341142
2024-08-28,1.813231,-0.503493
2024-08-29,1.476087,1.068679
2024-08-30,0.641927,-0.505017
2024-08-31,-0.620606,-1.555799


the : slice all row and the selecting the cloumn that we want to selecte(A,B)

In [58]:
df.loc[:, :]

Unnamed: 0,A,B,C,D
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336
2024-08-28,1.813231,-0.503493,0.765582,-1.735425
2024-08-29,1.476087,1.068679,-0.855993,1.176333
2024-08-30,0.641927,-0.505017,0.978035,-0.221271
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704


In [59]:
df.loc[["2024-08-24", "2024-08-26	"], :]

Unnamed: 0,A,B,C,D
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257


In [60]:
df.loc[["2024-08-24", "2024-08-26"], ["C","D"]]

Unnamed: 0,C,D
2024-08-24,-1.905769,0.604402
2024-08-26,0.107873,-0.58257


# For label slicing, both endpoints are included:

In [61]:
df.loc["2024-08-24":"2024-08-28", ["A","D"]]

Unnamed: 0,A,D
2024-08-24,-0.271736,0.604402
2024-08-25,-0.417909,2.077354
2024-08-26,-0.707825,-0.58257
2024-08-27,-1.323593,0.246336
2024-08-28,1.813231,-1.735425


Selecting a single row and column label returns a scalar:

In [62]:
df.loc[dates[0], "A"]

-0.27173625228690595

# For getting fast access to a scalar (equivalent to the prior method):

In [63]:
df.at[dates[0], "A"]

-0.27173625228690595

# Selection by position

See more in Selection by Position using DataFrame.iloc() or DataFrame.iat().

# Select via the position of the passed integers:

In [64]:
df.ioc[3]

AttributeError: 'DataFrame' object has no attribute 'ioc'

In [65]:
df.iloc[3]

A   -1.323593
B   -0.341142
C   -1.228217
D    0.246336
Name: 2024-08-27 00:00:00, dtype: float64

# Integer slices acts similar to NumPy/Python:

In [66]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2024-08-27,-1.323593,-0.341142
2024-08-28,1.813231,-0.503493


# Lists of integer position locations:

In [68]:
df.iloc[[1,2,3,4],[0,3]]

Unnamed: 0,A,D
2024-08-25,-0.417909,2.077354
2024-08-26,-0.707825,-0.58257
2024-08-27,-1.323593,0.246336
2024-08-28,1.813231,-1.735425


# For slicing rows explicitly

In [70]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2024-08-25,-0.417909,0.153747,0.81614,2.077354
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257


# For slicing columns explicitly:

In [71]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2024-08-24,-0.489213,-1.905769
2024-08-25,0.153747,0.81614
2024-08-26,-0.101105,0.107873
2024-08-27,-0.341142,-1.228217
2024-08-28,-0.503493,0.765582
2024-08-29,1.068679,-0.855993
2024-08-30,-0.505017,0.978035
2024-08-31,-1.555799,-0.358539


# For getting a value explicitly:

In [72]:
df.iloc[1,1]

0.15374669340604555

In [73]:
df.iloc[2,3]

-0.5825699968617073

In [74]:
df.iloc[1,9]

IndexError: index 9 is out of bounds for axis 0 with size 4

In [75]:
df.iloc[4,3]

-1.7354249315201138

# For getting fast access to a scalar (equivalent to the prior method):

In [76]:
df.iat[1,1]

0.15374669340604555

# Boolean indexing

# Select rows where df.A is greater than 0.

In [78]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2024-08-28,1.813231,-0.503493,0.765582,-1.735425
2024-08-29,1.476087,1.068679,-0.855993,1.176333
2024-08-30,0.641927,-0.505017,0.978035,-0.221271


# Selecting values from a DataFrame where a boolean condition is met:

In [79]:
df[df > 0]

Unnamed: 0,A,B,C,D
2024-08-24,,,,0.604402
2024-08-25,,0.153747,0.81614,2.077354
2024-08-26,,,0.107873,
2024-08-27,,,,0.246336
2024-08-28,1.813231,,0.765582,
2024-08-29,1.476087,1.068679,,1.176333
2024-08-30,0.641927,,0.978035,
2024-08-31,,,,1.095704


# Using isin() method for filtering:

In [80]:
df2 = df.copy()

In [81]:
df2["E"] = ["one", "two", "three", "foure", "five", "six", "seven"]

ValueError: Length of values (7) does not match length of index (8)

In [83]:
df2["E"] = ["Zero","one", "two", "three", "foure", "five", "six", "seven"]

In [84]:
df2

Unnamed: 0,A,B,C,D,E
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402,Zero
2024-08-25,-0.417909,0.153747,0.81614,2.077354,one
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257,two
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336,three
2024-08-28,1.813231,-0.503493,0.765582,-1.735425,foure
2024-08-29,1.476087,1.068679,-0.855993,1.176333,five
2024-08-30,0.641927,-0.505017,0.978035,-0.221271,six
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704,seven


In [86]:
df2[df2["E"].isin(["two", "six"])]

Unnamed: 0,A,B,C,D,E
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257,two
2024-08-30,0.641927,-0.505017,0.978035,-0.221271,six


# Setting

# Setting a new coloum automaticaly aligns the dara by the indexes

In [88]:
s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range("20240828", periods=6 ))

In [89]:
s1

2024-08-28    1
2024-08-29    2
2024-08-30    3
2024-08-31    4
2024-09-01    5
2024-09-02    6
Freq: D, dtype: int64

In [90]:
df["f"]  = s1

In [91]:
df

Unnamed: 0,A,B,C,D,f
2024-08-24,-0.271736,-0.489213,-1.905769,0.604402,
2024-08-25,-0.417909,0.153747,0.81614,2.077354,
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257,
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336,
2024-08-28,1.813231,-0.503493,0.765582,-1.735425,1.0
2024-08-29,1.476087,1.068679,-0.855993,1.176333,2.0
2024-08-30,0.641927,-0.505017,0.978035,-0.221271,3.0
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704,4.0


# Setting values by label:

In [92]:
df.at[dates[0], "A"] = 0

In [93]:
df

Unnamed: 0,A,B,C,D,f
2024-08-24,0.0,-0.489213,-1.905769,0.604402,
2024-08-25,-0.417909,0.153747,0.81614,2.077354,
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257,
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336,
2024-08-28,1.813231,-0.503493,0.765582,-1.735425,1.0
2024-08-29,1.476087,1.068679,-0.855993,1.176333,2.0
2024-08-30,0.641927,-0.505017,0.978035,-0.221271,3.0
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704,4.0


# Setting values by position:

In [94]:
df.iat[0 , 1] = 0

In [95]:
df

Unnamed: 0,A,B,C,D,f
2024-08-24,0.0,0.0,-1.905769,0.604402,
2024-08-25,-0.417909,0.153747,0.81614,2.077354,
2024-08-26,-0.707825,-0.101105,0.107873,-0.58257,
2024-08-27,-1.323593,-0.341142,-1.228217,0.246336,
2024-08-28,1.813231,-0.503493,0.765582,-1.735425,1.0
2024-08-29,1.476087,1.068679,-0.855993,1.176333,2.0
2024-08-30,0.641927,-0.505017,0.978035,-0.221271,3.0
2024-08-31,-0.620606,-1.555799,-0.358539,1.095704,4.0


# Setting by assigning with a NumPy array:

In [96]:
df.loc[:, "D"]= np.array([5]*len(df))

In [97]:
df

Unnamed: 0,A,B,C,D,f
2024-08-24,0.0,0.0,-1.905769,5.0,
2024-08-25,-0.417909,0.153747,0.81614,5.0,
2024-08-26,-0.707825,-0.101105,0.107873,5.0,
2024-08-27,-1.323593,-0.341142,-1.228217,5.0,
2024-08-28,1.813231,-0.503493,0.765582,5.0,1.0
2024-08-29,1.476087,1.068679,-0.855993,5.0,2.0
2024-08-30,0.641927,-0.505017,0.978035,5.0,3.0
2024-08-31,-0.620606,-1.555799,-0.358539,5.0,4.0


# where operation with setting:

In [98]:
df2 = df.copy()

In [99]:
df2[df2>0] = -df2

In [100]:
df2

Unnamed: 0,A,B,C,D,f
2024-08-24,0.0,0.0,-1.905769,-5.0,
2024-08-25,-0.417909,-0.153747,-0.81614,-5.0,
2024-08-26,-0.707825,-0.101105,-0.107873,-5.0,
2024-08-27,-1.323593,-0.341142,-1.228217,-5.0,
2024-08-28,-1.813231,-0.503493,-0.765582,-5.0,-1.0
2024-08-29,-1.476087,-1.068679,-0.855993,-5.0,-2.0
2024-08-30,-0.641927,-0.505017,-0.978035,-5.0,-3.0
2024-08-31,-0.620606,-1.555799,-0.358539,-5.0,-4.0
