# Pandas-5: Miscellaneous Operations on DataFrame


### Table of Contents

* Apply a Built-In or User Defined Function to a DataFrame using apply() and map()
* Date & Time in Pandas DataFrame - TimeStamp and TimeDelta
* Operations on Date and Time

In [1]:
# import
import numpy as np
import pandas as pd

**df.apply() method**

In [2]:
# Demo metrics for ten familiar models; each list is ordered to line up with "model_name".
data = dict(
    model_name=[
        "Logistic Regression", "Decision Tree", "Random Forest", "XGBoost", "LightGBM",
        "KNN", "Naive Bayes", "SVM", "CNN", "LSTM",
    ],
    execution_time=[0.5, 0.8, 2.3, 3.1, 2.8, 1.2, 0.6, 1.9, 4.5, 5.0],  # seconds
    model_size=[2.3, 5.6, 45.2, 60.1, 55.8, 8.4, 1.2, 12.3, 120.5, 150.7],  # MB
    accuracy=[0.82, 0.78, 0.89, 0.91, 0.90, 0.76, 0.74, 0.88, 0.93, 0.92],
    f1_score=[0.80, 0.75, 0.87, 0.90, 0.89, 0.74, 0.72, 0.86, 0.92, 0.91],
    precision=[0.81, 0.76, 0.88, 0.91, 0.90, 0.75, 0.73, 0.87, 0.93, 0.92],
    recall=[0.79, 0.74, 0.86, 0.89, 0.88, 0.73, 0.71, 0.85, 0.91, 0.90],
)

In [3]:
# Build the DataFrame from the dict of lists (one column per key) and display it
df = pd.DataFrame(data)
df

Unnamed: 0,model_name,execution_time,model_size,accuracy,f1_score,precision,recall
0,Logistic Regression,0.5,2.3,0.82,0.8,0.81,0.79
1,Decision Tree,0.8,5.6,0.78,0.75,0.76,0.74
2,Random Forest,2.3,45.2,0.89,0.87,0.88,0.86
3,XGBoost,3.1,60.1,0.91,0.9,0.91,0.89
4,LightGBM,2.8,55.8,0.9,0.89,0.9,0.88
5,KNN,1.2,8.4,0.76,0.74,0.75,0.73
6,Naive Bayes,0.6,1.2,0.74,0.72,0.73,0.71
7,SVM,1.9,12.3,0.88,0.86,0.87,0.85
8,CNN,4.5,120.5,0.93,0.92,0.93,0.91
9,LSTM,5.0,150.7,0.92,0.91,0.92,0.9


In [4]:
# Summarize column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   model_name      10 non-null     object 
 1   execution_time  10 non-null     float64
 2   model_size      10 non-null     float64
 3   accuracy        10 non-null     float64
 4   f1_score        10 non-null     float64
 5   precision       10 non-null     float64
 6   recall          10 non-null     float64
dtypes: float64(6), object(1)
memory usage: 692.0+ bytes


**Apply User Defined Functions**

In [5]:
# Define a function which converts execution time from second to milli-seconds
def testfunc(x):
    model_name = "_".join(x.split())
    model_name = model_name.lower()
    return model_name

In [6]:
# convert each model name to lowercase and replace spaces with underscores
df['model_name'].apply(testfunc)

0    logistic_regression
1          decision_tree
2          random_forest
3                xgboost
4               lightgbm
5                    knn
6            naive_bayes
7                    svm
8                    cnn
9                   lstm
Name: model_name, dtype: object

In [9]:
# same transformation using Series.map() instead of apply()
df['model_name'].map(testfunc)

0    logistic_regression
1          decision_tree
2          random_forest
3                xgboost
4               lightgbm
5                    knn
6            naive_bayes
7                    svm
8                    cnn
9                   lstm
Name: model_name, dtype: object

**apply vs map**
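* .map() -> Element-wise. `Series.map()` transforms each value of a Series (pandas 2.1+ also offers `DataFrame.map()` for element-wise work on a whole DataFrame).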
* .apply() -> Works on Series or DataFrame, more flexible (row/column-wise).

In [10]:
# convert execution time from seconds to milliseconds - using apply
df['execution_time'].apply(lambda x: x*1000) # the lambda is called once per value (element-wise)

0     500.0
1     800.0
2    2300.0
3    3100.0
4    2800.0
5    1200.0
6     600.0
7    1900.0
8    4500.0
9    5000.0
Name: execution_time, dtype: float64

In [11]:
# convert execution time from seconds to milliseconds - using map
df['execution_time'].map(lambda x: x*1000) # the lambda is called once per value (element-wise)

0     500.0
1     800.0
2    2300.0
3    3100.0
4    2800.0
5    1200.0
6     600.0
7    1900.0
8    4500.0
9    5000.0
Name: execution_time, dtype: float64

**Calculate Model Score**

In [12]:
# Preview the first five rows
df.head()

Unnamed: 0,model_name,execution_time,model_size,accuracy,f1_score,precision,recall
0,Logistic Regression,0.5,2.3,0.82,0.8,0.81,0.79
1,Decision Tree,0.8,5.6,0.78,0.75,0.76,0.74
2,Random Forest,2.3,45.2,0.89,0.87,0.88,0.86
3,XGBoost,3.1,60.1,0.91,0.9,0.91,0.89
4,LightGBM,2.8,55.8,0.9,0.89,0.9,0.88


In [13]:
# model_score = 0.8*f1_score + 0.1*(1/execution_time) + 0.1*(1/model_size)
# Faster and smaller models get a bonus through the two reciprocal terms.
f1_term = 0.8 * df["f1_score"]
speed_term = 0.1 * (1 / df["execution_time"])
size_term = 0.1 * (1 / df["model_size"])
df["model_score"] = f1_term + speed_term + size_term

In [14]:
# Display the frame, which now carries the model_score column
df

Unnamed: 0,model_name,execution_time,model_size,accuracy,f1_score,precision,recall,model_score
0,Logistic Regression,0.5,2.3,0.82,0.8,0.81,0.79,0.883478
1,Decision Tree,0.8,5.6,0.78,0.75,0.76,0.74,0.742857
2,Random Forest,2.3,45.2,0.89,0.87,0.88,0.86,0.741691
3,XGBoost,3.1,60.1,0.91,0.9,0.91,0.89,0.753922
4,LightGBM,2.8,55.8,0.9,0.89,0.9,0.88,0.749506
5,KNN,1.2,8.4,0.76,0.74,0.75,0.73,0.687238
6,Naive Bayes,0.6,1.2,0.74,0.72,0.73,0.71,0.826
7,SVM,1.9,12.3,0.88,0.86,0.87,0.85,0.748762
8,CNN,4.5,120.5,0.93,0.92,0.93,0.91,0.759052
9,LSTM,5.0,150.7,0.92,0.91,0.92,0.9,0.748664


In [15]:
# Even though Random Forest has a higher F1 score than Decision Tree, its overall model_score is lower because its execution time and model size are larger.

**Apply Built In Functions**

In [16]:
# Pass the built-in len() to apply() to get the character count of each model name
df['len_model_name']= df['model_name'].apply(len)
df

Unnamed: 0,model_name,execution_time,model_size,accuracy,f1_score,precision,recall,model_score,len_model_name
0,Logistic Regression,0.5,2.3,0.82,0.8,0.81,0.79,0.883478,19
1,Decision Tree,0.8,5.6,0.78,0.75,0.76,0.74,0.742857,13
2,Random Forest,2.3,45.2,0.89,0.87,0.88,0.86,0.741691,13
3,XGBoost,3.1,60.1,0.91,0.9,0.91,0.89,0.753922,7
4,LightGBM,2.8,55.8,0.9,0.89,0.9,0.88,0.749506,8
5,KNN,1.2,8.4,0.76,0.74,0.75,0.73,0.687238,3
6,Naive Bayes,0.6,1.2,0.74,0.72,0.73,0.71,0.826,11
7,SVM,1.9,12.3,0.88,0.86,0.87,0.85,0.748762,3
8,CNN,4.5,120.5,0.93,0.92,0.93,0.91,0.759052,3
9,LSTM,5.0,150.7,0.92,0.91,0.92,0.9,0.748664,4


In [17]:
# np.sqrt is already a callable, so hand it to apply() directly; a lambda wrapper adds nothing.
df['sqrt_precision'] = df['precision'].apply(np.sqrt)
df

Unnamed: 0,model_name,execution_time,model_size,accuracy,f1_score,precision,recall,model_score,len_model_name,sqrt_precision
0,Logistic Regression,0.5,2.3,0.82,0.8,0.81,0.79,0.883478,19,0.9
1,Decision Tree,0.8,5.6,0.78,0.75,0.76,0.74,0.742857,13,0.87178
2,Random Forest,2.3,45.2,0.89,0.87,0.88,0.86,0.741691,13,0.938083
3,XGBoost,3.1,60.1,0.91,0.9,0.91,0.89,0.753922,7,0.953939
4,LightGBM,2.8,55.8,0.9,0.89,0.9,0.88,0.749506,8,0.948683
5,KNN,1.2,8.4,0.76,0.74,0.75,0.73,0.687238,3,0.866025
6,Naive Bayes,0.6,1.2,0.74,0.72,0.73,0.71,0.826,11,0.8544
7,SVM,1.9,12.3,0.88,0.86,0.87,0.85,0.748762,3,0.932738
8,CNN,4.5,120.5,0.93,0.92,0.93,0.91,0.759052,3,0.964365
9,LSTM,5.0,150.7,0.92,0.91,0.92,0.9,0.748664,4,0.959166


In [18]:
# Sort rows by model_score, highest first; the sorted frame is only displayed, df is not reassigned
df.sort_values(by='model_score',ascending=False) #inplace=False by default

Unnamed: 0,model_name,execution_time,model_size,accuracy,f1_score,precision,recall,model_score,len_model_name,sqrt_precision
0,Logistic Regression,0.5,2.3,0.82,0.8,0.81,0.79,0.883478,19,0.9
6,Naive Bayes,0.6,1.2,0.74,0.72,0.73,0.71,0.826,11,0.8544
8,CNN,4.5,120.5,0.93,0.92,0.93,0.91,0.759052,3,0.964365
3,XGBoost,3.1,60.1,0.91,0.9,0.91,0.89,0.753922,7,0.953939
4,LightGBM,2.8,55.8,0.9,0.89,0.9,0.88,0.749506,8,0.948683
7,SVM,1.9,12.3,0.88,0.86,0.87,0.85,0.748762,3,0.932738
9,LSTM,5.0,150.7,0.92,0.91,0.92,0.9,0.748664,4,0.959166
1,Decision Tree,0.8,5.6,0.78,0.75,0.76,0.74,0.742857,13,0.87178
2,Random Forest,2.3,45.2,0.89,0.87,0.88,0.86,0.741691,13,0.938083
5,KNN,1.2,8.4,0.76,0.74,0.75,0.73,0.687238,3,0.866025


#### Date & Time

* In Pandas, datetime columns (whose elements are `Timestamp` objects) represent dates and times in a standardized format, which allows easy filtering, arithmetic, and plotting.
* Very useful data-type and also used in timeseries data.
* Benefits of datetime dtype: easy filtering, arithmetic, component extraction, and time series operations.

* `pandas.Timestamp()`:
1. Constructor for a single timestamp object (like Python’s datetime, but with more Pandas power).
2. Input: A string, datetime, or numeric value.
3. Output: A Timestamp object.
4. Best for: Working with one specific date/time.

In [26]:
# Create individual Timestamp objects from different kinds of input
ts1 = pd.Timestamp("2025-08-18 14:30:15") # date and time string
ts2 = pd.Timestamp("2025-08-18") # date-only string
ts3 = pd.Timestamp("18-08-2025") # day-month-year string
ts4 = pd.Timestamp(year=2025, month=8, day=18, hour=14, minute=30, second=15) # explicit components
# Print the four values, then the type of each one
print(ts1,"\n", ts2,"\n", ts3, "\n", ts4)
print(type(ts1),"\n", type(ts2),"\n", type(ts3),"\n", type(ts4))

2025-08-18 14:30:15 
 2025-08-18 00:00:00 
 2025-08-18 00:00:00 
 2025-08-18 14:30:15
<class 'pandas._libs.tslibs.timestamps.Timestamp'> 
 <class 'pandas._libs.tslibs.timestamps.Timestamp'> 
 <class 'pandas._libs.tslibs.timestamps.Timestamp'> 
 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


* `pandas.to_datetime()`: 
1. Purpose: Converts many kinds of inputs (strings, lists, arrays, Series, dict) into datetime-like values.
2. Input: Scalar (one value) OR iterable (list, Series).
3. Output: Timestamp (if scalar input), DatetimeIndex (if list/array input), Series of datetime64 (if Series input).
4. Best for: Parsing existing data into datetime format.

**DatetimeIndex - a sequence of Timestamps that can be used as an index for time series data.**

In [46]:
# Single string
print(pd.to_datetime("2025-08-18")) # scalar input -> a single Timestamp

# List of strings
print(pd.to_datetime(["2025-08-18", "2025-08-19"])) # list input -> DatetimeIndex

# Integer epoch values (seconds since 1970-01-01, as selected by unit="s")
print(pd.to_datetime([1692355200, 1692441600], unit="s")) # list input -> DatetimeIndex


2025-08-18 00:00:00
DatetimeIndex(['2025-08-18', '2025-08-19'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2023-08-18 10:40:00', '2023-08-19 10:40:00'], dtype='datetime64[ns]', freq=None)


* `pd.date_range()`: a function in Pandas that generates a sequence of dates (a DatetimeIndex) between a start and end date

In [47]:
# Generate 5 timestamps starting at ts1; no freq is passed, so the default spacing is used
ts1 = pd.Timestamp("2025-08-18 14:30:15")
pd.date_range(ts1, periods=5)

DatetimeIndex(['2025-08-18 14:30:15', '2025-08-19 14:30:15',
               '2025-08-20 14:30:15', '2025-08-21 14:30:15',
               '2025-08-22 14:30:15'],
              dtype='datetime64[ns]', freq='D')

In [48]:
# Build a range from two explicit endpoints
start = pd.Timestamp(year=2025, month=1, day=1)
end = pd.Timestamp(year=2025, month=1, day=5)
print(pd.date_range(start=start, end=end))

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05'],
              dtype='datetime64[ns]', freq='D')


* `pd.Timedelta()`: Represents a duration, i.e., the difference between two dates or times.
1. Used For: Adding/subtracting time intervals from dates.

In [49]:
# A Timedelta is a duration, built here from day and hour components
td = pd.Timedelta(days=5, hours=3)
print(td)
print(type(td))

5 days 03:00:00
<class 'pandas._libs.tslibs.timedeltas.Timedelta'>


In [50]:
# Add a duration to a Timestamp to shift it forward in time
ts = pd.Timestamp("2025-08-18 10:00:00")
new_ts = ts + pd.Timedelta(days=2, hours=5)
print(new_ts)

2025-08-20 15:00:00


**Creating a DataFrame with date columns**

In [51]:
# The dates are supplied as plain strings here; conversion to datetime happens in a later step.
# Note: this rebinds `data` and `df`, replacing the model-metrics objects used earlier.
data = {"date": ["2025-08-21", "2025-08-22", "2025-08-23"],
        "value": [10, 20, 30]}
df = pd.DataFrame(data)
df

Unnamed: 0,date,value
0,2025-08-21,10
1,2025-08-22,20
2,2025-08-23,30


In [52]:
# Check the column dtypes before converting the 'date' column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3 non-null      object
 1   value   3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


In [53]:
# Convert the 'date' column from strings to datetime, overwriting the column in place
df["date"] = pd.to_datetime(df["date"])
print(df)

        date  value
0 2025-08-21     10
1 2025-08-22     20
2 2025-08-23     30


In [54]:
# Re-check the dtypes to confirm the 'date' column was converted
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
 1   value   3 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 180.0 bytes


In [55]:
# Pull out a single cell (first row, first column) to inspect the scalar type stored in 'date'
df.iloc[0,0]

Timestamp('2025-08-21 00:00:00')

* So `datetime64[ns]` is the column dtype, while each individual element is a `Timestamp`.
* NumPy provides numpy.datetime64 to represent dates and times.
  
 **Pandas datetime = NumPy datetime64 + Timestamp wrapper + extra time-series features.**

Pandas does not use Python’s datetime directly for columns because:
- datetime objects are slower in vectorized operations (for the same reason a Python list of objects is slower than a NumPy array).
- datetime64[ns] allows fast, memory-efficient operations on entire columns.
<div style="text-align: center;">
    <img src="datetime.png" width="300"/>
</div>

**Operations on Time & Date**

In [56]:
# s: three consecutive daily dates starting 2025-01-01
s = pd.Series(pd.date_range('2025-1-1', periods=3, freq='D'))
print(s)
# td: durations of 0, 1 and 2 hours
td = pd.Series([ pd.Timedelta(hours=i) for i in range(3) ])
print(td)

0   2025-01-01
1   2025-01-02
2   2025-01-03
dtype: datetime64[ns]
0   0 days 00:00:00
1   0 days 01:00:00
2   0 days 02:00:00
dtype: timedelta64[ns]


In [57]:
# Combine the date Series (column A) and the duration Series (column B) into one frame
df = pd.DataFrame(dict(A = s, B = td))
print(df)

           A               B
0 2025-01-01 0 days 00:00:00
1 2025-01-02 0 days 01:00:00
2 2025-01-03 0 days 02:00:00


In [58]:
# Add the duration column to the date column; the sum is stored as new column C
df['C']=df['A']+df['B']
print(df)

           A               B                   C
0 2025-01-01 0 days 00:00:00 2025-01-01 00:00:00
1 2025-01-02 0 days 01:00:00 2025-01-02 01:00:00
2 2025-01-03 0 days 02:00:00 2025-01-03 02:00:00


**Filter operations on date-time columns**

In [59]:
# Sample data: five events on consecutive days; 'Date' is parsed with to_datetime up front
data = {
    'Event': ['A', 'B', 'C', 'D', 'E'],
    'Date': pd.to_datetime([
        '2025-08-18', '2025-08-19', '2025-08-20', 
        '2025-08-21', '2025-08-22'
    ])
}

df = pd.DataFrame(data)
print(df)


  Event       Date
0     A 2025-08-18
1     B 2025-08-19
2     C 2025-08-20
3     D 2025-08-21
4     E 2025-08-22


In [None]:
# Filter by specific date: compare the 'Date' column against a date string to build the row mask
df[df['Date'] == '2025-08-20']

In [60]:
# Filter by date range: keep rows whose Date lies between the two bounds, bounds included
df[df['Date'].between('2025-08-19', '2025-08-21')]

Unnamed: 0,Event,Date
1,B,2025-08-19
2,C,2025-08-20
3,D,2025-08-21


In [61]:
# Filter by day of week
# Series.dt.dayofweek numbers the days Monday = 0 ... Sunday = 6,
# so selecting Mondays means comparing against 0 (6 would select Sundays).
df[df['Date'].dt.dayofweek == 0]  # All Mondays

Unnamed: 0,Event,Date
0,A,2025-08-18


**Benefits of DateTimeIndex over regular column**

In [62]:
# Sample data: five consecutive daily dates with one value each
data = {
    'Date': pd.date_range('2025-08-18', periods=5, freq='D'),
    'Value': [10, 20, 30, 40, 50]
}

# DataFrame with regular datetime column
df_col = pd.DataFrame(data)

# DataFrame with DatetimeIndex: drop the 'Date' column, then use the same dates as the row index
df_index = df_col.drop(columns='Date')
df_index.index = data['Date']

# Print both layouts for comparison
print("DataFrame with datetime column:\n", df_col)
print("\nDataFrame with DatetimeIndex:\n", df_index)

DataFrame with datetime column:
         Date  Value
0 2025-08-18     10
1 2025-08-19     20
2 2025-08-20     30
3 2025-08-21     40
4 2025-08-22     50

DataFrame with DatetimeIndex:
             Value
2025-08-18     10
2025-08-19     20
2025-08-20     30
2025-08-21     40
2025-08-22     50


In [63]:
# Filtering a specific date

# Using datetime column: build a boolean mask by comparing the column to the date string
print(df_col[df_col['Date'] == '2025-08-20'])
print("="*50)
# Using DatetimeIndex: look the date string up directly as a row label with .loc
print(df_index.loc['2025-08-20'])

        Date  Value
2 2025-08-20     30
Value    30
Name: 2025-08-20 00:00:00, dtype: int64


In [64]:
# Slicing by date range

# Using datetime column: two comparisons combined with &
print(df_col[(df_col['Date'] >= '2025-08-19') & (df_col['Date'] <= '2025-08-21')])
print("="*50)
# Using DatetimeIndex: slice the rows with a start:end pair of date strings
print(df_index['2025-08-19':'2025-08-21'])

        Date  Value
1 2025-08-19     20
2 2025-08-20     30
3 2025-08-21     40
            Value
2025-08-19     20
2025-08-20     30
2025-08-21     40


---

Happy Learning ! Team DecodeAiML !!