### Importing Libraries & Dataset Loading

In [1]:
import pandas as pd
import numpy as np
import os

# Load dataset (adjust path if needed)
data_path = "../dataset/CDR_Dataset.csv"
df = pd.read_csv(data_path, parse_dates=["call_start_time"])

print("Dataset loaded:", df.shape)
df.head()


Dataset loaded: (50000, 16)


Unnamed: 0,call_id,caller_id,callee_id,call_start_time,call_duration,call_type,call_cost,cell_tower_id,imei,imsi,cost_per_sec,call_hour,distinct_callees_last_24h,tower_switch_rate,repeated_short_calls_last_1h,label
0,42011,9438451486,9247462263,2024-01-01 00:06:11,3,voice,0.89,7644,117390930401422,499245974166170,0.2967,0,0,0.0,0,Not Fraud
1,14625,9117869328,9341768522,2024-01-01 00:06:38,101,VoIP,0.22,3461,918215718689853,838197234558148,0.0022,0,0,0.0,0,Not Fraud
2,34387,7619295089,9159834527,2024-01-01 00:11:14,50,voice,0.61,9586,468201770667381,653142698770788,0.0122,0,0,0.0,0,Not Fraud
3,12362,8967555928,9715071175,2024-01-01 00:11:46,25,voice,0.12,1908,898990439643248,940632686937075,0.0048,0,0,0.0,0,Not Fraud
4,20275,7792852021,8840717793,2024-01-01 00:13:10,228,voice,4.35,5774,843821226875108,620371213723727,0.0191,0,0,0.0,0,Not Fraud


### Basic datetime and call-type features

In [2]:
# Ensure datetime type
df["call_start_time"] = pd.to_datetime(df["call_start_time"], errors="coerce")

# Binary flag for international and VoIP
df["is_international"] = (df["call_type"].str.lower() == "international").astype(int)
df["is_voip"] = (df["call_type"].str.lower() == "voip").astype(int)
df["is_roaming"] = (df["call_type"].str.lower() == "roaming").astype(int)

# Extract additional time components
df["call_day"] = df["call_start_time"].dt.day
df["call_hour"] = df["call_start_time"].dt.hour
df["is_night_call"] = ((df["call_hour"] >= 22) | (df["call_hour"] < 6)).astype(int)

print("Basic call-type and time features created")
df[["call_start_time", "call_type", "is_international", "is_voip", "is_roaming", "is_night_call"]].head()


Basic call-type and time features created


Unnamed: 0,call_start_time,call_type,is_international,is_voip,is_roaming,is_night_call
0,2024-01-01 00:06:11,voice,0,0,0,1
1,2024-01-01 00:06:38,VoIP,0,1,0,1
2,2024-01-01 00:11:14,voice,0,0,0,1
3,2024-01-01 00:11:46,voice,0,0,0,1
4,2024-01-01 00:13:10,voice,0,0,0,1


### Rolling duration features

In [4]:
# Rolling average call duration per caller 
df["avg_duration_by_caller"] = (
    df.groupby("caller_id")["call_duration"].transform(lambda x: x.expanding().mean())
)

# Absolute deviation from caller's average
df["deviation_from_avg_duration"] = (df["call_duration"] - df["avg_duration_by_caller"]).abs()

# Log-transform call duration to reduce skew
df["call_duration_log"] = np.log1p(df["call_duration"])

print("Duration and deviation-based features added")
df[["caller_id", "call_duration", "avg_duration_by_caller", "deviation_from_avg_duration"]].head()


Duration and deviation-based features added


Unnamed: 0,caller_id,call_duration,avg_duration_by_caller,deviation_from_avg_duration
0,9438451486,3,3.0,0.0
1,9117869328,101,101.0,0.0
2,7619295089,50,50.0,0.0
3,8967555928,25,25.0,0.0
4,7792852021,228,228.0,0.0


### Device & tower-based behavioral features

In [5]:
# Number of unique IMEIs per caller 
imei_counts = df.groupby("caller_id")["imei"].nunique()
df["imei_change_rate"] = df["caller_id"].map(imei_counts)

# Number of unique towers used per caller 
tower_counts = df.groupby("caller_id")["cell_tower_id"].nunique()
df["unique_tower_count"] = df["caller_id"].map(tower_counts)

# Ratio of tower switches
df["tower_switch_rate_norm"] = df["tower_switch_rate"] / (df["unique_tower_count"] + 1e-9)

print("Device & tower behavior features added")
df[["caller_id", "imei_change_rate", "unique_tower_count", "tower_switch_rate_norm"]].head()


Device & tower behavior features added


Unnamed: 0,caller_id,imei_change_rate,unique_tower_count,tower_switch_rate_norm
0,9438451486,1,1,0.0
1,9117869328,1,1,0.0
2,7619295089,1,1,0.0
3,8967555928,1,1,0.0
4,7792852021,1,1,0.0


### Short-call and call frequency features

In [6]:
# Flag short calls (<10s)
df["short_call"] = (df["call_duration"] < 10).astype(int)

# Short call ratio per caller
short_call_ratio = df.groupby("caller_id")["short_call"].mean()
df["repeated_short_calls"] = df["caller_id"].map(short_call_ratio)

# Total calls and average duration per caller
caller_agg = df.groupby("caller_id").agg({
    "call_id": "count",
    "call_duration": "mean",
    "call_cost": "sum",
    "short_call": "mean"
}).reset_index()

caller_agg.columns = ["caller_id", "total_calls", "avg_call_duration_caller", "total_spent", "short_call_ratio_caller"]
df = df.merge(caller_agg, on="caller_id", how="left")

print("Call frequency & short-call stats added")
df.head()


Call frequency & short-call stats added


Unnamed: 0,call_id,caller_id,callee_id,call_start_time,call_duration,call_type,call_cost,cell_tower_id,imei,imsi,...,call_duration_log,imei_change_rate,unique_tower_count,tower_switch_rate_norm,short_call,repeated_short_calls,total_calls,avg_call_duration_caller,total_spent,short_call_ratio_caller
0,42011,9438451486,9247462263,2024-01-01 00:06:11,3,voice,0.89,7644,117390930401422,499245974166170,...,1.386294,1,1,0.0,1,1.0,1,3.0,0.89,1.0
1,14625,9117869328,9341768522,2024-01-01 00:06:38,101,VoIP,0.22,3461,918215718689853,838197234558148,...,4.624973,1,1,0.0,0,0.0,1,101.0,0.22,0.0
2,34387,7619295089,9159834527,2024-01-01 00:11:14,50,voice,0.61,9586,468201770667381,653142698770788,...,3.931826,1,1,0.0,0,0.0,1,50.0,0.61,0.0
3,12362,8967555928,9715071175,2024-01-01 00:11:46,25,voice,0.12,1908,898990439643248,940632686937075,...,3.258097,1,1,0.0,0,0.0,1,25.0,0.12,0.0
4,20275,7792852021,8840717793,2024-01-01 00:13:10,228,voice,4.35,5774,843821226875108,620371213723727,...,5.433722,1,1,0.0,0,0.0,1,228.0,4.35,0.0


### Cost and efficiency ratios

In [7]:
# Cost per second ratio
df["cost_per_sec_ratio"] = df["call_cost"] / (df["call_duration"] + 1e-9)

# Deviation of cost_per_sec from caller mean
caller_cost_mean = df.groupby("caller_id")["cost_per_sec_ratio"].transform("mean")
df["deviation_cost_per_sec"] = (df["cost_per_sec_ratio"] - caller_cost_mean).abs()

# High cost per sec flag (possible premium routing)
df["is_high_cost_call"] = (df["cost_per_sec_ratio"] > df["cost_per_sec_ratio"].quantile(0.99)).astype(int)

print("Cost and efficiency features created")
df[["call_cost", "call_duration", "cost_per_sec_ratio", "is_high_cost_call"]].head()


Cost and efficiency features created


Unnamed: 0,call_cost,call_duration,cost_per_sec_ratio,is_high_cost_call
0,0.89,3,0.296667,0
1,0.22,101,0.002178,0
2,0.61,50,0.0122,0
3,0.12,25,0.0048,0
4,4.35,228,0.019079,0


### Network & relationship features

In [9]:
# Distinct callees per caller (network size)
distinct_callees = df.groupby("caller_id")["callee_id"].nunique()
df["distinct_callees_count"] = df["caller_id"].map(distinct_callees)

# Reciprocal calls: check if callee also called back (basic reciprocity)
caller_callee_pairs = set(zip(df["caller_id"], df["callee_id"]))
df["is_reciprocal_call"] = df.apply(lambda x: (x["callee_id"], x["caller_id"]) in caller_callee_pairs, axis=1).astype(int)

print("Network-based features added")
df[["caller_id", "callee_id", "distinct_callees_count", "is_reciprocal_call"]].head()


Network-based features added


Unnamed: 0,caller_id,callee_id,distinct_callees_count,is_reciprocal_call
0,9438451486,9247462263,1,0
1,9117869328,9341768522,1,0
2,7619295089,9159834527,1,0
3,8967555928,9715071175,1,0
4,7792852021,8840717793,1,0


### Fraud rule-based pattern flags

In [10]:
df["is_short_and_international"] = ((df["call_duration"] < 10) & (df["is_international"] == 1)).astype(int)
df["is_high_cost_international"] = ((df["is_international"] == 1) & (df["is_high_cost_call"] == 1)).astype(int)
df["is_high_tower_switch"] = (df["tower_switch_rate"] > 0.7).astype(int)
df["is_device_swap_high"] = (df["imei_change_rate"] > df["imei_change_rate"].quantile(0.95)).astype(int)

print("Fraud pattern flags created")
df[["is_short_and_international", "is_high_cost_international", "is_high_tower_switch", "is_device_swap_high"]].head()


Fraud pattern flags created


Unnamed: 0,is_short_and_international,is_high_cost_international,is_high_tower_switch,is_device_swap_high
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


### Label encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

le_type = LabelEncoder()
df["call_type_encoded"] = le_type.fit_transform(df["call_type"].astype(str))

# Binary label column
df["label_bin"] = df["label"].str.lower().map({"fraud": 1, "not fraud": 0})

print("Label encoding and final prep done")
df[["call_type", "call_type_encoded", "label", "label_bin"]].head()


Label encoding and final prep done


Unnamed: 0,call_type,call_type_encoded,label,label_bin
0,voice,3,Not Fraud,0
1,VoIP,0,Not Fraud,0
2,voice,3,Not Fraud,0
3,voice,3,Not Fraud,0
4,voice,3,Not Fraud,0


### Exporting 

In [14]:
output_path = "../dataset/CDR_feature_engineered.csv"

new_features = [
    col for col in df.columns
    if any(x in col for x in [
        "is_", "avg_", "ratio", "deviation", "count", "unique", "log", "cost", "tower", "short"
    ])
]

print("Total new features added:", len(new_features))
print("Sample engineered columns:\n", new_features[:15])

df.to_csv(output_path, index=False)
print(f"Feature-engineered dataset saved to: {output_path}")


Total new features added: 28
Sample engineered columns:
 ['call_duration', 'call_cost', 'cell_tower_id', 'cost_per_sec', 'tower_switch_rate', 'repeated_short_calls_last_1h', 'is_international', 'is_voip', 'is_roaming', 'is_night_call', 'avg_duration_by_caller', 'deviation_from_avg_duration', 'call_duration_log', 'unique_tower_count', 'tower_switch_rate_norm']
Feature-engineered dataset saved to: ../dataset/CDR_feature_engineered.csv
