In [1]:
# Credit Card Fraud Detection - Exploratory Data Analysis
# CodSoft ML Internship - Task 2
# Author: Chandan Kumar

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("CREDIT CARD FRAUD DETECTION - EXPLORATORY DATA ANALYSIS")
print("="*70)

CREDIT CARD FRAUD DETECTION - EXPLORATORY DATA ANALYSIS


In [2]:
# 1. LOAD DATASET

print("\nüìÇ Loading datasets...")
# Load training and test data
df_train = pd.read_csv('../data/fraudTrain.csv')
df_test = pd.read_csv('../data/fraudTest.csv')

print(f"‚úÖ Training data loaded: {df_train.shape}")
print(f"‚úÖ Test data loaded: {df_test.shape}")

# Combine for EDA (we'll split later for training)
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print(f"‚úÖ Dataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   Rows: {df.shape[0]:,}")
print(f"   Columns: {df.shape[1]}")



üìÇ Loading datasets...
‚úÖ Training data loaded: (1296675, 23)
‚úÖ Test data loaded: (555719, 23)
‚úÖ Dataset loaded successfully!
   Shape: (1852394, 23)
   Rows: 1,852,394
   Columns: 23


In [3]:
# 2. INITIAL DATA INSPECTION

print("\n" + "="*70)
print("DATA OVERVIEW")
print("="*70)

print("\nüìä First 5 rows:")
print(df.head())

print("\nüìã Dataset Info:")
print(df.info())

print("\nüìà Statistical Summary:")
print(df.describe())

print("\nüîç Column Name")
print(df.columns.tolist())


DATA OVERVIEW

üìä First 5 rows:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43

In [4]:
# 3. MISSING VALUES CHECK


print("\n" + "="*70)
print("MISSING VALUES ANALYSIS")
print("="*70)

missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing_Count': missing_values,
    'Percentage': missing_percent
})

print(missing_df[missing_df['Missing_Count'] > 0])

if missing_values.sum() == 0:
    print("‚úÖ No missing values found!")
else:
    print(f"‚ö†Ô∏è  Total missing values: {missing_values.sum()}")


MISSING VALUES ANALYSIS
Empty DataFrame
Columns: [Missing_Count, Percentage]
Index: []
‚úÖ No missing values found!


In [5]:
# 4. CLASS DISTRIBUTION (FRAUD vs LEGITIMATE)

print("\n" + "="*70)
print("CLASS DISTRIBUTION ANALYSIS")
print("="*70)

class_counts = df['is_fraud'].value_counts()
class_percentages = df['is_fraud'].value_counts(normalize=True) * 100

print("\nüìä Transaction Distribution:")
print(f"   Legitimate (0): {class_counts[0]:,} ({class_percentages[0]:.4f}%)")
print(f"   Fraudulent (1): {class_counts[1]:,} ({class_percentages[1]:.4f}%)")

fraud_ratio = class_counts[1] / class_counts[0]
print(f"\n‚ö†Ô∏è  Imbalance Ratio: 1:{1/fraud_ratio:.0f}")
print(f"   (For every 1 fraud, there are {1/fraud_ratio:.0f} legitimate transactions)")

# Visualization - Class Distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
axes[0].bar(['Legitimate', 'Fraudulent'], class_counts.values, 
            color=['green', 'red'], alpha=0.7)
axes[0].set_ylabel('Count')
axes[0].set_title('Transaction Class Distribution')
axes[0].set_yscale('log')  # Log scale due to imbalance
for i, v in enumerate(class_counts.values):
    axes[0].text(i, v, f'{v:,}', ha='center', va='bottom')

# Pie chart
colors = ['lightgreen', 'lightcoral']
axes[1].pie(class_counts.values, labels=['Legitimate', 'Fraudulent'], 
            autopct='%1.4f%%', colors=colors, startangle=90)
axes[1].set_title('Transaction Class Percentage')

plt.tight_layout()
plt.savefig('../images/class_distribution.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Visualization saved: ../images/class_distribution.png")
plt.close()


CLASS DISTRIBUTION ANALYSIS

üìä Transaction Distribution:
   Legitimate (0): 1,842,743 (99.4790%)
   Fraudulent (1): 9,651 (0.5210%)

‚ö†Ô∏è  Imbalance Ratio: 1:191
   (For every 1 fraud, there are 191 legitimate transactions)

‚úÖ Visualization saved: ../images/class_distribution.png


In [6]:
# 5. TIME ANALYSIS

print("\n" + "="*70)
print("TIME ANALYSIS")
print("="*70)

print(f"\n‚è±Ô∏è  Time Range:")
print(f"   Min: {df['unix_time'].min():.0f} seconds")
print(f"   Max: {df['unix_time'].max():.0f} seconds")
print(f"   Duration: {df['unix_time'].max() / 3600:.1f} hours")

# Convert time to hours
df['Time_Hour'] = df['unix_time'] / 3600
# Fraud distribution over time
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plt.hist(df[df['is_fraud'] == 0]['Time_Hour'], bins=50, alpha=0.7, 
         label='Legitimate', color='green')
plt.hist(df[df['is_fraud'] == 1]['Time_Hour'], bins=50, alpha=0.7, 
         label='Fraudulent', color='red')
plt.xlabel('Time (hours)')
plt.ylabel('Frequency')
plt.title('Transaction Distribution Over Time')
plt.legend()

plt.subplot(1, 2, 2)
fraud_by_hour = df[df['is_fraud'] == 1].groupby(df['Time_Hour'].astype(int)).size()
plt.plot(fraud_by_hour.index, fraud_by_hour.values, color='red', marker='o')
plt.xlabel('Time (hours)')
plt.ylabel('Fraud Count')
plt.title('Fraudulent Transactions Over Time')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../images/time_analysis.png', dpi=300, bbox_inches='tight')
print("‚úÖ Time analysis saved: ../images/time_analysis.png")
plt.close()


TIME ANALYSIS

‚è±Ô∏è  Time Range:
   Min: 1325376018 seconds
   Max: 1388534374 seconds
   Duration: 385704.0 hours
‚úÖ Time analysis saved: ../images/time_analysis.png


In [7]:
# 6. AMOUNT ANALYSIS

print("\n" + "="*70)
print("TRANSACTION AMOUNT ANALYSIS")
print("="*70)

print("\nüí∞ Amount Statistics:")
print(f"   Mean: ${df['amt'].mean():.2f}")
print(f"   Median: ${df['amt'].median():.2f}")
print(f"   Min: ${df['amt'].min():.2f}")
print(f"   Max: ${df['amt'].max():.2f}")
print(f"   Std Dev: ${df['amt'].std():.2f}")
print("\nüí∞ Amount by Class:")
for class_label in [0, 1]:
    class_name = "Legitimate" if class_label == 0 else "Fraudulent"
    class_data = df[df['is_fraud'] == class_label]['amt']
    print(f"\n   {class_name}:")
    print(f"      Mean: ${class_data.mean():.2f}")
    print(f"      Median: ${class_data.median():.2f}")
    print(f"      Max: ${class_data.max():.2f}")

# Visualization - Amount Distribution
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Overall distribution
axes[0, 0].hist(df['amt'], bins=50, color='blue', alpha=0.7)
axes[0, 0].set_xlabel('Amount ($)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Overall Amount Distribution')
axes[0, 0].set_xlim([0, 500])  # Focus on common range

# Legitimate vs Fraudulent
axes[0, 1].hist(df[df['is_fraud'] == 0]['amt'], bins=50, alpha=0.7, 
                label='Legitimate', color='green')
axes[0, 1].hist(df[df['is_fraud'] == 1]['amt'], bins=50, alpha=0.7, 
                label='Fraudulent', color='red')
axes[0, 1].set_xlabel('Amount ($)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Amount Distribution by Class')
axes[0, 1].set_xlim([0, 500])
axes[0, 1].legend()

# Box plots
box_data = [df[df['is_fraud'] == 0]['amt'], df[df['is_fraud'] == 1]['amt']]
axes[1, 0].boxplot(box_data, labels=['Legitimate', 'Fraudulent'])
axes[1, 0].set_ylabel('Amount ($)')
axes[1, 0].set_title('Amount Distribution (Box Plot)')
axes[1, 0].set_ylim([0, 500])

# Log scale comparison
axes[1, 1].hist(np.log1p(df[df['is_fraud'] == 0]['amt']), bins=50, 
                alpha=0.7, label='Legitimate', color='green')
axes[1, 1].hist(np.log1p(df[df['is_fraud'] == 1]['amt']), bins=50, 
                alpha=0.7, label='Fraudulent', color='red')
axes[1, 1].set_xlabel('Log(Amount + 1)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Amount Distribution (Log Scale)')
axes[1, 1].legend()

plt.tight_layout()
plt.savefig('../images/amount_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Amount analysis saved: ../images/amount_analysis.png")
plt.close()


TRANSACTION AMOUNT ANALYSIS

üí∞ Amount Statistics:
   Mean: $70.06
   Median: $47.45
   Min: $1.00
   Max: $28948.90
   Std Dev: $159.25

üí∞ Amount by Class:

   Legitimate:
      Mean: $67.65
      Median: $47.24
      Max: $28948.90

   Fraudulent:
      Mean: $530.66
      Median: $390.00
      Max: $1376.04

‚úÖ Amount analysis saved: ../images/amount_analysis.png


In [9]:
# ============================================================
# 7. FEATURE ANALYSIS (NUMERICAL FEATURES)
# ============================================================

print("\n" + "="*70)
print("NUMERICAL FEATURE ANALYSIS")
print("="*70)

# Select numerical features (excluding target)
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
numeric_features = numeric_features.drop('is_fraud')

print(f"\nüìä Number of numerical features: {len(numeric_features)}")

# Statistical summary
print("\nüìà Numerical Features Statistics:")
print(df[numeric_features].describe())

# Correlation with fraud
print("\nüîç Top 10 Features Correlated with Fraud:")
correlations = df[numeric_features].corrwith(df['is_fraud']).abs().sort_values(ascending=False)
print(correlations.head(10))

# Visualization - Top correlated features
top_features = correlations.head(6).index.tolist()

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, feature in enumerate(top_features):
    axes[idx].hist(
        df[df['is_fraud'] == 0][feature],
        bins=50, alpha=0.7, label='Legitimate', density=True
    )
    axes[idx].hist(
        df[df['is_fraud'] == 1][feature],
        bins=50, alpha=0.7, label='Fraudulent', density=True
    )
    axes[idx].set_title(f'{feature}\nCorr: {correlations[feature]:.3f}')
    axes[idx].set_xlabel('Value')
    axes[idx].legend()

plt.tight_layout()
plt.savefig('../images/top_numerical_features.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Feature analysis saved: ../images/top_numerical_features.png")
plt.close()


NUMERICAL FEATURE ANALYSIS

üìä Number of numerical features: 11

üìà Numerical Features Statistics:
         Unnamed: 0        cc_num           amt           zip           lat  \
count  1.852394e+06  1.852394e+06  1.852394e+06  1.852394e+06  1.852394e+06   
mean   5.371934e+05  4.173860e+17  7.006357e+01  4.881326e+04  3.853931e+01   
std    3.669110e+05  1.309115e+18  1.592540e+02  2.688185e+04  5.071470e+00   
min    0.000000e+00  6.041621e+10  1.000000e+00  1.257000e+03  2.002710e+01   
25%    2.315490e+05  1.800429e+14  9.640000e+00  2.623700e+04  3.466890e+01   
50%    4.630980e+05  3.521417e+15  4.745000e+01  4.817400e+04  3.935430e+01   
75%    8.335758e+05  4.642255e+15  8.310000e+01  7.204200e+04  4.194040e+01   
max    1.296674e+06  4.992346e+18  2.894890e+04  9.992100e+04  6.669330e+01   

               long      city_pop     unix_time     merch_lat    merch_long  \
count  1.852394e+06  1.852394e+06  1.852394e+06  1.852394e+06  1.852394e+06   
mean  -9.022783e+01  8.864

In [11]:
# 8. CORRELATION MATRIX (Numeric Features Only)

print("\n" + "="*70)
print("CORRELATION ANALYSIS")
print("="*70)

# Select numeric columns only
numeric_df = df.select_dtypes(include=['int64', 'float64'])

# Ensure target exists
assert 'is_fraud' in numeric_df.columns, "Target column 'is_fraud' not found"

# Correlation with target
feature_correlations = (
    numeric_df
    .corr()['is_fraud']
    .abs()
    .sort_values(ascending=False)
)

print("\nüî• Features Most Correlated with Fraud:")
print(feature_correlations.drop('is_fraud').head(15))

# Select top features (+ target)
top_features = feature_correlations.drop('is_fraud').head(15).index.tolist()
top_features.append('is_fraud')

# Correlation matrix
correlation_matrix = numeric_df[top_features].corr()

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5
)

plt.title('Correlation Matrix - Top Features vs Fraud')
plt.tight_layout()

# Ensure directory exists before saving
import os
os.makedirs('../images', exist_ok=True)

plt.savefig('../images/correlation_matrix.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Correlation matrix saved: ../images/correlation_matrix.png")

plt.close()


CORRELATION ANALYSIS

üî• Features Most Correlated with Fraud:
amt           0.209308
unix_time     0.013329
Time_Hour     0.013329
lat           0.002904
merch_lat     0.002778
zip           0.002190
cc_num        0.001125
long          0.001022
merch_long    0.000999
Unnamed: 0    0.000524
city_pop      0.000325
Name: is_fraud, dtype: float64

‚úÖ Correlation matrix saved: ../images/correlation_matrix.png


In [None]:
# 9. DATA SCALING PREPARATION

print("\n" + "="*70)
print("DATA SCALING")
print("="*70)

print("\nüîÑ Scaling 'Time' and 'Amount' features...")

# Create scaled version
df_scaled = df.copy()

scaler = StandardScaler()
df_scaled['amt'] = scaler.fit_transform(df_scaled['amt'].values.reshape(-1, 1))
df_scaled['unix_time'] = scaler.fit_transform(df_scaled['unix_time'].values.reshape(-1, 1))

print("‚úÖ Features scaled successfully!")

# Save scaler for later use
import joblib
joblib.dump(scaler, '../artifacts/scaler.pkl')
print("‚úÖ Scaler saved: ../artifacts/scaler.pkl")

# Save scaled data
df_scaled.to_csv('../data/creditcard_scaled.csv', index=False)
print("‚úÖ Scaled data saved: ../data/creditcard_scaled.csv")


DATA SCALING

üîÑ Scaling 'Time' and 'Amount' features...
‚úÖ Features scaled successfully!
‚úÖ Scaler saved: ../artifacts/scaler.pkl
‚úÖ Scaled data saved: ../artifacts/creditcard_scaled.csv


In [14]:
# 10. SUMMARY STATISTICS

print("\n" + "="*70)
print("SUMMARY REPORT")
print("="*70)

summary = {
    'Total Transactions': len(df),
    'Legitimate Transactions': int(class_counts[0]),
    'Fraudulent Transactions': int(class_counts[1]),
    'Fraud Percentage': f"{class_percentages[1]:.4f}%",
    'Imbalance Ratio': f"1:{1/fraud_ratio:.0f}",
    'Number of Features': len(df.columns) - 1,
    'Time Range (hours)': f"{df['unix_time'].max() / 3600:.1f}",
    'Average Amount': f"${df['amt'].mean():.2f}",
    'Max Amount': f"${df['amt'].max():.2f}",
    'Top Correlated Feature': correlations.head(1).index[0],
    'Top Correlation Value': f"{correlations.head(1).values[0]:.4f}"
}

print("\nüìä Dataset Summary:")
for key, value in summary.items():
    print(f"   {key}: {value}")

# Save summary
import json
with open('../artifacts/eda_summary.json', 'w') as f:
    json.dump(summary, f, indent=4)
print("\n‚úÖ Summary saved: ../artifacts/eda_summary.json")

# 11. KEY INSIGHTS

print("\n" + "="*70)
print("KEY INSIGHTS")
print("="*70)

print("""
üîç Key Findings:

1. SEVERE CLASS IMBALANCE
   - Only 0.17% of transactions are fraudulent
   - This requires special handling (SMOTE, class weights, etc.)

2. TIME PATTERNS
   - Fraudulent transactions show different time patterns
   - Some hours have higher fraud rates

3. AMOUNT DIFFERENCES
   - Fraudulent transactions tend to have different amount patterns
   - Most frauds are in specific amount ranges

4. PCA FEATURES
   - Several V features show strong correlation with fraud
   - Features like V14, V17, V12, V10 are most predictive

5. SCALING NEEDED
   - Time and Amount need scaling (already done)
   - V features are already scaled from PCA

6. MODEL RECOMMENDATIONS
   - Use techniques for imbalanced data (SMOTE, undersampling)
   - Focus on Precision-Recall over Accuracy
   - Consider ensemble methods
   - Use class weights in models
""")

print("\n" + "="*70)
print("‚úÖ EXPLORATORY DATA ANALYSIS COMPLETED!")
print("="*70)

print("\nüìÅ Generated Files:")
print("   ‚úÖ ../artifacts/class_distribution.png")
print("   ‚úÖ ../artifacts/time_analysis.png")
print("   ‚úÖ ../artifacts/amount_analysis.png")
print("   ‚úÖ ../artifacts/top_features.png")
print("   ‚úÖ ../artifacts/correlation_matrix.png")
print("   ‚úÖ ../artifacts/scaler.pkl")
print("   ‚úÖ ../artifacts/eda_summary.json")
print("   ‚úÖ ../data/creditcard_scaled.csv")

print("\nüöÄ Next Steps:")
print("   1. Run model_training.ipynb to build fraud detection models")
print("   2. Focus on handling class imbalance")
print("   3. Optimize for Precision-Recall metrics")


SUMMARY REPORT

üìä Dataset Summary:
   Total Transactions: 1852394
   Legitimate Transactions: 1842743
   Fraudulent Transactions: 9651
   Fraud Percentage: 0.5210%
   Imbalance Ratio: 1:191
   Number of Features: 23
   Time Range (hours): 385704.0
   Average Amount: $70.06
   Max Amount: $28948.90
   Top Correlated Feature: amt
   Top Correlation Value: 0.2093

‚úÖ Summary saved: ../artifacts/eda_summary.json

KEY INSIGHTS

üîç Key Findings:

1. SEVERE CLASS IMBALANCE
   - Only 0.17% of transactions are fraudulent
   - This requires special handling (SMOTE, class weights, etc.)

2. TIME PATTERNS
   - Fraudulent transactions show different time patterns
   - Some hours have higher fraud rates

3. AMOUNT DIFFERENCES
   - Fraudulent transactions tend to have different amount patterns
   - Most frauds are in specific amount ranges

4. PCA FEATURES
   - Several V features show strong correlation with fraud
   - Features like V14, V17, V12, V10 are most predictive

5. SCALING NEEDED
   