# Define Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


ModuleNotFoundError: No module named 'xgboost'

# Data load and check both files

In [18]:
train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")

# print the first data to check the data type
print(train.head())
print("train size : ", train.shape)  # Check each row data type

print("test size : ", test.shape)

        id                         trans_num  trans_date trans_time  \
0   308467  26ad750c2ff71f32631b58913582d70a  2024-01-10   06:49:39   
1   261578  fea9c1efe3f2b97f27ad0ab5409ec861  2024-01-06   02:37:50   
2      341  2ae350b982be840f3666273e0c2f3a05  2024-01-18   21:40:21   
3  1147639  bbdd8adfc0a34ed0e817f809193c85c0  2024-01-21   16:20:15   
4   314152  fc7756004dc2a9bc450eb894a670b804  2024-01-21   19:36:26   

    unix_time        category     amt            cc_num    first     last  \
0  1704887379        misc_pos  188.38      676355457570   Andrea  Johnson   
1  1704526670     grocery_pos  102.63   377178373574671   Rhonda   Chavez   
2  1705632021   entertainment    1.62  3599292013370451  Stephen     Khan   
3  1705872015  health_fitness    5.64  3594292572430345   Justin   Reilly   
4  1705883786  health_fitness   97.09  4867547663675548    Alice   Duarte   

   ...    zip      lat      long city_pop                          job  \
0  ...  62220  38.5127  -89.9847    

What Model Should I Use?

# SVM

# Preprocessing

In [19]:
# Get rid of unnecessary column
train = train.drop(['id', 'trans_num', 'cc_num', 'first', 'last', 'street', 'trans_date', 'unix_time'], axis=1)
test = test.drop(['trans_num', 'cc_num', 'first', 'last', 'street', 'trans_date', 'unix_time'], axis=1)

from datetime import datetime
# Convert dab to age
current_year = datetime.now().year
train['dob'] = pd.to_datetime(train['dob'])
test['dob'] = pd.to_datetime(test['dob'])

train['age'] = current_year - train['dob'].dt.year
test['age'] = current_year - test['dob'].dt.year

# convert trans_time to numerical (hour)
train['trans_time'] = pd.to_datetime(train['trans_time'], format='%H:%M:%S').dt.hour
test['trans_time'] = pd.to_datetime(test['trans_time'], format='%H:%M:%S').dt.hour

# 변환된 'age'와 'trans_time'을 사용 후 원래 열 제거
train = train.drop(['dob'], axis=1)
test = test.drop(['dob'], axis=1)

# Preprocessing categorical Data

In [20]:
#TODO test data one - hot - encoding
# one hoe encoding categorical column
# train['job'] = train['job'].str.lower()
# test['job'] = test['job'].str.lower()

# train['job'] = train['job'].str.strip()
# test['job'] = test['job'].str.strip()

# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# train['job_label'] = label_encoder.fit_transform(train['job'])
# test['job_label'] = label_encoder.transform(test['job'])

# train['merchant'] = train['merchant'].str.replace(r'^fraud_', '', regex=True)
# test['merchant'] = test['merchant'].str.replace(r'^fraud_', '', regex=True)

def category_preprocessing(train, test, column):
    train[column] = train[column].str.lower()
    test[column] = test[column].str.lower()

    train[column] = train[column].str.strip()
    test[column] = test[column].str.strip()

    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column])
    test[column] = test[column].apply(lambda x: label_encoder.transform([x])[0] if x in label_encoder.classes_ else -1)
    
    return train, test

train, test = category_preprocessing(train, test, 'category')
train, test = category_preprocessing(train, test, 'job')

train['merchant'] = train['merchant'].str.replace(r'^fraud_', '', regex=True)
test['merchant'] = test['merchant'].str.replace(r'^fraud_', '', regex=True)

train, test = category_preprocessing(train, test, 'merchant')
train, test = category_preprocessing(train, test, 'city')
train, test = category_preprocessing(train, test, 'state')

train = pd.get_dummies(train, columns=['gender'], drop_first=True)
test = pd.get_dummies(test, columns=['gender'], drop_first=True)

print("After Preprocessing Train size : ", train.shape)

After Preprocessing Train size :  (370703, 16)


In [21]:
from sklearn.preprocessing import StandardScaler

# numerical feature
numeric_features = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'age', 'trans_time']

# 숫자형 데이터를 표준화
scaler = StandardScaler()
train[numeric_features] = scaler.fit_transform(train[numeric_features])

In [28]:
train_sample = train.sample(n=100000, random_state=42)  # random_state를 설정하면 재현 가능

In [36]:
# 타겟 변수 분리
X = train.drop('is_fraud', axis=1)
y = train['is_fraud']

In [29]:
# 타겟 변수 분리
X = train_sample.drop('is_fraud', axis=1)
y = train_sample['is_fraud']

In [37]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVM Model

In [38]:
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)

# Make Prediction

In [39]:
# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model

In [40]:
# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[65592     0]
 [ 8549     0]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     65592
           1       0.00      0.00      0.00      8549

    accuracy                           0.88     74141
   macro avg       0.44      0.50      0.47     74141
weighted avg       0.78      0.88      0.83     74141



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
