# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score
from sklearn.impute import SimpleImputer

# Read the transactions CSV into a DataFrame.
# NOTE(review): encoding='ascii' makes the read fail on any non-ASCII byte —
# confirm the file really is pure ASCII.
df_test = pd.read_csv("fraudTrain.csv", encoding='ascii')

# Quick inspection before preprocessing, printed one after another:
#   1. the first rows, to see the overall structure,
#   2. the dtype of every column,
#   3. the number of missing values per column.
print(df_test.head(), df_test.dtypes, df_test.isna().sum(), sep="\n")

# Now, let's proceed with the preprocessing steps: removing unwanted columns and transforming categorical features.

# Function to remove unwanted columns
def remove_unwanted_columns(df):
    """Return a new DataFrame without a fixed set of columns.

    The columns 'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'trans_num',
    'state' and 'city' are dropped. ``df`` itself is not modified, and
    ``DataFrame.drop`` raises ``KeyError`` if any of them is absent.
    """
    unwanted = [
        'Unnamed: 0',
        'trans_date_trans_time',
        'cc_num',
        'trans_num',
        'state',
        'city',
    ]
    return df.drop(labels=unwanted, axis=1)

# Function to transform categorical features
def transform_categorical_features(df):
    """Label-encode a fixed set of columns of ``df`` and return ``df``.

    Each listed column is overwritten on the passed-in frame with the result
    of ``LabelEncoder.fit_transform``. The encoder is fit separately for each
    column, so the codes of different columns are unrelated.
    """
    encoded_columns = (
        'merchant',
        'category',
        'first',
        'last',
        'gender',
        'street',
        'job',
        'dob',
    )
    for column in encoded_columns:
        # A fresh fit per column: only the transformed values are kept.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Run the preprocessing steps in order: drop the unwanted columns first,
# then label-encode the categorical ones.
df_test = df_test.pipe(remove_unwanted_columns).pipe(transform_categorical_features)

# Preview the first five processed rows to confirm the steps were applied
print(df_test.head(5))

# Now, we'll proceed with preparing the data for training, training the model, and evaluating it.
# We'll do this step by step to identify any potential issues.

# Function to prepare data
def prepare_data(df):
    """Split ``df`` into imputed train/validation features and targets.

    Rows whose ``is_fraud`` target is missing are discarded *before* the
    split. That way they neither count towards the 80/20 proportions nor
    influence the means used to impute missing feature values. When the
    target has no missing values the result is the same as splitting the
    full frame.

    Args:
        df: DataFrame holding the ``is_fraud`` target column. Every other
            column is used as a feature and must be numeric, because missing
            feature values are filled with the column mean.

    Returns:
        A tuple ``(X_train, X_val, y_train, y_val)``. The feature sets are
        the arrays returned by ``SimpleImputer``; the targets are pandas
        Series that keep their original index labels.

    Raises:
        KeyError: if ``df`` has no ``is_fraud`` column.
    """
    # Unlabeled rows can be used for neither fitting nor scoring, so remove
    # them up front instead of masking the splits afterwards.
    labeled = df[df['is_fraud'].notna()]

    X = labeled.drop(columns=['is_fraud'])
    y = labeled['is_fraud']
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Learn the column means from the training split only and reuse them for
    # the validation split, so no validation statistics leak into training.
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_val_imputed = imputer.transform(X_val)

    return X_train_imputed, X_val_imputed, y_train, y_val

# Function to evaluate the model
def evaluate_model(model, X, y):
    """Score ``model`` on ``X`` against the true labels ``y``.

    Returns a ``(accuracy, precision)`` tuple computed from
    ``model.predict(X)`` with ``accuracy_score`` and ``precision_score``.
    """
    y_pred = model.predict(X)
    return accuracy_score(y, y_pred), precision_score(y, y_pred)

# Split the preprocessed frame into train and validation sets
X_train, X_val, y_train, y_val = prepare_data(df_test)

# Fit a 100-tree random forest with a fixed seed; fit() returns the
# estimator itself, so construction and training can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out validation split and report it
test_accuracy, test_precision = evaluate_model(model, X_val, y_val)
print(
    f"Test Accuracy: {test_accuracy}",
    f"Test Precision: {test_precision}",
    sep="\n",
)
