In [None]:
# this one causes the Ubuntu python environment new_ml_env to die

In [1]:
#pip install pandas numpy matplotlib seaborn scikit-learn 

In [2]:
import pandas as pd
import os

# Define the file paths
base_dir = '/home/john/ai/kaggle/iee-cis/'
train_transaction_file = os.path.join(base_dir, 'train_transaction.csv')
train_identity_file = os.path.join(base_dir, 'train_identity.csv')
test_transaction_file = os.path.join(base_dir, 'test_transaction.csv')
test_identity_file = os.path.join(base_dir, 'test_identity.csv')
sample_submission_file = os.path.join(base_dir, 'sample_submission.csv')

# Load the datasets
train_transaction = pd.read_csv(train_transaction_file)
train_identity = pd.read_csv(train_identity_file)
test_transaction = pd.read_csv(test_transaction_file)
test_identity = pd.read_csv(test_identity_file)
sample_submission = pd.read_csv(sample_submission_file)

# Display the first few rows of each dataset to understand their structure
train_transaction.head(), train_identity.head(), test_transaction.head(), test_identity.head(), sample_submission.head()


(   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
 0        2987000        0          86400            68.5         W  13926   
 1        2987001        0          86401            29.0         W   2755   
 2        2987002        0          86469            59.0         W   4663   
 3        2987003        0          86499            50.0         W  18132   
 4        2987004        0          86506            50.0         H   4497   
 
    card2  card3       card4  card5  ... V330  V331  V332  V333  V334 V335  \
 0    NaN  150.0    discover  142.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
 1  404.0  150.0  mastercard  102.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
 2  490.0  150.0        visa  166.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
 3  567.0  150.0  mastercard  117.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
 4  514.0  150.0  mastercard  102.0  ...  0.0   0.0   0.0   0.0   0.0  0.0   
 
   V336  V337  V338  V339  
 0  NaN   NaN   NaN   NaN  
 1  

Merge Transaction and Identity Data
Since identity data is only available for a subset of transactions, you should merge the identity data with the transaction data.

In [3]:
# Merge identity data with transaction data
train_df = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test_df = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')


Handle Missing Values
Impute Missing Values: For numerical features, you can impute missing values with the median or mean. For categorical features, you can fill missing values with a placeholder (e.g., 'missing').

In [4]:
# Fill missing values for numerical columns with the median
for col in train_df.columns:
    if col in test_df.columns:  # Ensure the column exists in both train and test datasets
        if train_df[col].dtype != 'object':
            train_df[col].fillna(train_df[col].median(), inplace=True)
            test_df[col].fillna(test_df[col].median(), inplace=True)
    else:  # Handle the case where the column is not in the test set (like 'isFraud')
        if train_df[col].dtype != 'object':
            train_df[col].fillna(train_df[col].median(), inplace=True)

# Fill missing values for categorical columns with a placeholder
for col in train_df.columns:
    if col in test_df.columns:  # Ensure the column exists in both train and test datasets
        if train_df[col].dtype == 'object':
            train_df[col].fillna('missing', inplace=True)
            test_df[col].fillna('missing', inplace=True)
    else:  # Handle the case where the column is not in the test set (like 'isFraud')
        if train_df[col].dtype == 'object':
            train_df[col].fillna('missing', inplace=True)



Feature Engineering
Transaction Amount: Create log-transformed features or binning to reduce skewness.
Card Features: Consider creating new features like card1_card2_interaction by combining card1 and card2.

In [5]:
import numpy as np  # Add this import statement

# Log-transform the TransactionAmt
train_df['TransactionAmt_log'] = train_df['TransactionAmt'].apply(lambda x: np.log1p(x))
test_df['TransactionAmt_log'] = test_df['TransactionAmt'].apply(lambda x: np.log1p(x))

# Create interaction features
train_df['card1_card2_interaction'] = train_df['card1'].astype(str) + '_' + train_df['card2'].astype(str)
test_df['card1_card2_interaction'] = test_df['card1'].astype(str) + '_' + test_df['card2'].astype(str)



Encoding Categorical Variables
Label Encoding: For high cardinality categorical features like card1, card2, etc., consider label encoding.
One-Hot Encoding: For low cardinality features like card4, card6, you can use one-hot encoding.

In [6]:
from sklearn.preprocessing import LabelEncoder

# Label encode categorical columns with high cardinality
for col in ['card1', 'card2', 'card3', 'card5', 'card1_card2_interaction']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    
    # Transform test set with a fallback for unseen labels
    test_df[col] = test_df[col].map(lambda s: '<unknown>' if s not in le.classes_ else s)
    le.classes_ = np.append(le.classes_, '<unknown>')  # Add an 'unknown' class for unseen labels
    test_df[col] = le.transform(test_df[col].astype(str))

# One-hot encode lower cardinality categorical columns
train_df = pd.get_dummies(train_df, columns=['card4', 'card6'])
test_df = pd.get_dummies(test_df, columns=['card4', 'card6'])

# Align columns in test set to match train set
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)



In [8]:
num_rows = train_df.shape[0]
print(f"The number of rows is: {num_rows}")


The number of rows is: 590540


Feature Selection
Correlation Analysis: You can remove features with low or no correlation with the target variable.
Variance Threshold: You may want to remove features with very low variance as they may not contribute much to the model. Select Only Numeric Columns for Correlation Analysis

In [9]:
# Ensure only numeric columns are used for correlation
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()

# Calculate correlations only for numeric columns
correlation = train_df[numeric_cols].corr()['isFraud'].abs()

# Set a threshold to identify columns with low correlation
correlation_threshold = 0.01
to_drop = correlation[correlation < correlation_threshold].index.tolist()

# Drop those columns from both train and test sets
train_df.drop(columns=to_drop, inplace=True)
test_df.drop(columns=to_drop, inplace=True)



Train-Test Split
Prepare your features and target variable for model training.

In [10]:
# Split features and target
X_train = train_df.drop(['isFraud', 'TransactionID'], axis=1)
y_train = train_df['isFraud']

X_test = test_df.drop(['isFraud', 'TransactionID'], axis=1)


 Model Training
Now you can proceed with training a model using your prepared data. Gradient Boosting models like LightGBM, XGBoost, or CatBoost are popular choices for this type of problem.

In [11]:
pip install --upgrade lightgbm

Note: you may need to restart the kernel to use updated packages.


In [12]:
import lightgbm as lgb


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Load your data
train_transaction = pd.read_csv('/home/john/ai/kaggle/iee-cis/train_transaction.csv')
train_identity = pd.read_csv('/home/john/ai/kaggle/iee-cis/train_identity.csv')

# Merge the identity data with transaction data
train_df = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

# Define target and features
y_train = train_df['isFraud']
X_train = train_df.drop(columns=['isFraud', 'TransactionID'])

# Fill missing values (example with a placeholder, or you can use other methods)
X_train.fillna(-999, inplace=True)

# Identify all categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

# Perform the train-test split
X_train_split, X_valid, y_train_split, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Now, run the grid search for RandomForest
rf_model = RandomForestClassifier()
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

rf_grid_search = GridSearchCV(rf_model, rf_params, cv=3, scoring='roc_auc', verbose=3, n_jobs=-1)
rf_grid_search.fit(X_train_split, y_train_split)

print(f"Best score for RandomForest: {rf_grid_search.best_score_}")
print(f"Best parameters for RandomForest: {rf_grid_search.best_params_}")


Fitting 3 folds for each of 8 candidates, totalling 24 fits


Code to Generate Submission File

3 models grid search pick the best one and create a submissions file

In [16]:
# pip install xgboost --timeout 100


In [18]:
#pip install catboost

In [19]:
import xgboost as xgb
import catboost as cb

 Run Grid Search for RandomForest

In [20]:
from sklearn.model_selection import GridSearchCV


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Load your data
train_transaction = pd.read_csv('/home/john/ai/kaggle/iee-cis/train_transaction.csv')
train_identity = pd.read_csv('/home/john/ai/kaggle/iee-cis/train_identity.csv')

# Merge the identity data with transaction data
train_df = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

# Define target and features
y_train = train_df['isFraud']
X_train = train_df.drop(columns=['isFraud', 'TransactionID'])

# Fill missing values (example with a placeholder, or you can use other methods)
X_train.fillna(-999, inplace=True)

# Identify all categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

# Perform the train-test split
X_train_split, X_valid, y_train_split, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Now, run the grid search for RandomForest
rf_model = RandomForestClassifier()
rf_params = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [2]
}

rf_grid_search = GridSearchCV(rf_model, rf_params, cv=3, scoring='roc_auc', verbose=3, n_jobs=-1)
rf_grid_search.fit(X_train_split, y_train_split)

print(f"Best score for RandomForest: {rf_grid_search.best_score_}")
print(f"Best parameters for RandomForest: {rf_grid_search.best_params_}")


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best score for RandomForest: 0.8668713936940357
Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
