In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn
import gc
import seaborn as sns 

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are inspected; every other column (int8, unsigned integers, objects, ...)
    is left untouched.  The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  Its numeric columns are overwritten with the
        downcast data.
    verbose : bool, default True
        When true, print the memory footprint after the conversion and the
        percentage saved.  When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Float columns may end up as float16, which keeps only about three
    significant decimal digits; values are rounded accordingly.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range covers the
    # column (bounds inclusive) wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Out of float32 range, infinite, or all-NaN (comparisons
                    # with NaN are false): keep full precision.
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # An empty frame can report zero bytes; skip the ratio in that case.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Train Dataset  

In [None]:
# Load the identity table of the training set; it is joined to the
# transaction table on TransactionID in a later cell.
train_identity = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv")
gc.collect()

In [None]:
# Load the transaction table of the training set.
train_transaction = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")


## Merging the two datasets together

In [None]:
# Left join: every transaction is kept, identity columns are NaN where no
# identity row shares the TransactionID.
train = train_transaction.merge(train_identity, on='TransactionID', how='left')


In [None]:
# The two source tables are no longer needed once merged; drop the references
# and collect so their memory can be reclaimed.
del train_identity, train_transaction
gc.collect()

In [None]:
# Print a summary (dtypes, memory usage) of the merged training frame.
train.info()

In [None]:
# Number of columns in the merged training frame.
print(train.shape[1])

In [None]:
# Target vector. It is saved here because 'isFraud' is not in the selected
# feature list, so the column is dropped from train further down.
y = train['isFraud']

In [None]:
# Keep the transaction ids aside; 'TransactionID' is also removed by the
# feature selection below.
train_id = train['TransactionID']


After the EDA, the following features were selected for the rest of the pipeline.

In [None]:
# Features retained after the EDA, grouped by family.  Columns of the merged
# frame that are not listed here are dropped in the next cell.
new_features = [
    # Transaction, card and address attributes
    'TransactionAmt', 'ProductCD',
    'card1', 'card2', 'card3', 'card5', 'card6',
    'addr1', 'addr2', 'dist1', 'dist2',
    # NOTE(review): there is no comma between the next two literals, so Python
    # concatenates them into the single name 'P_emaildomainR_emaildomain'.
    # That name matches no column, which means both e-mail domain columns are
    # silently dropped.  Adding the comma adds two features, so anything
    # downstream that assumes the current feature count must change with it.
    'P_emaildomain''R_emaildomain',
    # C counters
    'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9',
    'C10', 'C11', 'C12', 'C13', 'C14',
    # D columns
    'D1', 'D2', 'D3', 'D4', 'D5', 'D10', 'D11', 'D15',
    # M columns
    'M1', 'M2', 'M3', 'M4', 'M6', 'M7', 'M8', 'M9',
    # V columns
    'V1', 'V3', 'V4', 'V6', 'V8', 'V11', 'V13', 'V14', 'V17',
    'V20', 'V23', 'V26', 'V27', 'V30', 'V36', 'V37', 'V40', 'V41',
    'V44', 'V47', 'V48', 'V54', 'V56', 'V59', 'V62', 'V65', 'V67',
    'V68', 'V70', 'V76', 'V78', 'V80', 'V82', 'V86', 'V88', 'V89',
    'V91', 'V107', 'V108', 'V111', 'V115', 'V117', 'V120', 'V121',
    'V123', 'V124', 'V127', 'V129', 'V130', 'V136', 'V138', 'V139',
    'V142', 'V147', 'V156', 'V160', 'V162', 'V165', 'V166', 'V169',
    'V171', 'V173', 'V175', 'V176', 'V178', 'V180', 'V182', 'V185',
    'V187', 'V188', 'V198', 'V203', 'V205', 'V207', 'V209', 'V210',
    'V215', 'V218', 'V220', 'V221', 'V223', 'V224', 'V226', 'V228',
    'V229', 'V234', 'V235', 'V238', 'V240', 'V250', 'V252', 'V253',
    'V257', 'V258', 'V260', 'V261', 'V264', 'V266', 'V267', 'V271',
    'V274', 'V277', 'V281', 'V283', 'V284', 'V285', 'V286', 'V289',
    'V291', 'V294', 'V296', 'V297', 'V301', 'V303', 'V305', 'V307',
    'V309', 'V310', 'V314', 'V320',
    # Identity columns
    'id_01', 'id_02', 'id_03', 'id_04',
    'id_05', 'id_06', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13',
    'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_28',
    'id_29', 'id_31', 'id_35', 'id_36', 'id_37', 'id_38',
    # Device attributes
    'DeviceType', 'DeviceInfo',
]

In [None]:
# Drop every column that is not a selected feature in a single call.
# Dropping them one at a time rebuilds the whole frame once per column.
train = train.drop(columns=[column for column in train.columns if column not in new_features])
gc.collect()

In [None]:
gc.collect()
# Number of columns kept after feature selection.  This is the last
# expression of the cell so that the notebook actually displays it.
len(train.columns.tolist())

In [None]:
# Lift pandas' display truncation for both columns and rows.
# NOTE(review): with max_rows=None any large frame or series shown without
# .head() is rendered in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
gc.collect()

In [None]:
# Preview the first rows of the reduced training frame.
train.head()


In [None]:
# Force a garbage-collection pass before the preprocessing steps.
gc.collect()

# Numerical Variables

# Train 

In [None]:
# Every column whose dtype is not object is treated as numerical.
numerical_features = [feature for feature in train.columns if train[feature].dtypes != 'O']

print('Number of numerical variables: ', len(numerical_features))

gc.collect()
# Preview the numerical variables.  This is the last expression of the cell
# so that the notebook renders it.
train[numerical_features].head()

In [None]:
for feature in numerical_features:
    # Missing values are replaced by the column mean.
    # NOTE(review): an earlier comment here said "median"; the code has always
    # used the mean.  Switch to .median() if robustness to outliers is wanted.
    mean_value = train[feature].mean()

    # Indicator column recording which rows were missing before imputation.
    train[feature + 'nan'] = np.where(train[feature].isnull(), 1, 0)
    # Assign the result back.  fillna(inplace=True) on a column selection is a
    # chained assignment and does not update the frame under Copy-on-Write.
    train[feature] = train[feature].fillna(mean_value)
gc.collect()

In [None]:
#train[numerical_features].isnull().sum()

In [None]:
# Numeric part of the training matrix.  numerical_features was built before
# the '<feature>nan' indicator columns were added to train, so those
# indicators are not part of this selection.
df_num_train = train[numerical_features]
gc.collect()

# Categorical Variables

## Train 
### Categorical Features

In [None]:
# Every object-dtype column is treated as categorical.
categorical_features = [feature for feature in train.columns if train[feature].dtypes == 'O']

print('Number of categorical variables: ', len(categorical_features))

# Take an independent copy: the following cells assign into this frame, and
# writing to a plain selection of train triggers SettingWithCopyWarning.
df_cat_train = train[categorical_features].copy()


In [None]:
gc.collect()
# Preview the categorical columns.  This is the last expression of the cell
# so that the notebook renders it.
df_cat_train.head()

In [None]:
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN with the most frequent value of the column it is fitted on.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
gc.collect()

In [None]:
# Fill missing categorical values column by column with that column's mode.
for features in df_cat_train.columns:
    # fit_transform returns an (n_rows, 1) array; flatten it so a plain 1-D
    # column is assigned back.
    df_cat_train[features] = imp.fit_transform(df_cat_train[[features]]).ravel()
# One collection after the loop is enough; a full GC pass per column only
# slows the cell down.
gc.collect()

In [None]:
# Preview the categorical columns after imputation.
df_cat_train.head()

In [None]:
# One-hot encoding with pd.get_dummies was tried here and left disabled;
# the categorical columns are label-encoded instead.
#columns = df_cat.columns.tolist() 
#a = pd.get_dummies(df_cat, columns = columns)
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Replace each categorical column by integer codes.  The encoder is refitted
# on every column, so codes are only meaningful within a column.
for features in df_cat_train.columns:
    # LabelEncoder expects a 1-D array of labels; passing the one-column
    # frame df[[col]] makes scikit-learn warn and flatten it.
    df_cat_train[features] = label_encoder.fit_transform(df_cat_train[features])
gc.collect()

In [None]:
# Preview the label-encoded categorical columns.
df_cat_train.head()

In [None]:
# Drop the merged training frame and the imputer; the test section below
# creates its own imputer.
del train, imp 
gc.collect()

In [None]:
# Join the categorical and numeric parts row by row.  Merging on an index
# array adds a helper key column named "key_0", removed in the next cell.
main_train = pd.merge(df_cat_train, df_num_train, on=df_num_train.index, how='inner')

In [None]:
# Remove the helper key column produced by merging on an index array.
main_train = main_train.drop(columns=["key_0"])
gc.collect()

In [None]:
gc.collect()
# Number of model input features.  This is the last expression of the cell
# so that the notebook displays it.
len(main_train.columns.tolist())

In [None]:
# Preview the assembled training matrix.
main_train.head()

In [None]:
#main_train.isnull().sum()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of the training matrix (the label-encoded
# categorical codes included) and rebuild a DataFrame with the same column names.
sc = StandardScaler()
main_train = pd.DataFrame(sc.fit_transform(main_train), columns=main_train.columns)

In [None]:
# Drop the intermediates of the training pipeline, including the fitted
# scaler; numerical_features is recomputed for the test data further down.
del df_cat_train,numerical_features,df_num_train,sc
gc.collect()

In [None]:
gc.collect()
# Preview the standardised training matrix.  This is the last expression of
# the cell so that the notebook renders it.
main_train.head()

In [None]:
# Downcast the training matrix to smaller dtypes (see reduce_mem_usage);
# the helper prints the resulting memory footprint.
main_train = reduce_mem_usage(main_train)
gc.collect()

In [None]:
gc.collect()
# Column count after downcasting.  This is the last expression of the cell
# so that the notebook displays it.
len(main_train.columns.tolist())

In [None]:
# Print the dtypes and memory usage of the downcast training matrix.
main_train.info()
gc.collect()

##  Test


In [None]:
# Load the two test tables and left-join identity onto the transactions,
# mirroring how the training frame was built.
test_identity = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv")
test_transaction = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
gc.collect()

In [None]:
# The source tables are no longer needed once merged.
del test_transaction, test_identity
gc.collect()

In [None]:
# Rename the "id-01" ... "id-38" columns of the test frame to the "id_XX"
# spelling used in new_features, so the feature selection below matches them.
# The mapping is generated, not typed out, which rules out a typo in any of
# the 38 pairs.
test = test.rename(columns={'id-{:02d}'.format(i): 'id_{:02d}'.format(i) for i in range(1, 39)})
gc.collect()
# Preview of the renamed frame.  This is the last expression of the cell so
# that the notebook renders it.
test.head()

In [None]:
# Keep the test transaction ids; they form the id column of the submission file.
test_id = test['TransactionID']
gc.collect()

In [None]:
# Drop every column that is not a selected feature in a single call.
# Dropping them one at a time rebuilds the whole frame once per column.
test = test.drop(columns=[column for column in test.columns if column not in new_features])
gc.collect()


In [None]:
gc.collect()
# Preview the reduced test frame.  This is the last expression of the cell
# so that the notebook renders it.
test.head()

In [None]:
# Restrict test to the selected features.  This repeats the selection made
# two cells above, so on a top-to-bottom run there is nothing left to drop.
# It is written as one drop call so that running the cell on an unfiltered
# frame does not trigger a garbage collection per dropped column.
test = test.drop(columns=[column for column in test.columns if column not in new_features])
gc.collect()


In [None]:
# Every column whose dtype is not object is treated as numerical.
numerical_features = [feature for feature in test.columns if test[feature].dtypes != 'O']

print('Number of numerical variables: ', len(numerical_features))

gc.collect()
# Preview the numerical variables.  This is the last expression of the cell
# so that the notebook renders it.
test[numerical_features].head()


In [None]:
for feature in numerical_features:
    # Missing values are replaced by the column mean (an earlier comment here
    # said "median"; the code has always used the mean).
    # NOTE(review): this is the mean of the test column, not the value used
    # to impute the training data.
    mean_value = test[feature].mean()

    # Indicator column recording which rows were missing before imputation.
    test[feature + 'nan'] = np.where(test[feature].isnull(), 1, 0)
    # Assign the result back.  fillna(inplace=True) on a column selection is a
    # chained assignment and does not update the frame under Copy-on-Write.
    test[feature] = test[feature].fillna(mean_value)

# numerical_features was built before the indicator columns were added, so
# they are not part of this selection.
df_num_test = test[numerical_features]
gc.collect()


In [None]:
# Every object-dtype column is treated as categorical.
categorical_features = [feature for feature in test.columns if test[feature].dtypes == 'O']

print('Number of categorical_features: ', len(categorical_features))

# Take an independent copy: the following cells assign into this frame, and
# writing to a plain selection of test triggers SettingWithCopyWarning.
df_cat_test = test[categorical_features].copy()
gc.collect()

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing categorical values of the test data column by column with the
# most frequent value of that column.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
for features in df_cat_test.columns:
    df_cat_test[features] = imp.fit_transform(df_cat_test[[features]])
gc.collect()

In [None]:
# Replace each categorical test column by integer codes.
# NOTE(review): the encoder is refitted on the test column here, so a given
# category may receive a different code than it did in the training data.
for features in df_cat_test.columns:
    # LabelEncoder expects a 1-D array of labels; passing the one-column
    # frame df[[col]] makes scikit-learn warn and flatten it.
    df_cat_test[features] = label_encoder.fit_transform(df_cat_test[features])
gc.collect()

In [None]:
gc.collect()
# Preview the encoded categorical test columns.  This is the last expression
# of the cell so that the notebook renders it.
df_cat_test.head()

In [None]:
# Join the categorical and numeric parts row by row, then drop the helper
# "key_0" column that merging on an index array adds.
main_test = pd.merge(df_cat_test, df_num_test, on=df_num_test.index, how='inner')
main_test = main_test.drop(columns=["key_0"])
gc.collect()

In [None]:
gc.collect()
# Preview the assembled test matrix.  This is the last expression of the
# cell so that the notebook renders it.
main_test.head()

In [None]:
gc.collect()
# Number of test features.  This is the last expression of the cell so that
# the notebook displays it.
len(main_test.columns.tolist())

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the test matrix and rebuild a DataFrame with the same column names.
# NOTE(review): a new scaler is fitted on the test data here, so test features
# are scaled with test statistics; the scaler fitted on the training matrix
# was deleted earlier in the notebook.
sc = StandardScaler()
main_test = pd.DataFrame(sc.fit_transform(main_test), columns=main_test.columns)
gc.collect()

In [None]:
gc.collect()
# Preview the standardised test matrix.  This is the last expression of the
# cell so that the notebook renders it.
main_test.head()

In [None]:
# Downcast the test matrix to smaller dtypes (see reduce_mem_usage).
main_test = reduce_mem_usage(main_test)
gc.collect()

In [None]:
# Drop the intermediates of the test pipeline; only main_test and test_id
# are used from here on.
del df_cat_test,df_num_test,sc, imp, test,categorical_features
gc.collect()

In [None]:
# Final preview of the test matrix fed to the model.
main_test.head()

In [None]:
# Column counts of the test and training matrices; they must be equal for
# the model to accept both.
print(main_test.shape[1])
print(main_train.shape[1])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for evaluation; random_state makes the
# shuffle reproducible.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# target is imbalanced.
x_train, x_test, y_train, y_test = train_test_split(main_train , y, test_size=0.20, random_state=20)

In [None]:
# main_train has been split into x_train / x_test; drop the original name.
del main_train
gc.collect()

In [None]:
# Sanity check: number of missing labels in the training target.
y_train.isnull().sum()

In [None]:
# Preview the training labels.
y_train.head()

# Model Creation

In [None]:
import tensorflow
# The TPU setup below refers to the package as `tf`; without this alias the
# cell stops with a NameError.
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
# Take the callbacks from tensorflow.keras as well, so that every Keras
# object in the notebook comes from the same package.
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

# Detect and initialise the TPU.  TPUClusterResolver.connect() already
# connects to the cluster and initialises the TPU system, so no separate
# connect/initialise calls are made.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    # Distribution strategy bound to the TPU.
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # No TPU is attached to this session: fall back to the default strategy
    # so the rest of the notebook still runs on CPU/GPU.
    tpu = None
    tpu_strategy = tf.distribute.get_strategy()

In [None]:
# instantiating the model in the strategy scope creates the model on the TPU

# Small fully connected binary classifier.
# NOTE(review): the model is built outside any distribution-strategy scope,
# so it is not placed on the TPU.
model = Sequential()
# 1st hidden layer: 50 units.  The input width is read from the training
# matrix and no longer hard-coded, so it follows the feature selection.
model.add(Dense(50, activation='relu', kernel_initializer='he_normal', input_shape=(x_train.shape[1],)))
# 2nd hidden layer: 25 units.
model.add(Dense(25, activation='relu', kernel_initializer='he_normal'))
# 3rd hidden layer: 12 units, followed by dropout and batch normalisation.
model.add(Dense(12, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
# Output layer: a single sigmoid unit.
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Print the layer-by-layer summary of the model.
model.summary()


In [None]:
# Imported here so the cell does not depend on which package an earlier cell
# took the callback from.
from tensorflow.keras.callbacks import EarlyStopping

# epochs=1000 is only an upper bound.  Training stops once the validation
# loss has not improved for 25 epochs, and the weights of the best epoch are
# restored, so the run does not go through all 1000 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25,
                   restore_best_weights=True)

model.fit(x_train, y_train, validation_split=0.1, epochs=1000, batch_size=32,
          verbose=2, callbacks=[es])
# Evaluate the model on the held-out split.
loss, acc = model.evaluate(x_test, y_test, verbose=2)
print('Test Accuracy: %.5f' % acc)

In [None]:
# Model output (sigmoid activation) for every row of the test matrix.
predictions = model.predict(main_test)

In [None]:
# Save the trained model next to the notebook.
model.save('./model_2.h5')

In [None]:
# Display the raw prediction array.
predictions

In [None]:
# Flatten the predictions to one value per test row for the submission frame.
pred = predictions.ravel()

In [None]:
# Display the flattened predictions.
pred

In [None]:
# Build the submission table: one row per test transaction, with the model
# output in the 'isFraud' column.
my_submission = pd.DataFrame({'TransactionID': test_id, 'isFraud': pred})
# Any filename works; 'submission.csv' is used here.  index=False keeps the
# row index out of the file.
my_submission.to_csv('./submission.csv', index=False)