In [49]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

import json
import pickle

import numpy as np
import pandas as pd


def one_hot_encode(pca,df_original, column):
    # Definir tamaño de los lotes
    batch_size = 10000  # Ajusta según el tamaño que la RAM permita

    # Inicializar lista para almacenar los resultados parciales
    results = []

    # Procesar el DataFrame en lotes
    for i in range(0, len(df_original), batch_size):
        # Crear lote
        df_batch = df_original.iloc[i:i + batch_size].copy()
        
        # Codificación one-hot para el lote
        df_batch = pd.get_dummies(df_batch, columns=[column])
        
        # Aplicar PCA transform en el lote
        result_batch = pca.transform(df_batch)
        
        # Guardar el resultado en la lista
        results.append(result_batch)

    # Concatenar todos los resultados
    result_full = np.vstack(results)
    return result_full

In [50]:

def PCA_one_hot(df,column, n_components,th=0.9):
    # One hot encoding

    df_original = df.copy()

    categories = df[column].cat.categories

    ind_random = np.random.permutation(100000)
    df = pd.DataFrame({column:df_original[column].iloc[ind_random]})

    df[column] = df[column].astype('category')
    df[column] = df[column].cat.set_categories(categories)
    df_one_hot = pd.get_dummies(df, columns=[column])

    # PCA
    pca = PCA(n_components=n_components)
    pca.fit(df_one_hot)
        

    er_cum = np.cumsum(pca.explained_variance_ratio_)
    # find pca.explained_variance_ratio_ > 0.05
    idx = np.where(er_cum > th)
    if  len(idx[0]) > 0:
        idx = idx[0][0]
    else:
        idx = n_components-1
    
    ncateg = df[column].cat.categories.shape[0]

    print(f"n_components = {idx+1} | n_categories = {ncateg}")

    n_components = idx+1


    df_full = pd.DataFrame({column:df_original[column]})
    result_full = one_hot_encode(pca,df_full, column)

    result_full = result_full[:, :n_components]


    df_new = pd.DataFrame(result_full, columns=[column+f"PC{i+1}" for i in range(n_components)])
    df_new.index = df_full.index

    for col in df_new.columns:
        df_original[col] = df_new[col]
    
    df_original.pop(column)

    fig = plt.figure()
    plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
    plt.xlabel("Principal component")
    plt.title("Cumulative explained variance ratio | n_components = "+str(n_components) + " | n_categories = "+str(ncateg) + " | columns = "+column)
    # horizontal line
    plt.axhline(y=th, color='r', linestyle='-')
    # vertical line
    plt.axvline(x=idx, color='r', linestyle='-')
    
    
    return df_original,pca, n_components
# 

In [51]:

def PCA_one_hot_apl(pca,df,column, n_components):
    # One hot encoding

    df_original = df.copy()

    df_full = pd.DataFrame({column:df_original[column]})
    result_full = one_hot_encode(pca,df_full, column)

    result_full = result_full[:, :n_components]


    df_new = pd.DataFrame(result_full, columns=[column+f"PC{i+1}" for i in range(n_components)])
    df_new.index = df_full.index

    for col in df_new.columns:
        df_original[col] = df_new[col]
    
    df_original.pop(column)

    
    return df_original
# 

In [52]:
metadata = pickle.load(open('metadata.pkl', 'rb'))

In [53]:
amount_mean = metadata['amount_mean']
amount_std = metadata['amount_std']
full_categories = metadata['full_categories']
pca_list = metadata['pca_list']
n_components_list = metadata['n_components_list']


In [54]:
data_prediction = json.load(open('../old/predictions_3.json'))

In [55]:
t_id_predict = list(data_prediction["target"].keys())


In [56]:
len(t_id_predict)

2890952

In [57]:
df_transactions_full = pd.read_csv('../data/transactions_data.csv')

df_transactions_full['is_fraud'] = df_transactions_full['id'].apply(lambda x: data_prediction["target"][str(x)] 
                                                                    if str(x) in data_prediction["target"] 
                                                                    else "unknown")
df_transactions_full = df_transactions_full[df_transactions_full['is_fraud'] != "unknown"]
df_transactions_full['is_fraud'] = df_transactions_full['is_fraud'].apply(lambda x: 1 if x == "Yes" else 0)

In [58]:
df_transactions_full = df_transactions_full.loc[:10000]

In [59]:
df_transactions_full["amount"] = df_transactions_full["amount"].apply(lambda x: x.replace("$", "")).astype(float)
df_transactions_full['amount'] = (df_transactions_full['amount'] - amount_mean) / amount_std

In [60]:
# date -> day, month, year and remove date
df_transactions_full['date'] = pd.to_datetime(df_transactions_full['date'])
df_transactions_full['day'] = df_transactions_full['date'].dt.day
df_transactions_full['month'] = df_transactions_full['date'].dt.month
df_transactions_full["hour"] = df_transactions_full["date"].dt.hour
df_transactions_full["weekday"] = df_transactions_full["date"].dt.weekday
df_transactions_full.pop("date")


3      2010-01-01 00:05:00
8      2010-01-01 00:21:00
9      2010-01-01 00:21:00
15     2010-01-01 00:31:00
34     2010-01-01 00:57:00
               ...        
9975   2010-01-04 06:06:00
9980   2010-01-04 06:09:00
9982   2010-01-04 06:10:00
9986   2010-01-04 06:11:00
9994   2010-01-04 06:12:00
Name: date, Length: 2176, dtype: datetime64[ns]

In [61]:
df_transactions_full["use_chip"] = df_transactions_full["use_chip"].astype('category')
df_transactions_full["use_chip"] = df_transactions_full["use_chip"].cat.set_categories(metadata["use_chip_categories"])

In [62]:
iter = 0
for col in ["merchant_id", "merchant_city", "merchant_state", "mcc"]:
    df_transactions_full[col] = df_transactions_full[col].astype('category')
    # current caterogies
    df_transactions_full[col] = df_transactions_full[col].cat.set_categories(full_categories[iter])

    # Nan -> others
    iter += 1

In [63]:
# id and client_id are not useful
id_trans = df_transactions_full.pop("id")
df_transactions_full.pop("client_id")
df_transactions_full.pop("zip")

3       46307.0
8           NaN
9       11355.0
15      78586.0
34       1801.0
         ...   
9975    75253.0
9980    80550.0
9982    43830.0
9986    19072.0
9994    28147.0
Name: zip, Length: 2176, dtype: float64

In [64]:
df_transactions_full.pop("errors")

3       NaN
8       NaN
9       NaN
15      NaN
34      NaN
       ... 
9975    NaN
9980    NaN
9982    NaN
9986    NaN
9994    NaN
Name: errors, Length: 2176, dtype: object

In [65]:
df_transactions_full["merchant_city"][df_transactions_full["merchant_city"].isna()] = "others"
df_transactions_full["merchant_state"][df_transactions_full["merchant_state"].isna()] = "others"
df_transactions_full["mcc"][df_transactions_full["mcc"].isna()] = "others"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_transactions_full["merchant_city"][df_transactions_full["merchant_city"].isna()] = "others"
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) t

In [66]:
# Transformar los datos

iter = 0
for col in ["merchant_id", "merchant_city", "mcc","merchant_state"]:
    pca = pca_list[iter]
    n_components = n_components_list[iter]
    df_transactions_full =     PCA_one_hot_apl(pca,df_transactions_full,col, n_components)
    iter += 1

In [67]:
df_transactions_full.pop("card_id")

3       2860
8       5131
9       1112
15      2464
34        16
        ... 
9975    1110
9980    4194
9982    4938
9986    2579
9994    3840
Name: card_id, Length: 2176, dtype: int64

In [68]:
df_transactions_full = pd.get_dummies(df_transactions_full, columns=["use_chip"])

In [69]:
df_transactions_full

Unnamed: 0,amount,is_fraud,day,month,hour,weekday,merchant_idPC1,merchant_idPC2,merchant_idPC3,merchant_idPC4,...,merchant_statePC21,merchant_statePC22,merchant_statePC23,merchant_statePC24,merchant_statePC25,merchant_statePC26,merchant_statePC27,use_chip_Chip Transaction,use_chip_Online Transaction,use_chip_Swipe Transaction
3,1.911461,0,1,1,0,4,-0.181230,0.897744,0.285902,0.000586,...,0.008107,0.002786,-0.002744,-0.011396,-0.014099,-0.007377,-0.009067,False,False,True
8,2.660803,0,1,1,0,4,-0.082618,-0.030263,-0.054559,-0.000575,...,0.000097,0.000034,-0.000034,-0.000146,-0.000189,-0.000102,-0.000129,False,True,False
9,-0.391566,0,1,1,0,4,-0.082507,-0.030186,-0.054400,-0.000572,...,0.001353,0.000475,-0.000471,-0.002016,-0.002585,-0.001389,-0.001747,False,False,True
15,-0.508993,0,1,1,0,4,-0.093693,-0.039027,-0.073600,-0.001009,...,0.001221,0.000429,-0.000426,-0.001822,-0.002337,-0.001257,-0.001582,False,False,True
34,-0.486360,0,1,1,0,4,-0.113880,-0.063248,-0.136670,-0.696914,...,-0.032789,-0.012945,0.013463,0.070394,0.125205,0.096291,0.195582,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9975,-0.285699,0,4,1,6,0,-0.092939,-0.038355,-0.072078,-0.000965,...,0.001221,0.000429,-0.000426,-0.001822,-0.002337,-0.001257,-0.001582,False,False,True
9980,-1.325384,0,4,1,6,0,-0.098722,-0.043853,-0.084852,-0.001415,...,-0.091155,-0.045672,0.053801,0.790672,-0.492636,-0.122856,-0.104874,False,False,True
9982,-0.396799,0,4,1,6,0,-0.082562,-0.030224,-0.054479,-0.000574,...,0.001499,0.000526,-0.000522,-0.002232,-0.002859,-0.001536,-0.001930,False,False,True
9986,0.268702,0,4,1,6,0,-0.113950,-0.063360,-0.137005,0.717130,...,0.003560,0.001242,-0.001229,-0.005204,-0.006590,-0.003509,-0.004377,False,False,True


In [70]:
model = metadata['model']
scaler = metadata['scaler']

X = df_transactions_full.drop(columns=['is_fraud'])
X_test_scaled = scaler.transform(X)
y_pred = model.predict(X_test_scaled)

In [71]:
y_pred
# count frauds
np.sum(y_pred)

11

In [72]:
pred = pd.DataFrame({
    "id": id_trans
    ,"is_fraud": y_pred
})

In [75]:
pred.iloc[:10]

Unnamed: 0,id,is_fraud
3,7475331,0
8,7475336,0
9,7475337,0
15,7475343,0
34,7475364,0
39,7475370,0
43,7475374,0
56,7475391,0
59,7475396,0
60,7475397,0
