# **Imports & Kaggle Environment Setup**



In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'fraude-em-transaes-de-carto-de-crdito:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F68419%2F7896093%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240522%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240522T173141Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1acf9a863fb89957368ada40a385a7d39d49440930facb51de1cb02a8ba0ea0bd9b3ce486edd2dd42e8f0fe63a2d7b37a225bf1a440c894ff9c817786176c783e48fb478783ee510ece8f68a9887e316080b95f424accacc35f44398854bd5a5d3590fe1e5eda466201ba5e7995ed5051b6d84d3e13bef74f12efc4a98f920f68561aac2f8680d319372085c507d37f3f7b81a0d2efb127301c74192d88d13d5e2f729331b565fe8e00e4cbe7581a5939b13c661c656d221b8afbd80970ae3ab2d6a8cc728d9cff134df56935e519e4272847e5bea440326f04eb85e97539019b37c704e48df028d089d720dd5716caf81d57a3d1bfd9e5997299bec31a4df57'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

Downloading fraude-em-transaes-de-carto-de-crdito, 41048154 bytes compressed
Downloaded and uncompressed: fraude-em-transaes-de-carto-de-crdito
Data source import complete.


# **Download and Extract Data**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

teste = pd.read_csv("/kaggle/input/fraude-em-transaes-de-carto-de-crdito/teste.csv.xz",sep='|', compression='xz')
treino = pd.read_csv("/kaggle/input/fraude-em-transaes-de-carto-de-crdito/treino.csv.xz",sep='|', compression='xz')

/kaggle/input/fraude-em-transaes-de-carto-de-crdito/teste.csv.xz
/kaggle/input/fraude-em-transaes-de-carto-de-crdito/treino.csv.xz
/kaggle/input/fraude-em-transaes-de-carto-de-crdito/exemplo_submissao.csv


In [None]:
teste.head()

Unnamed: 0,ssn,cc_num,first,last,gender,street,city,state,zip,lat,...,profile,trans_num,trans_date,trans_time,unix_time,category,amt,merchant,merch_lat,merch_long
0,359-72-3479,676334414486,Krista,Wang,F,556 Marilyn Fields,Saint Louis,MO,,38.63,...,adults_50up_female_urban.json,a4194096c6cc870b2e21b2b2f69a7706,2023-07-09,07:53:29,1688900000.0,shopping_net,50.96,"fraud_Ruecker, Beer and Collier",,
1,145-16-0685,676296881433,Nicole,Berger,F,68222 Christina Glen Apt. 129,Celina,OH,,40.56,...,adults_2550_female_urban.json,f12645668a1a5a9f8192687f56095b5a,2023-09-16,15:33:04,1694889000.0,entertainment,92.39,fraud_Schuppe LLC,,
2,802-90-3870,30280512927668,Cynthia,Alexander,F,95041 Gary Locks,Champaign,IL,,40.13,...,adults_2550_female_urban.json,7f8e03dcf31fbbb5a3547ed8c40fa54b,2023-07-06,16:39:59,1688672000.0,shopping_net,5.32,"fraud_Little, Gutmann and Lynch",,
3,234-29-2150,4982150648900,Linda,Williams,F,001 Wallace Crossing,Calhan,CO,,38.96,...,adults_50up_female_urban.json,5f3cb674358918b82dbb5c12e60615ef,2023-04-10,02:45:30,1681106000.0,gas_transport,119.82,"fraud_Reilly, Heaney and Cole",,
4,802-90-3870,30280512927668,Cynthia,Alexander,F,95041 Gary Locks,Champaign,IL,,40.13,...,adults_2550_female_urban.json,a6537f4cfe7b5831fa481d520621b72d,2023-03-16,09:08:48,1678969000.0,gas_transport,41.59,fraud_Kling Inc,,


In [None]:
treino.head()

Unnamed: 0,ssn,cc_num,first,last,gender,street,city,state,zip,lat,...,trans_num,trans_date,trans_time,unix_time,category,amt,is_fraud,merchant,merch_lat,merch_long
0,176-11-8190,6591803397401489,Justin,Johnson,M,39261 Martin Garden Suite 856,Murrieta,CA,,33.57,...,556df8e74106df46d235d77659fb435f,2023-07-23,15:23:53,1690137000.0,misc_pos,2.27,0.0,fraud_Thiel PLC,,
1,481-92-6127,4772233060463333,Andrea,Ho,F,757 Ronald Trail Apt. 965,Ontario,CA,,34.08,...,c7567a1fbf9774505a8682283fc1c887,2023-12-13,22:13:50,1702516000.0,home,10.52,0.0,"fraud_Koss, Hansen and Lueilwitz",,
2,849-66-8722,30470551694093,John,Coffey,M,097 Alexandria Stravenue,Lancaster,CA,,34.69,...,7ac47082c43d97b1adfc78a8de84579b,2023-07-23,23:46:47,1690167000.0,misc_pos,149.01,0.0,fraud_Haley Group,,
3,392-97-2182,4189092753269739,Angelica,Garcia,F,656 Moody Gateway,Carson City,NV,,39.17,...,dcff610c572fcd800b8df0b5989e552e,2024-01-29,16:45:57,1706558000.0,personal_care,32.01,0.0,"fraud_Lubowitz, Terry and Stracke",,
4,742-37-0405,180088770435813,Travis,Jackson,M,133 Houston Ford,Rochester,NY,,43.17,...,3c6124136bfe76c8bd52bec6c398e868,2023-06-18,04:16:14,1687073000.0,misc_pos,137.31,0.0,fraud_McGlynn-Jaskolski,,


In [None]:
teste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36047 entries, 0 to 36046
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ssn         36047 non-null  object 
 1   cc_num      36047 non-null  int64  
 2   first       36047 non-null  object 
 3   last        36047 non-null  object 
 4   gender      36047 non-null  object 
 5   street      36047 non-null  object 
 6   city        36047 non-null  object 
 7   state       36047 non-null  object 
 8   zip         370 non-null    float64
 9   lat         36047 non-null  float64
 10  long        36047 non-null  float64
 11  city_pop    36047 non-null  int64  
 12  job         36047 non-null  object 
 13  dob         36047 non-null  object 
 14  acct_num    36047 non-null  int64  
 15  profile     36047 non-null  object 
 16  trans_num   36047 non-null  object 
 17  trans_date  36047 non-null  object 
 18  trans_time  36047 non-null  object 
 19  unix_time   36047 non-nul

In [None]:
treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144352 entries, 0 to 144351
Data columns (total 26 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ssn         144352 non-null  object 
 1   cc_num      144352 non-null  int64  
 2   first       144352 non-null  object 
 3   last        144352 non-null  object 
 4   gender      144352 non-null  object 
 5   street      144352 non-null  object 
 6   city        144352 non-null  object 
 7   state       144352 non-null  object 
 8   zip         1468 non-null    float64
 9   lat         144352 non-null  float64
 10  long        144352 non-null  float64
 11  city_pop    144352 non-null  int64  
 12  job         144352 non-null  object 
 13  dob         144352 non-null  object 
 14  acct_num    144352 non-null  int64  
 15  profile     144352 non-null  object 
 16  trans_num   144194 non-null  object 
 17  trans_date  144194 non-null  object 
 18  trans_time  144194 non-null  object 
 19  un

# **Data Preprocessing**

In [None]:
#Removes irrelevant or sensitive columns
#drops merch_lat, merch_long and zip due to small non-null dataset relative to other columns (1468 vs 14194, around 10% of total)
#drops ssn, cc_num, first, last, street, acct_num and dob as it is related to one specif person
#drops trans_num, trans_date, trans_time, merchan, lat and long as they are irrelevant
#City and state will be eliminated as we will use population as the variable equivalent to the city and state

cols_para_remover = ['ssn', 'trans_date', 'trans_time', 'cc_num', 'merchant', 'first', 'last',
                     'street', 'city', 'state', 'zip', 'lat', 'long', 'dob', 'trans_num',
                    'merch_lat', 'merch_long', 'acct_num']
treino = treino.drop(cols_para_remover, axis=1)
treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144352 entries, 0 to 144351
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   gender     144352 non-null  object 
 1   city_pop   144352 non-null  int64  
 2   job        144352 non-null  object 
 3   profile    144352 non-null  object 
 4   unix_time  144194 non-null  float64
 5   category   144194 non-null  object 
 6   amt        144194 non-null  float64
 7   is_fraud   144194 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 8.8+ MB


In [None]:
#Remove NA values
treino = treino.dropna()
treino.head()

Unnamed: 0,gender,city_pop,job,profile,unix_time,category,amt,is_fraud
0,M,115971,Counselling psychologist,adults_2550_male_urban.json,1690137000.0,misc_pos,2.27,0.0
1,F,166856,"Nurse, mental health",adults_50up_female_urban.json,1702516000.0,home,10.52,0.0
2,M,182305,Water quality scientist,adults_2550_male_urban.json,1690167000.0,misc_pos,149.01,0.0
3,F,61818,Health and safety inspector,adults_50up_female_urban.json,1706558000.0,personal_care,32.01,0.0
4,M,478127,Soil scientist,adults_50up_male_urban.json,1687073000.0,misc_pos,137.31,0.0


In [None]:
#Counts the number of unique values in each column of the training data.
valores_unicos = treino.nunique()
print (valores_unicos)

gender            2
city_pop        182
job             171
profile          10
unix_time    143549
category         14
amt           24487
is_fraud          2
dtype: int64


In the chart below, we can see that there is an undersampling for is_fraud which will need to be adjusted when applying machine learning model

In [None]:
#Creates a histogram for gender distribution
import plotly.express as px
px.histogram(treino, x = 'gender', text_auto = True,
             color = 'is_fraud', barmode = 'group')

In [None]:
#box plot for the transaction amount
px.box(treino, x = 'amt', color = 'is_fraud')

## **Machine Learning preprocessing**

In [None]:
#Converts gender values from categorical ('F', 'M') to numerical (0, 1)
treino['gender'] = treino['gender'].map({'F': 0, 'M': 1})
treino.head()

Unnamed: 0,gender,city_pop,job,profile,unix_time,category,amt,is_fraud
0,1,115971,Counselling psychologist,adults_2550_male_urban.json,1690137000.0,misc_pos,2.27,0.0
1,0,166856,"Nurse, mental health",adults_50up_female_urban.json,1702516000.0,home,10.52,0.0
2,1,182305,Water quality scientist,adults_2550_male_urban.json,1690167000.0,misc_pos,149.01,0.0
3,0,61818,Health and safety inspector,adults_50up_female_urban.json,1706558000.0,personal_care,32.01,0.0
4,1,478127,Soil scientist,adults_50up_male_urban.json,1687073000.0,misc_pos,137.31,0.0


In [None]:
#Separating Features and Target
x = treino.drop('is_fraud', axis=1)
y = treino['is_fraud']


In [None]:
# Combines the unique categories for 'job', 'profile', and 'category' columns across both training and testing dataframes
categorias_x = set(x['job'].unique()) | set(x['profile'].unique()) | set(x['category'].unique())
categorias_teste = set(teste['job'].unique()) | set(teste['profile'].unique()) | set(teste['category'].unique())
categorias_x = [list(categorias_x) for _ in range(3)]  # Três colunas para x: 'job', 'profile' e 'category'
categorias_teste = [list(categorias_teste) for _ in range(3)]

In [None]:
#Transform categorical features into numerical using one-hot encoding.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

colunas_x = x.columns #(keeps access to columns)
one_hot = make_column_transformer(
    (OneHotEncoder(categories=categorias_x, drop='if_binary'), ['job', 'profile', 'category']),
    remainder='passthrough',sparse_threshold=0)

x = one_hot.fit_transform(x)
x = pd.DataFrame(x, columns = one_hot.get_feature_names_out(colunas_x))
x.head()

Unnamed: 0,"onehotencoder__job_Scientist, product/process development",onehotencoder__job_Academic librarian,onehotencoder__job_Community arts worker,onehotencoder__job_Fisheries officer,onehotencoder__job_IT consultant,onehotencoder__job_Research scientist (maths),onehotencoder__job_Ranger/warden,"onehotencoder__job_Surveyor, insurance",onehotencoder__job_Graphic designer,"onehotencoder__job_Engineer, communications",...,onehotencoder__category_Ophthalmologist,onehotencoder__category_Historic buildings inspector/conservation officer,onehotencoder__category_Nutritional therapist,onehotencoder__category_Sports development officer,onehotencoder__category_Event organiser,onehotencoder__category_adults_2550_male_rural.json,remainder__gender,remainder__city_pop,remainder__unix_time,remainder__amt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,115971.0,1690137000.0,2.27
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,166856.0,1702516000.0,10.52
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,182305.0,1690167000.0,149.01
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61818.0,1706558000.0,32.01
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,478127.0,1687073000.0,137.31


In [None]:
#Standardize numerical features, ensuring they have zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
import pandas as pd

colunas_para_escalar = ['remainder__city_pop', 'remainder__unix_time','remainder__amt']
dados_selecionados = x[colunas_para_escalar]
dados_selecionados = dados_selecionados.astype(float)

scaler = StandardScaler()
scaler.fit(dados_selecionados)
dados_transformados = scaler.transform(dados_selecionados)

x[colunas_para_escalar] = dados_transformados

x.head()

Unnamed: 0,"onehotencoder__job_Scientist, product/process development",onehotencoder__job_Academic librarian,onehotencoder__job_Community arts worker,onehotencoder__job_Fisheries officer,onehotencoder__job_IT consultant,onehotencoder__job_Research scientist (maths),onehotencoder__job_Ranger/warden,"onehotencoder__job_Surveyor, insurance",onehotencoder__job_Graphic designer,"onehotencoder__job_Engineer, communications",...,onehotencoder__category_Ophthalmologist,onehotencoder__category_Historic buildings inspector/conservation officer,onehotencoder__category_Nutritional therapist,onehotencoder__category_Sports development officer,onehotencoder__category_Event organiser,onehotencoder__category_adults_2550_male_rural.json,remainder__gender,remainder__city_pop,remainder__unix_time,remainder__amt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.356577,-0.444355,-0.401898
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.262386,0.965398,-0.353895
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.233789,-0.440919,0.451911
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.456818,1.425583,-0.228855
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.313795,-0.793276,0.383834


In [None]:
#Balancing the dataset; oversample the minority class (fraudulent transactions)
from imblearn.over_sampling import SMOTE
smt = SMOTE(random_state=123)
x, y = smt.fit_resample(x, y)

In [None]:
#Train-Test Split: Splits the dataset into training and testing sets (75% training, 25% testing)
#Ensure the same proportion of fraudulent transactions in both sets (stratification).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25,
                                                    stratify = y, shuffle=True, random_state=42)


# **Machine Learning - apply models ( Model Training and Evaluation)**

In [None]:
#Dummy
from sklearn.dummy import DummyClassifier

dummy_stratified = DummyClassifier()
dummy_stratified.fit(X_train, y_train)

previsoes_dummy = dummy_stratified.predict(X_test)

acuracia_dummy = dummy_stratified.score(X_train, y_train)
precision_dummy = precision_score(y_test, previsoes_dummy, average='weighted')
print("A acurácia do dummy stratified foi %.2f%%" % (acuracia_dummy*100))
print("A precisao do dummy stratified foi %.2f%%" % (precision_dummy*100))

A acurácia do dummy stratified foi 50.00%
A precisao do dummy stratified foi 25.00%



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
#Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

modelo_dtc = DecisionTreeClassifier()

modelo_dtc.fit(X_train, y_train)

previsoes = modelo_dtc.predict(X_test)

acuracia_dtc = accuracy_score(y_test, previsoes)
precision_dtc = precision_score(y_test, previsoes, average='weighted')
print("Acurácia do modelo Decision Tree: %.2f%%" % (acuracia_dtc*100))
print("Precisao do modelo Decision Tree: %.2f%%" % (precision_dtc*100))

Acurácia do modelo Decision Tree: 99.75%
Precisao do modelo Decision Tree: 99.75%


In [None]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


modelo_rf = RandomForestClassifier()

modelo_rf.fit(X_train, y_train)

previsoes_rf = modelo_rf.predict(X_test)

acuracia_rf = accuracy_score(y_test, previsoes_rf)
precision_rf= precision_score(y_test, previsoes_rf, average='weighted')
print("Acurácia do modelo Random Forest Classifier: %.2f%%" % (acuracia_rf*100))
print("Precisao do modelo Random Forest Classifier: %.2f%%" % (precision_rf*100))

Acurácia do modelo Random Forest Classifier: 99.89%
Precisao do modelo Random Forest Classifier: 99.89%


In [None]:
#XGBClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

modelo_xgbc = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=3)

modelo_xgbc.fit(X_train, y_train)

previsoes_xgbc = modelo_xgbc.predict(X_test)


accuracy_xgbc = accuracy_score(y_test, previsoes_xgbc)
precision_xgbc = precision_score(y_test, previsoes_xgbc)
print("Acurácia do modelo XGBClassifier: %.2f%%" % (accuracy_xgbc*100))
print("Precisão do modelo XGBClassifier: %.2f%%" %(precision_xgbc*100))

Acurácia do modelo XGBClassifier: 96.26%
Precisão do modelo XGBClassifier: 98.32%


In [None]:
#Consolidate results and choose best model

ML = ['DecisionTreeClassifier' , 'DummyClassifier','RandomForest', 'XGBClassifier']
Precision = [precision_dtc, precision_dummy, precision_rf, precision_xgbc]

Resultados_ML=pd.DataFrame({'ML': ML, 'Accuracy':Accuracy, 'Precision':Precision})
Resultados_ML

Unnamed: 0,ML,Accuracy,Precision
0,DecisionTreeClassifier,0.997435,0.997491
1,DummyClassifier,0.500002,0.249993
2,RandomForest,0.998794,0.998851
3,XGBClassifier,0.962556,0.983188


-----BEST MODEL IS RANDOMFOREST-----

# **Preparing Test Data**

In [None]:
#Preparing dataset with final answer
resposta = pd.DataFrame()
resposta['trans_num'] =  teste['trans_num']

In [None]:
#Prepare test dataset changing gender and removing same columns as train dataset
teste['gender'] = teste['gender'].map({'F': 0, 'M': 1})
teste_nova = teste.drop(cols_para_remover, axis=1)
teste_nova.head()

Unnamed: 0,gender,city_pop,job,profile,unix_time,category,amt
0,0,927396,Lawyer,adults_50up_female_urban.json,1688900000.0,shopping_net,50.96
1,0,18943,Audiological scientist,adults_2550_female_urban.json,1694889000.0,entertainment,92.39
2,0,88746,Publishing copy,adults_2550_female_urban.json,1688672000.0,shopping_net,5.32
3,0,6351,Sports development officer,adults_50up_female_urban.json,1681106000.0,gas_transport,119.82
4,0,88746,Publishing copy,adults_2550_female_urban.json,1678969000.0,gas_transport,41.59


In [None]:
#Apply Onehot

teste_nova = one_hot.transform(teste_nova)
teste_nova = pd.DataFrame(teste_nova, columns=one_hot.get_feature_names_out(colunas_x))

In [None]:
# Apply StandardScaler
colunas_para_escalar = ['remainder__city_pop', 'remainder__unix_time','remainder__amt']


dados_selecionados = teste_nova[colunas_para_escalar]
dados_selecionados = dados_selecionados.astype(float)

scaler.fit(dados_selecionados)

dados_transformados = scaler.transform(dados_selecionados)
teste_nova[colunas_para_escalar] = dados_transformados

teste_nova.head()

Unnamed: 0,"onehotencoder__job_Scientist, product/process development",onehotencoder__job_Academic librarian,onehotencoder__job_Community arts worker,onehotencoder__job_Fisheries officer,onehotencoder__job_IT consultant,onehotencoder__job_Research scientist (maths),onehotencoder__job_Ranger/warden,"onehotencoder__job_Surveyor, insurance",onehotencoder__job_Graphic designer,"onehotencoder__job_Engineer, communications",...,onehotencoder__category_Ophthalmologist,onehotencoder__category_Historic buildings inspector/conservation officer,onehotencoder__category_Nutritional therapist,onehotencoder__category_Sports development officer,onehotencoder__category_Event organiser,onehotencoder__category_adults_2550_male_rural.json,remainder__gender,remainder__city_pop,remainder__unix_time,remainder__amt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.136673,-0.578071,-0.121705
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.536858,0.10177,0.106432
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.408268,-0.603907,-0.373025
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.560055,-1.462835,0.257478
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.408268,-1.705409,-0.173302


In [None]:
#Matching Column Names: Ensures that the test data has the same columns as the training data, as the model was trained on those specific columns.
colunas_tabela1 = set(X_train.columns)
colunas_tabela2 = set(teste_nova.columns)

colunas_faltantes = colunas_tabela1 - colunas_tabela2

print("Colunas da tabela1 que não estão na tabela2:")
print(colunas_faltantes)

Colunas da tabela1 que não estão na tabela2:
set()


# **Machine Learning on test dataset**

In [None]:
#Prediction on Test Data: using best model
previsoes_teste = modelo_rf.predict(teste_nova)
proba = modelo_rf.predict_proba(teste_nova)

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


# **Prepare and Submit file**

In [None]:
resposta['is_fraud'] = proba[:, 1]
resposta.head()

Unnamed: 0,trans_num,is_fraud
0,a4194096c6cc870b2e21b2b2f69a7706,0.0
1,f12645668a1a5a9f8192687f56095b5a,0.04
2,7f8e03dcf31fbbb5a3547ed8c40fa54b,0.0
3,5f3cb674358918b82dbb5c12e60615ef,0.01
4,a6537f4cfe7b5831fa481d520621b72d,0.0


In [None]:
resposta.to_csv('submission.csv',index=False)