<a href="https://colab.research.google.com/github/guillermocadena/niche_of_fraudsters/blob/develop/nicho_de_fraudes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Nicho de Fraudes</h>

## Paso 1. Cargar librerías y módulos requerdidos


In [139]:
import numpy as np   # Biblioteca para manejar matrices y operaciones de matrices
import pandas as pd  # Biblioteca para manejar tablas de datos.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Visualization
import matplotlib.pyplot as plt

## Paso 2. Lectura de datos.

### 2.1 Imortación a un DataFrame

In [140]:
datos = pd.read_csv("https://raw.githubusercontent.com/guillermocadena/niche_of_fraudsters/refs/heads/main/FraudeCanastas.csv")

Validación de tipo de dato

In [141]:
type(datos)

<h2>Funcion para describir los datos de DataFrame</h2>

In [142]:
def describe_datos(df):
    unicos =[]
    for col in df:
        unicos.append(df[col].unique())
    unicos = pd.Series(unicos, index=df.columns)
    descripcion = pd.concat([df.dtypes, df.isna().sum(),df.nunique(),unicos], axis=1)
    descripcion.columns = ['dtypes', 'null','nunique','unique' ]
    return(descripcion)

In [143]:
describe_datos(datos)

Unnamed: 0,dtypes,null,nunique,unique
ID,int64,0,9319,"[130, 195, 217, 552, 854, 855, 856, 941, 1157,..."
APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90,float64,0,2,"[0.0, 1000.0]"
AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H,float64,0,2,"[0.0, 20.0]"
AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE,float64,0,8,"[0.0, 125.0, 119.0, 120.0, 500.0, 129.0, 109.0..."
AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE 2ND GENERATI,float64,0,8,"[0.0, 109.0, 100.0, 105.0, 104.0, 99.0, 119.0,..."
...,...,...,...,...
costo_total,int64,0,1639,"[1299, 4119, 2806, 1206, 1807, 1263, 942, 1199..."
costo_medio_item,float64,0,2034,"[649.5, 1373.0, 1403.0, 603.0, 66.925925925925..."
costo_item_max,int64,0,540,"[1299, 2470, 2799, 1199, 195, 280, 938, 929, 1..."
costo_item_min,float64,0,528,"[0.0, 7.0, 4.0, 1249.0, 25.0, 2470.0, 999.0, 1..."


In [144]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9319 entries, 0 to 9318
Columns: 2457 entries, ID to fraud_flag
dtypes: float64(2452), int64(5)
memory usage: 174.7 MB


Tamaño del DataFrame

In [145]:
datos.shape

(9319, 2457)

Primeros 5 valores del DataFrame

In [146]:
datos.head()

Unnamed: 0,ID,APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90,AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE 2ND GENERATI,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE 2ND,AUDIO ACCESSORIES | APPLE | 2021 APPLE AIRPODS WITH MAGSAFE CHARGING CASE 3RD,AUDIO ACCESSORIES | APPLE | AIRPODS PRO,AUDIO ACCESSORIES | APPLE | APPLE AIRPODS MAX,...,WOMEN S NIGHTWEAR | ANYDAY RETAILER | ANYDAY RETAILER LEOPARD PRINT JERSEY PY,WOMEN S NIGHTWEAR | RETAILER | RETAILER CLEO VELOUR JOGGER LOUNGE PANT,WOMEN S NIGHTWEAR | SOSANDAR | SOSANDAR ZEBRA PRINT PYJAMA BOTTOMS BLACK 10,Nb_of_items,total_of_items,costo_total,costo_medio_item,costo_item_max,costo_item_min,fraud_flag
0,130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2,2,1299,649.5,1299,0.0,1.0
1,195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3,3,4119,1373.0,2470,0.0,1.0
2,217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2,2,2806,1403.0,2799,7.0,1.0
3,552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2,2,1206,603.0,1199,7.0,1.0
4,854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,19,27,1807,66.925926,195,4.0,1.0


1 - Fraudes </br>
0 - No Fraudes


Cantidades de filas con Fraude y sin Fraude

In [147]:
datos['fraud_flag'].value_counts()

Unnamed: 0_level_0,count
fraud_flag,Unnamed: 1_level_1
0.0,8000
1.0,1319


Validación de columnas numérias

In [148]:
numericas_cols = datos.select_dtypes('number').columns
numericas_cols

Index(['ID', 'APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90',
       'AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE 2ND GENERATI',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE 2ND',
       'AUDIO ACCESSORIES | APPLE | 2021 APPLE AIRPODS WITH MAGSAFE CHARGING CASE 3RD',
       'AUDIO ACCESSORIES | APPLE | AIRPODS PRO',
       'AUDIO ACCESSORIES | APPLE | APPLE AIRPODS MAX',
       ...
       'WOMEN S NIGHTWEAR | ANYDAY RETAILER | ANYDAY RETAILER LEOPARD PRINT JERSEY PY',
       'WOMEN S NIGHTWEAR | RETAILER | RETAILER CLEO VELOUR JOGGER LOUNGE PANT',
       'WOMEN S NIGHTWEAR | SOSANDAR | SOSANDAR ZEBRA PRINT PYJAMA BOTTOMS BLACK 10',
       'Nb_of_items', 'total_of_items', 'costo_tot

<h4>Comparación de 2 caracteristicas con respecto a los fraudes</h4>

In [149]:
pd.crosstab(datos['APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90'], datos['fraud_flag'], normalize='index').round(2).sort_values(by=1,  ascending=False)

fraud_flag,0.0,1.0
APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.86,0.14
1000.0,1.0,0.0


In [150]:
pd.crosstab(datos['AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H'], datos['fraud_flag'], normalize='index').round(2).sort_values(by=1,  ascending=False)

fraud_flag,0.0,1.0
AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.86,0.14
20.0,1.0,0.0


Validación de columnas únicas

In [169]:
unique_col = datos.nunique()
unique_col

Unnamed: 0,0
ID,9319
APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90,2
AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H,2
AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE,8
AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE 2ND GENERATI,8
...,...
costo_total,1639
costo_medio_item,2034
costo_item_max,540
costo_item_min,528


In [171]:
unique_col.index

Index(['ID', 'APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90',
       'AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE 2ND GENERATI',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE',
       'AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE 2ND',
       'AUDIO ACCESSORIES | APPLE | 2021 APPLE AIRPODS WITH MAGSAFE CHARGING CASE 3RD',
       'AUDIO ACCESSORIES | APPLE | AIRPODS PRO',
       'AUDIO ACCESSORIES | APPLE | APPLE AIRPODS MAX',
       ...
       'WOMEN S NIGHTWEAR | ANYDAY RETAILER | ANYDAY RETAILER LEOPARD PRINT JERSEY PY',
       'WOMEN S NIGHTWEAR | RETAILER | RETAILER CLEO VELOUR JOGGER LOUNGE PANT',
       'WOMEN S NIGHTWEAR | SOSANDAR | SOSANDAR ZEBRA PRINT PYJAMA BOTTOMS BLACK 10',
       'Nb_of_items', 'total_of_items', 'costo_tot

Establecimiento de columna correspondiente a mis etiquetas

In [152]:
y = datos['fraud_flag']
y

Unnamed: 0,fraud_flag
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
9314,0.0
9315,0.0
9316,0.0
9317,0.0


In [172]:
vectorizer = TfidfVectorizer()
BOW = vectorizer.fit_transform(datos.select_dtypes('number').columns)
BOW.shape

(2457, 3398)

In [160]:
print(BOW)

  (0, 776)	1.0
  (1, 145)	0.6064919236546327
  (1, 1259)	0.795089646858581
  (2, 161)	0.6996191911645517
  (2, 109)	0.12911903497781593
  (2, 684)	0.287250358649955
  (2, 120)	0.26381693117570865
  (2, 1517)	0.28304130234446945
  (2, 1617)	0.2345247536358639
  (2, 228)	0.2345247536358639
  (2, 781)	0.22493882350390823
  (2, 523)	0.3178608927708131
  (3, 145)	0.4079273702153572
  (3, 161)	0.33683673564735234
  (3, 109)	0.18649588862536398
  (3, 34)	0.40881686582835264
  (3, 122)	0.3981611768366422
  (3, 1619)	0.2740396182650299
  (3, 345)	0.40881686582835264
  (3, 317)	0.34272884760877165
  (4, 145)	0.37678473319141925
  (4, 161)	0.31112141238023333
  (4, 109)	0.17225812428302464
  (4, 34)	0.3776063215223039
  (4, 122)	0.36776412600697783
  :	:
  (2446, 1298)	0.40098772587500514
  (2446, 778)	0.34362680022099584
  (2446, 1289)	0.40098772587500514
  (2446, 628)	0.34362680022099584
  (2446, 244)	0.42944623213280664
  (2446, 1622)	0.3030560477845829
  (2447, 1221)	0.22975071495566007
  (24

In [173]:
palabras = vectorizer.get_feature_names_out()
palabras

array(['01', '02', '03', ..., 'zola', 'zoom', 'zpod'], dtype=object)

In [183]:
X = datos.drop('fraud_flag', axis=1)

In [184]:
X

Unnamed: 0,ID,APPLE PRODUCTDESCRIPTION | SAMSUNG | MODEL90,AUDIO ACCESSORIES | AB AUDIO | AB AUDIO GO AIR TRUE WIRELESS BLUETOOTH IN-EAR H,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH CHARGING CASE 2ND GENERATI,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE,AUDIO ACCESSORIES | APPLE | 2019 APPLE AIRPODS WITH WIRELESS CHARGING CASE 2ND,AUDIO ACCESSORIES | APPLE | 2021 APPLE AIRPODS WITH MAGSAFE CHARGING CASE 3RD,AUDIO ACCESSORIES | APPLE | AIRPODS PRO,AUDIO ACCESSORIES | APPLE | APPLE AIRPODS MAX,...,WOMEN S FOOTWEAR | UGG | UGG CLASSIC MINI II SHORT SHEEPSKIN BOOTS CHESTNUT,WOMEN S NIGHTWEAR | ANYDAY RETAILER | ANYDAY RETAILER LEOPARD PRINT JERSEY PY,WOMEN S NIGHTWEAR | RETAILER | RETAILER CLEO VELOUR JOGGER LOUNGE PANT,WOMEN S NIGHTWEAR | SOSANDAR | SOSANDAR ZEBRA PRINT PYJAMA BOTTOMS BLACK 10,Nb_of_items,total_of_items,costo_total,costo_medio_item,costo_item_max,costo_item_min
0,130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2,2,1299,649.500000,1299,0.0
1,195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3,3,4119,1373.000000,2470,0.0
2,217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2,2,2806,1403.000000,2799,7.0
3,552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2,2,1206,603.000000,1199,7.0
4,854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19,27,1807,66.925926,195,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9314,110381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,369,369.000000,369,369.0
9315,111244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16,20,2667,133.350000,423,15.0
9316,44253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,849,849.000000,849,849.0
9317,87960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2,2,1906,953.000000,1899,7.0


In [185]:
X.shape

(9319, 2456)

Segmentación de conjunes de entrenamiento y test

In [186]:
# Dividir datos en entrenamiento y prueba
X_train, X_test , y_train, y_test = train_test_split(X, y,
                                                     train_size=0.9,
                                                     shuffle=True,
                                                     random_state=9
                                                     )

Entrenamiento

In [187]:
clasificador_rl = LogisticRegression(penalty= 'l2',
                                     random_state=42,
                                     solver="newton-cg").fit(X_train, y_train)

