In [1]:
import pandas as pd

In [2]:
# Load the raw credit-card transactions CSV into a pandas DataFrame.
# NOTE(review): this path uses 'Data/' while the Spark read further down uses
# 'data/'; on a case-sensitive filesystem only one of them can resolve - confirm.
data_raw = pd.read_csv('Data/creditcard.csv')

In [4]:
# Show summary statistics (count, mean, std, min, quartiles, max) for each column.
data_raw.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [1]:
# Initialise findspark and start (or reuse) the SparkSession for this notebook.
# findspark.init() is kept ahead of the pyspark import, as in the original
# ordering: it is what makes pyspark importable when it is not already on sys.path.
import findspark

findspark.init()

from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("TFM")
spark = session_builder.getOrCreate()

In [82]:
#Importamos la librerias
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler 
from pyspark.sql.functions import *
from sklearn.metrics import classification_report, confusion_matrix
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel

In [3]:
# Load the credit-card transactions CSV into a Spark DataFrame.
# The first row is used as the header and column types are inferred from the data.
df = spark.read.csv(
    "data/creditcard.csv",
    header=True,
    inferSchema=True,
)

In [13]:
# Balance the classes by undersampling the majority class (Class == 0) down to
# roughly the size of the minority class (Class == 1).
SAMPLING_SEED = 123

df_label_1 = df.where(col("Class") == 1)
df_label_0_all = df.where(col("Class") == 0)

n_label_1 = df_label_1.count()
n_label_0 = df_label_0_all.count()

# The fraction is applied to the Class == 0 rows only, so it has to be relative
# to THEIR count (not to the total row count) for the two classes to end up the
# same expected size. It is clamped to 1.0 because sampling without replacement
# cannot return more rows than exist, and guarded against an empty majority class.
fraction_label_0 = min(1.0, n_label_1 / n_label_0) if n_label_0 else 0.0
df_label_0 = df_label_0_all.sample(False, fraction_label_0, SAMPLING_SEED)

# Shuffle with a SEEDED rand(): an unseeded rand() gives a different row order
# every time the lineage is re-evaluated, which defeats the seed passed to
# randomSplit() later and lets separate actions see different data.
# cache() keeps the shuffled result so that later actions reuse the same rows
# (best effort: cached partitions can still be evicted and recomputed).
df_final = (
    df_label_1
    .union(df_label_0)
    .orderBy(rand(seed=SAMPLING_SEED))
    .cache()
)

In [32]:
# Select the columns used to train the model: every column whose name starts
# with "V". The loop variable is called `column_name` so that it does not read
# like the `col` function imported from pyspark.sql.functions.
feature_columns = [
    column_name
    for column_name in df_final.columns
    if column_name.startswith("V")
]

# Assemble the selected columns into a single vector column named "features".
vectorizer = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [36]:
# Split the balanced DataFrame into train and test partitions with weights
# 0.8 / 0.2, using a fixed seed for the split.
df_train, df_test = df_final.randomSplit(weights=[0.8, 0.2], seed=123)

#### RANDOMFORESTCLASSIFIER

In [37]:
from pyspark.ml.classification import RandomForestClassifier

In [47]:
# Declare the estimator: a random forest that predicts the "Class" column,
# with the "auto" feature-subset strategy and a fixed seed.
est = RandomForestClassifier(
    featureSubsetStrategy="auto",
    labelCol="Class",
    seed=42,
)
# Keep the estimator as the cell's last expression so its repr is still displayed.
est

RandomForestClassifier_2da7f86fd666

In [48]:
# Chain the vectorizer and the estimator into a single pipeline ...
pipeline = Pipeline(stages=[vectorizer, est])

# ... and fit it on the training partition.
model = pipeline.fit(df_train)

In [49]:
# Run the fitted pipeline on the test partition to obtain the predictions.
df_test_pred = model.transform(df_test)

In [50]:
# Print the per-class precision / recall / F1 report for the test predictions.
# Label and prediction are fetched in ONE collect() so each true label stays
# paired with the prediction of the same row. Two separate collect() calls each
# re-run the whole (randomly ordered) lineage and are not guaranteed to return
# rows in the same order.
labelled_rows = df_test_pred.select("Class", "prediction").collect()

# Flatten the Rows into plain integer lists (predictions come back as 0.0 / 1.0).
y_true = [int(row["Class"]) for row in labelled_rows]
y_pred = [int(row["prediction"]) for row in labelled_rows]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       101
           1       0.98      0.91      0.94        90

    accuracy                           0.95       191
   macro avg       0.95      0.95      0.95       191
weighted avg       0.95      0.95      0.95       191



In [59]:
# Set up hyper-parameter tuning with CrossValidator.

# Search grid over the random forest parameters:
# 3 maxBins x 2 impurity x 3 maxDepth = 18 candidate configurations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(est.maxBins, [10, 50, 100])
    .addGrid(est.impurity, ["entropy", "gini"])
    .addGrid(est.maxDepth, [4, 6, 8])
    .build()
)

# Candidates are ranked by area under the ROC curve against the "Class" label.
evaluator = (
    BinaryClassificationEvaluator()
    .setLabelCol("Class")
    .setMetricName("areaUnderROC")
)

# 2-fold cross-validation over the whole pipeline (vectorizer + estimator).
# The seed fixes the random assignment of rows to folds; without it every run
# can pick a different "best" model from the same data.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    seed=42,
)

In [60]:
# Fit the cross-validator on the training partition, then predict on the test partition.
cvModel = crossval.fit(df_train)
# NOTE: this rebinds df_test_pred, replacing the predictions of the un-tuned pipeline.
df_test_pred = cvModel.transform(df_test)

In [64]:
# Print the classification report for the cross-validated model's predictions;
# the metrics improve slightly over the un-tuned pipeline.
# Label and prediction are fetched in ONE collect() so each true label stays
# paired with the prediction of the same row. Two separate collect() calls each
# re-run the whole (randomly ordered) lineage and are not guaranteed to return
# rows in the same order.
labelled_rows = df_test_pred.select("Class", "prediction").collect()

# Flatten the Rows into plain integer lists (predictions come back as 0.0 / 1.0).
y_true = [int(row["Class"]) for row in labelled_rows]
y_pred = [int(row["prediction"]) for row in labelled_rows]

print(classification_report(y_true, y_pred))

In [65]:
# Re-prints the same report as the previous cell (y_true / y_pred are unchanged).
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       101
           1       1.00      0.91      0.95        90

    accuracy                           0.96       191
   macro avg       0.96      0.96      0.96       191
weighted avg       0.96      0.96      0.96       191



In [80]:
# Save the best pipeline found by cross-validation so it can be exported to the
# Spark Streaming file. Note that `model` is rebound here to the tuned model.
# NOTE(review): the TypeError recorded in this cell's saved output does not match
# the code (bestModel is read as an attribute, not called); it presumably comes
# from an earlier run - TODO re-run the cell and clear the stale output.
model = cvModel.bestModel
model_writer = model.write().overwrite()  # replace any model already stored at the path
model_writer.save("credit-model3")

TypeError: 'PipelineModel' object is not callable