In [1]:
# Importamos y iniciamos nuestro findspark que nos permitirá iniciar indicarle a nuestro sistema Spark que 
# agrege Pyspark a sys.path
import findspark
findspark.init()

In [2]:
# Iniciamos una sesion en los cluster de Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("TFM").getOrCreate()

In [32]:
# Importaremos todas las librerias necesarias para realizar una EDA y entrenar nuestro modelo
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
#Importamos los datos 
df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("data/creditcard.csv")
df_pandas = pd.read_csv("data/creditcard.csv")

In [5]:
# Monstramos las primeras cinco lineas 
df_pandas.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
df.groupBy("Class").count().show()

+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [64]:
# Realizamos un undersampling de los datos con el fin de nivelar la variable objetivo
df_label_1 = df.where(col("Class")==1)
df_label_0 = df.where(col("Class")==0).sample(False,df_label_1.count()/df.count(),123)
df_final = df_label_1.union(df_label_0).orderBy(rand())

In [65]:
#Obtenemos las columnas que introduciremos en nuestro modelo y creamos nuestro VectorAssamble y nuestro Scaler
features_columns = [col for col in df_final.columns if col.startswith("V")]
vectorizer = VectorAssembler(inputCols= features_columns, outputCol="feature_vector")
scaler = StandardScaler(inputCol="feature_vector", outputCol="features")

In [66]:
df_train, df_test = df_final.randomSplit([0.8,0.2], seed=123)

In [67]:
# Declaramos nuestro modelo 
est = RandomForestClassifier()
est.setFeatureSubsetStrategy("auto")
est.setLabelCol("Class")
est.setSeed(42)

RandomForestClassifier_ee3fa05afd20

In [68]:
# Conectamos mediante una tuberia nuestro vectorizer, scaler y modelo 
pipeline = Pipeline()
pipeline.setStages([vectorizer,scaler,est])
#Entrenamos a nuestro algoritmo
model= pipeline.fit(df_train)

In [69]:
df_test_pred = model.transform(df_test)

In [70]:
y_true=df_test_pred.select(['Class']).collect()
y_pred = df_test_pred.select(['prediction']).collect()
print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       101
           1       0.98      0.91      0.94        90

    accuracy                           0.95       191
   macro avg       0.95      0.95      0.95       191
weighted avg       0.95      0.95      0.95       191



In [None]:
# Comenzamos a tunear nuestro CrossValidator

In [71]:
paramGrid = ParamGridBuilder()\
            .addGrid(est.impurity,["entropy","gini"])\
            .addGrid(est.maxDepth,[2,5,8,10])\
            .build()
evaluator = BinaryClassificationEvaluator()\
            .setLabelCol("Class")\
            .setMetricName("areaUnderROC")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

In [72]:
cvModel = crossval.fit(df_train)
df_test_pred = cvModel.transform(df_test)

In [73]:
y_true=df_test_pred.select(['Class']).collect()
y_pred = df_test_pred.select(['prediction']).collect()
print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       101
           1       1.00      0.92      0.96        90

    accuracy                           0.96       191
   macro avg       0.97      0.96      0.96       191
weighted avg       0.97      0.96      0.96       191



In [80]:
print(confusion_matrix(y_true,y_pred))

[[101   0]
 [  7  83]]


In [74]:
cvModel.bestModel.stages[2].extractParamMap()


{Param(parent='RandomForestClassifier_ee3fa05afd20', name='bootstrap', doc='Whether bootstrap samples are used when building trees.'): True,
 Param(parent='RandomForestClassifier_ee3fa05afd20', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,
 Param(parent='RandomForestClassifier_ee3fa05afd20', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,
 Param(parent='RandomForestClassifier_ee3fa05afd20', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supp

In [75]:
best_model= cvModel.bestModel
model.write().overwrite().save("credit-model")