In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.functions import vector_to_array
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col
from math import *

In [3]:
spark = SparkSession.builder.master("local[1]")\
          .appName("sunshine_v2")\
          .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [3]:
def From_pyparquet_to_pandascsv(load_path, save_path) :
    df = pq.read_table(load_path).to_pandas()
    df.to_csv(save_path, index = False, sep = ";" )
    return os.path.isfile(save_path)

#### Loadin Yeo-Johnson transformed Data

In [4]:
df_transf = spark.read.option("header", True).option("delimiter", ";").csv("/Users/youssouf/Documents/simboxv2/transformed_var.csv")
df_transf.show(5)


+------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+-------------+------------+-----------+----------+---------+----------+-----------+---------+----------+------+
|            valeur|       vol_tot_data|vol_tot_voix_offnet| vol_tot_voix_onet|       nbre_sms_out|  nbre_dist_sms_out|      nbre_voix_out|            dur_out| nbre_dist_voix_out|      nbre_cell_out|       nbre_sms_in|   nbre_dist_sms_in|        nbre_voix_in|              dur_in|   nbre_dist_voix_in|    nbre_sms_night| nbre_dist_sms_night|     nbre_voix_night|           dur_night|nbre_dist_voix_night|       msisdn|distributeur|statut_voix|statut_sms|statut_ff|statut_sva|statut_da

In [5]:
tranf_col_selected = ["valeur"  ,             "vol_tot_data"  ,       "vol_tot_voix_offnet",  "vol_tot_voix_onet"   ,
"nbre_sms_out"   ,      "nbre_dist_sms_out"  ,  "nbre_voix_out"   ,     "dur_out"    ,         
"nbre_dist_voix_out" ,  "nbre_cell_out"  ,      "nbre_sms_in"    ,      "nbre_dist_sms_in"   , 
"nbre_voix_in"  ,       "dur_in"     ,          "nbre_dist_voix_in" ,   "nbre_sms_night"    ,  
"nbre_dist_sms_night" , "nbre_voix_night"   ,   "dur_night"    ,        "nbre_dist_voix_night"]

In [6]:
df_transf.select("valeur","vol_tot_data","vol_tot_voix_offnet","vol_tot_voix_onet","nbre_sms_out").show(5)

+------------------+-------------------+-------------------+------------------+-------------------+
|            valeur|       vol_tot_data|vol_tot_voix_offnet| vol_tot_voix_onet|       nbre_sms_out|
+------------------+-------------------+-------------------+------------------+-------------------+
|0.6644666432044756|-1.2052775241458586| 0.5384651778295945|1.1105268224333997|-0.8861167378223557|
|0.6644666432044756|-1.2052775241458586| 0.5384651778295945|1.1105268224333997|  0.248944631389864|
|0.6644666432044756|-1.2052775241458586| 0.5384651778295945|1.1105268224333997|-0.8861167378223557|
|0.6644666432044756|-1.2052775241458586| 0.5384651778295945|1.1105268224333997|-0.8861167378223557|
|0.6644666432044756|-1.2052775241458586| 0.5384651778295945|1.1105268224333997|-0.8861167378223557|
+------------------+-------------------+-------------------+------------------+-------------------+
only showing top 5 rows



#### Yeo-Jonhson Function

In [115]:
def yeojohnson(x, lmbda) : 
    if (x >= 0, lmbda != 0) :
        return ((x + 1)**lmbda - 1) / lmbda
    elif (x >= 0, lmbda == 0) :
        return log(x + 1)
    elif (x < 0, lmbda != 2) :
        return -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda)
    elif (x < 0, lmbda == 2) : 
        return -log(-x + 1)

In [116]:
yeojohnson(2,3)

8.666666666666666

In [None]:
   valeur          vol_tot_data    vol_tot_voix_offnet vol_tot_voix_onet nbre_sms_out   
  0.2362504       0.09039554      0.1051099           0.1859025         -0.7436904     
     nbre_dist_sms_out nbre_voix_out   dur_out         nbre_dist_voix_out nbre_cell_out  
 -0.9938551        -1.231552       -0.4054642      -1.499216          -1.28494       
     nbre_sms_in     nbre_dist_sms_in nbre_voix_in    dur_in          nbre_dist_voix_in
 -1.245326       -1.08071         -4.999952       -4.99994        -4.99994         
     nbre_sms_night  nbre_dist_sms_night nbre_voix_night dur_night       nbre_dist_voix_night
 -4.99994        -4.99994            -4.999959       -4.99994        -4.999943 

In [104]:
yeojohnsonUDF = udf(lambda z,y: yeojohnson(z,y), DoubleType()) 

##### Loading original dataframe

In [117]:
df_origin = spark.read.option("header", True).option("delimiter", ";").csv("/Users/youssouf/Documents/simboxv2/simbox_v4_3.csv", sep = ";")

df_origin.show(5)


+-------------+------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+------------+-----------------+-------------+-------+------------------+-------------+-----------+----------------+------------+------+-----------------+--------------+-------------------+---------------+---------+--------------------+------+
|       msisdn|distributeur|statut_voix|statut_sms|statut_ff|statut_sva|statut_data|statut_om|valeur|vol_tot_data|vol_tot_voix_offnet|vol_tot_voix_onet|date_appel|nbre_sms_out|nbre_dist_sms_out|nbre_voix_out|dur_out|nbre_dist_voix_out|nbre_cell_out|nbre_sms_in|nbre_dist_sms_in|nbre_voix_in|dur_in|nbre_dist_voix_in|nbre_sms_night|nbre_dist_sms_night|nbre_voix_night|dur_night|nbre_dist_voix_night|statut|
+-------------+------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+------------+------------

In [118]:
df_origin = df_origin.replace('NULL',str(0))
df_origin = df_origin.replace('null' ,str(0))
df_origin = df_origin.na.fill(str(0))
df_origin = df_origin.fillna(str(0))


In [119]:
for cl in tranf_col_selected:
    df_origin = df_origin.withColumn(cl, col(cl).cast(DoubleType()))

#### Apply Yeo Johnson

In [120]:
df_origin1 = df_origin.withColumn("valeur", yeojohnsonUDF(col("valeur"), lit(0.2362504) )) 
df_origin1.select("valeur").show(5)


+-----------------+
|           valeur|
+-----------------+
|31.40339241303429|
|31.40339241303429|
|31.40339241303429|
|31.40339241303429|
|31.40339241303429|
+-----------------+
only showing top 5 rows



In [132]:
df_origin1.filter(col("valeur").isNull()).show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-104-bdc4338feb14>", line 1, in <lambda>
  File "<ipython-input-102-b634abc36d11>", line 2, in yeojohnson
TypeError: '>' not supported between instances of 'NoneType' and 'int'


In [121]:
df_origin.select("valeur").show(5)

+------+
|valeur|
+------+
|8249.0|
|8249.0|
|8249.0|
|8249.0|
|8249.0|
+------+
only showing top 5 rows



In [122]:
df_transf.select("valeur").show(5)

+------------------+
|            valeur|
+------------------+
|0.6644666432044756|
|0.6644666432044756|
|0.6644666432044756|
|0.6644666432044756|
|0.6644666432044756|
+------------------+
only showing top 5 rows



In [None]:
Array("statut_voix", "statut_sms", "statut_ff", "statut_sva", "statut_data", "statut_om") 


In [111]:
#df_origin1.count()

In [123]:
df = df_origin1.select("valeur")

In [124]:
df.createOrReplaceTempView("df1")

In [125]:
spark.sql("select distinct valeur from df1").show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-104-bdc4338feb14>", line 1, in <lambda>
  File "<ipython-input-102-b634abc36d11>", line 2, in yeojohnson
TypeError: '>' not supported between instances of 'NoneType' and 'int'


In [54]:
#df_origin1_tmp = df_origin1.select(
#    _mean(col("valeur")).alias("mean"),
#    _stddev(col("valeur")).alias("std") ).collect()


In [None]:
df_origin1.sel

In [53]:
df_origin1.select("valeur1").write.mode("overwrite").format("parquet").save("/Users/youssouf/Documents/simboxv2/valeur.parquet")




Py4JJavaError: An error occurred while calling o485.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 62.0 failed 1 times, most recent failure: Lost task 0.0 in stage 62.0 (TID 461) (nogon-bacula-fd.dev01.smile.lan executor driver): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-46-bdc4338feb14>", line 1, in <lambda>
  File "<ipython-input-44-3d063221ec97>", line 2, in yeojohnson
TypeError: '>=' not supported between instances of 'NoneType' and 'int'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:84)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:67)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:277)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1473)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)
	... 9 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:200)
	... 32 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-46-bdc4338feb14>", line 1, in <lambda>
  File "<ipython-input-44-3d063221ec97>", line 2, in yeojohnson
TypeError: '>=' not supported between instances of 'NoneType' and 'int'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:84)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:67)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:277)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1473)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)
	... 9 more


In [52]:
df_valeur = spark.read.parquet("/Users/youssouf/Documents/simboxv2/valeur.parquet")
df_valeur.show(5)


+------+
|valeur|
+------+
|3900.0|
|3900.0|
|3900.0|
|3900.0|
|3900.0|
+------+
only showing top 5 rows



In [17]:
df_origin.select("valeur").printSchema()

root
 |-- valeur: double (nullable = true)



In [50]:
df_origin.select("valeur").describe().show()

+-------+------------------+
|summary|            valeur|
+-------+------------------+
|  count|           5317699|
|   mean| 6792.208909906333|
| stddev|12711.033593267775|
|    min|               0.0|
|    max|          567852.0|
+-------+------------------+



In [51]:
df_origin1.select("valeur1").describe().show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-46-bdc4338feb14>", line 1, in <lambda>
  File "<ipython-input-44-3d063221ec97>", line 2, in yeojohnson
TypeError: '>=' not supported between instances of 'NoneType' and 'int'


In [48]:

df_origin1 = df_origin1.withColumn("valeur", col("valeur").cast(DoubleType()))

df_origin1.select("valeur").describe().show()


PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-43-941b3e4d3161>", line 1, in <lambda>
  File "<ipython-input-7-3d063221ec97>", line 2, in yeojohnson
TypeError: '>=' not supported between instances of 'NoneType' and 'int'


In [40]:
df_origin.select("valeur").describe().show()

+-------+------------------+
|summary|            valeur|
+-------+------------------+
|  count|           5317699|
|   mean| 6792.208909906333|
| stddev|12711.033593267775|
|    min|               0.0|
|    max|          567852.0|
+-------+------------------+



In [27]:


mean = df_origin1_tmp[0]['mean']
std = df_origin1_tmp[0]['std']

print(mean)
print(std)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-14-b04124869315>", line 1, in <lambda>
  File "<ipython-input-7-3d063221ec97>", line 2, in yeojohnson
TypeError: '>=' not supported between instances of 'NoneType' and 'int'


In [133]:
df = spark.read.\
       option("header","true").\
       option("delimiter",";").\
       csv("/Users/youssouf/Documents/simboxv2/transformed_var.csv")

In [134]:
inputcol = df.columns
inputcol.remove("date_appel")
inputcol.remove("statut")
inputcol.remove("msisdn")
inputcol.remove("distributeur")

In [135]:
inputcol

['valeur',
 'vol_tot_data',
 'vol_tot_voix_offnet',
 'vol_tot_voix_onet',
 'nbre_sms_out',
 'nbre_dist_sms_out',
 'nbre_voix_out',
 'dur_out',
 'nbre_dist_voix_out',
 'nbre_cell_out',
 'nbre_sms_in',
 'nbre_dist_sms_in',
 'nbre_voix_in',
 'dur_in',
 'nbre_dist_voix_in',
 'nbre_sms_night',
 'nbre_dist_sms_night',
 'nbre_voix_night',
 'dur_night',
 'nbre_dist_voix_night',
 'statut_voix',
 'statut_sms',
 'statut_ff',
 'statut_sva',
 'statut_data',
 'statut_om']

In [136]:
len(inputcol)

26

In [137]:
columns_to_scaled = list(("valeur"  , "vol_tot_data"  , "vol_tot_voix_offnet",  "vol_tot_voix_onet" ,
          "nbre_sms_out"   ,      "nbre_dist_sms_out"  ,  "nbre_voix_out"   ,     "dur_out"    ,
          "nbre_dist_voix_out" ,  "nbre_cell_out"  ,      "nbre_sms_in"    ,      "nbre_dist_sms_in"   ,
          "nbre_voix_in"  ,       "dur_in"     ,          "nbre_dist_voix_in" ,   "nbre_sms_night"    ,
          "nbre_dist_sms_night" , "nbre_voix_night"   ,   "dur_night"    ,        "nbre_dist_voix_night"))

In [138]:
len(columns_to_scaled)

20

In [139]:
columns_not_to_scaled = list(("statut_voix", "statut_sms", "statut_ff", "statut_sva", "statut_data", "statut_om")) 


In [143]:
vv = columns_not_to_scaled + columns_to_scaled

In [144]:
set(vv) - set(inputcol)

set()

In [145]:
set(inputcol) - set(vv)

set()

In [None]:
GBTClassifier()

In [165]:
from datetime import datetime


In [218]:
def check_fraudeur_date(path_df, date) :
    from datetime import datetime

    #path_df = "/Users/youssouf/Downloads/simbox_v4_prediction_" + model + "_" + date + ".parquet"
    #"/Users/youssouf/Documents/simboxv2/prediction_gbt_vbm_30_v4_dist.parquet"
    df = spark.read.parquet(path_df)
    df1 = df.filter(col("prediction") == 1)
      
    df_fraudeur_FRA = spark.read.option("header", True) \
                        .option("delimiter", ";") \
                        .csv("/Users/youssouf/Downloads/simbox_fra_2809_0610.csv")
    df_fraudeur_FRA = df_fraudeur_FRA.withColumn("msisdn", concat(lit("22507") , col("MSISDN")))

    d = datetime.strptime(date , '%Y%m%d')
    d = datetime.strftime(d, "%d/%m/%Y")
    
    df_fraudeur = df_fraudeur_FRA.filter(col("Date_DETECTION") == d)
    
    res = {"NOMBRE DE DETECTION" : df.filter(col("prediction") == 1).count() , 
           "NOMBRE DE FRA" : df_fraudeur.count() ,
           "NOMBRE COMMUN" : df1.join(df_fraudeur, ["msisdn"], "inner").count(), 
           "NOMBRE FRAU/NON-FRAU": df.groupBy("prediction").count().show() }

    return res

In [187]:
check_fraudeur_date("20210930", "gbt")

+----------+-------+
|prediction|  count|
+----------+-------+
|       0.0|4274250|
|       1.0|3267340|
+----------+-------+



{'NOMBRE DE DETECTION': 3267340,
 'NOMBRE DE FRA': 334,
 'NOMBRE COMMUN': 253,
 'NOMBRE FRAU/NON-FRAU': None}

In [219]:
check_fraudeur_date("/Users/youssouf/Documents/simboxv2/prediction_dt15_vbm_30_v4_dist.parquet", "20210930")

+----------+-------+
|prediction|  count|
+----------+-------+
|       0.0|4056343|
|       1.0|3485247|
+----------+-------+



{'NOMBRE DE DETECTION': 3485247,
 'NOMBRE DE FRA': 334,
 'NOMBRE COMMUN': 256,
 'NOMBRE FRAU/NON-FRAU': None}

In [220]:
df_dt = spark.read.parquet("/Users/youssouf/Documents/simboxv2/prediction_dt15_vbm_30_v4_dist.parquet")
df_dt.show(5)


+-------+------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+-------------+------------+-----------------+-------------+-------+------------------+-------------+-----------+----------------+------------+------+-----------------+--------------+-------------------+---------------+---------+--------------------+-----------------+----------------+--------------------+-------------+--------------------+----------+
|msisdn1|distributeur|statut_voix|statut_sms|statut_ff|statut_sva|statut_data|statut_om|valeur|vol_tot_data|vol_tot_voix_offnet|vol_tot_voix_onet|date_appel|       msisdn|nbre_sms_out|nbre_dist_sms_out|nbre_voix_out|dur_out|nbre_dist_voix_out|nbre_cell_out|nbre_sms_in|nbre_dist_sms_in|nbre_voix_in|dur_in|nbre_dist_voix_in|nbre_sms_night|nbre_dist_sms_night|nbre_voix_night|dur_night|nbre_dist_voix_night|distributeurIndex|distributeur_vec|            features|rawPrediction|         probab

In [221]:
df_gbt = spark.read.parquet("/Users/youssouf/Documents/simboxv2/prediction_gbt_vbm_30_v4_dist.parquet")
df_gbt.show(5)



+-------+------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+-------------+------------+-----------------+-------------+-------+------------------+-------------+-----------+----------------+------------+------+-----------------+--------------+-------------------+---------------+---------+--------------------+-----------------+----------------+--------------------+--------------------+--------------------+----------+
|msisdn1|distributeur|statut_voix|statut_sms|statut_ff|statut_sva|statut_data|statut_om|valeur|vol_tot_data|vol_tot_voix_offnet|vol_tot_voix_onet|date_appel|       msisdn|nbre_sms_out|nbre_dist_sms_out|nbre_voix_out|dur_out|nbre_dist_voix_out|nbre_cell_out|nbre_sms_in|nbre_dist_sms_in|nbre_voix_in|dur_in|nbre_dist_voix_in|nbre_sms_night|nbre_dist_sms_night|nbre_voix_night|dur_night|nbre_dist_voix_night|distributeurIndex|distributeur_vec|            features|       rawPrediction| 

In [225]:
df_gbt_dt = df_dt.filter(col("prediction") == 1) \
.join(df_gbt.filter(col("prediction") == 1) , ["msisdn"], "inner")



In [226]:
df_gbt_dt.join(df_fr1, ["msisdn"], "inner").count()


247

In [228]:
df_rf = spark.read.parquet("/Users/youssouf/Documents/simboxv2/prediction_rf200_vbm_30_v4.parquet")



In [231]:
df_rf.filter(col("prediction") == 1).join(df_fr1, ["msisdn"], "inner").count()

251

In [236]:
df_rf.filter(col("prediction") == 1).join(df_dt.filter(col("prediction") == 1) , ["msisdn"], "inner").count()


2803063

In [235]:
df_rf.filter(col("prediction") == 1).join(df_gbt.filter(col("prediction") == 1) , ["msisdn"], "inner").count()


2863241

In [239]:
df_all = df_rf.filter(col("prediction") == 1).join(df_gbt_dt , ["msisdn"], "inner")


In [240]:
df_all.join(df_fr1, ["msisdn"], "inner").count()


236

In [241]:
df_all.count()


1925090

In [None]:
DecisionTreeClassifier()

In [200]:
path_df = "/Users/youssouf/Documents/simboxv2/prediction_gbt_vbm_30_v4_dist.parquet"
df = spark.read.parquet(path_df)

In [201]:
df1 = df.filter(col("prediction") == 1)

In [202]:
df1.printSchema()

root
 |-- msisdn1: string (nullable = true)
 |-- distributeur: string (nullable = true)
 |-- statut_voix: double (nullable = true)
 |-- statut_sms: double (nullable = true)
 |-- statut_ff: double (nullable = true)
 |-- statut_sva: double (nullable = true)
 |-- statut_data: double (nullable = true)
 |-- statut_om: double (nullable = true)
 |-- valeur: double (nullable = true)
 |-- vol_tot_data: double (nullable = true)
 |-- vol_tot_voix_offnet: double (nullable = true)
 |-- vol_tot_voix_onet: double (nullable = true)
 |-- date_appel: string (nullable = true)
 |-- msisdn: string (nullable = true)
 |-- nbre_sms_out: double (nullable = true)
 |-- nbre_dist_sms_out: double (nullable = true)
 |-- nbre_voix_out: double (nullable = true)
 |-- dur_out: double (nullable = true)
 |-- nbre_dist_voix_out: double (nullable = true)
 |-- nbre_cell_out: double (nullable = true)
 |-- nbre_sms_in: double (nullable = true)
 |-- nbre_dist_sms_in: double (nullable = true)
 |-- nbre_voix_in: double (nullable

In [203]:
df_fraudeur_FRA = spark.read.option("header", True) \
                    .option("delimiter", ";") \
                    .csv("/Users/youssouf/Downloads/simbox_fra_2809_0610.csv")
df_fraudeur_FRA = df_fraudeur_FRA.withColumn("msisdn", concat(lit("22507") , col("MSISDN")))

df_fraudeur_FRA.show(5)

+--------------+-------------+
|Date_DETECTION|       msisdn|
+--------------+-------------+
|    28/09/2021|2250700004846|
|    28/09/2021|2250700048563|
|    28/09/2021|2250700058736|
|    28/09/2021|2250700058747|
|    28/09/2021|2250700058748|
+--------------+-------------+
only showing top 5 rows



In [204]:
df_fraudeur_FRA.groupBy("Date_DETECTION").count().show()

+--------------+-----+
|Date_DETECTION|count|
+--------------+-----+
|    28/09/2021|  300|
|    06/10/2021|  405|
|    01/10/2021|  334|
|    29/09/2021|  307|
|    02/10/2021|  254|
|    03/10/2021|  323|
|    30/09/2021|  334|
|    04/10/2021|  267|
|    05/10/2021|  348|
+--------------+-----+



In [205]:
d = datetime.strptime("20210928" , '%Y%m%d')
d = datetime.strftime(d, "%d/%m/%Y")
d

'28/09/2021'

In [206]:
df_fr1 = df_fraudeur_FRA.filter(df_fraudeur_FRA.Date_DETECTION == "30/09/2021")

In [207]:
df2 = df1.join(df_fr1 , ["msisdn"], "inner")

In [208]:
df2.select("prediction", "probability").show(df2.count() , truncate = False)

+----------+----------------------------------------+
|prediction|probability                             |
+----------+----------------------------------------+
|1.0       |[0.1017905527733635,0.8982094472266365] |
|1.0       |[0.12908716758382,0.87091283241618]     |
|1.0       |[0.1559831922168539,0.8440168077831461] |
|1.0       |[0.0842721048097634,0.9157278951902366] |
|1.0       |[0.08590329188406619,0.9140967081159338]|
|1.0       |[0.07407294589146789,0.9259270541085322]|
|1.0       |[0.12813782262401474,0.8718621773759853]|
|1.0       |[0.08590329188406619,0.9140967081159338]|
|1.0       |[0.0874069294224149,0.9125930705775851] |
|1.0       |[0.09036800669697691,0.9096319933030231]|
|1.0       |[0.08590329188406619,0.9140967081159338]|
|1.0       |[0.08590329188406619,0.9140967081159338]|
|1.0       |[0.13221415576963777,0.8677858442303622]|
|1.0       |[0.07472934979711836,0.9252706502028817]|
|1.0       |[0.2699455002463848,0.7300544997536151] |
|1.0       |[0.1322141557696

In [209]:
df2.printSchema()

root
 |-- msisdn: string (nullable = true)
 |-- msisdn1: string (nullable = true)
 |-- distributeur: string (nullable = true)
 |-- statut_voix: double (nullable = true)
 |-- statut_sms: double (nullable = true)
 |-- statut_ff: double (nullable = true)
 |-- statut_sva: double (nullable = true)
 |-- statut_data: double (nullable = true)
 |-- statut_om: double (nullable = true)
 |-- valeur: double (nullable = true)
 |-- vol_tot_data: double (nullable = true)
 |-- vol_tot_voix_offnet: double (nullable = true)
 |-- vol_tot_voix_onet: double (nullable = true)
 |-- date_appel: string (nullable = true)
 |-- nbre_sms_out: double (nullable = true)
 |-- nbre_dist_sms_out: double (nullable = true)
 |-- nbre_voix_out: double (nullable = true)
 |-- dur_out: double (nullable = true)
 |-- nbre_dist_voix_out: double (nullable = true)
 |-- nbre_cell_out: double (nullable = true)
 |-- nbre_sms_in: double (nullable = true)
 |-- nbre_dist_sms_in: double (nullable = true)
 |-- nbre_voix_in: double (nullable

In [210]:
df_fraudeur_FRA.printSchema()

root
 |-- Date_DETECTION: string (nullable = true)
 |-- msisdn: string (nullable = true)



In [211]:
df2.printSchema()

root
 |-- msisdn: string (nullable = true)
 |-- msisdn1: string (nullable = true)
 |-- distributeur: string (nullable = true)
 |-- statut_voix: double (nullable = true)
 |-- statut_sms: double (nullable = true)
 |-- statut_ff: double (nullable = true)
 |-- statut_sva: double (nullable = true)
 |-- statut_data: double (nullable = true)
 |-- statut_om: double (nullable = true)
 |-- valeur: double (nullable = true)
 |-- vol_tot_data: double (nullable = true)
 |-- vol_tot_voix_offnet: double (nullable = true)
 |-- vol_tot_voix_onet: double (nullable = true)
 |-- date_appel: string (nullable = true)
 |-- nbre_sms_out: double (nullable = true)
 |-- nbre_dist_sms_out: double (nullable = true)
 |-- nbre_voix_out: double (nullable = true)
 |-- dur_out: double (nullable = true)
 |-- nbre_dist_voix_out: double (nullable = true)
 |-- nbre_cell_out: double (nullable = true)
 |-- nbre_sms_in: double (nullable = true)
 |-- nbre_dist_sms_in: double (nullable = true)
 |-- nbre_voix_in: double (nullable

In [212]:
df2 = df2.withColumn("probability_split", vector_to_array("probability"))
df2 = df2.select([col("probability_split")[i] for i in range(2)] + df2.columns)




In [213]:
df2.select("probability_split[1]" , "probability_split[0]").describe().show()

+-------+--------------------+--------------------+
|summary|probability_split[1]|probability_split[0]|
+-------+--------------------+--------------------+
|  count|                 253|                 253|
|   mean|  0.8776584394428554| 0.12234156055714455|
| stddev|  0.0592901976247947|0.059290197624794684|
|    min|  0.5127735175599702| 0.04518375298367699|
|    max|  0.9548162470163231| 0.48722648244002975|
+-------+--------------------+--------------------+



In [216]:
df2.select("probability_split[1]").groupBy("probability_split[1]").count().show(200)

+--------------------+-----+
|probability_split[1]|count|
+--------------------+-----+
|  0.9196264852360015|    1|
|  0.9249786334191047|    2|
|  0.7300544997536151|    2|
|  0.9252502590431275|    1|
|   0.883026075654677|    1|
|  0.8440168077831461|    2|
|  0.9289514594905071|    1|
|  0.6221561220141434|    1|
|  0.9515699366619306|    1|
|  0.9140967081159338|   50|
|  0.9125930705775851|    2|
|  0.9101510914198443|    1|
|  0.9548162470163231|    1|
|  0.6126605285942781|    1|
|   0.849450366445327|    1|
|  0.9328510702159383|    2|
|  0.9169248514858506|    1|
|  0.9238194766305756|    1|
|  0.8714047338860668|    4|
|  0.8497196032538159|    1|
|  0.7063278984640012|    2|
|  0.7683721760848135|    1|
|  0.9014543005035976|    2|
|   0.916881976945262|    1|
|  0.6997661587512713|    1|
|  0.9184649930522285|    1|
|  0.8062557076457539|    7|
|  0.9108557003080267|    4|
|  0.9041308533940862|    1|
|  0.8645363011277055|    1|
|  0.7137400904380123|    1|
|  0.898209447

In [242]:
df_final = spark.read.\
       option("header", "true").\
       option("delimiter", ";").\
       csv("/Users/youssouf/Downloads/trafic_global_20210930.csv")

df_vbm = spark.read\
    .option("header", True)\
    .option("delimiter",";")\
    .csv("/Users/youssouf/Downloads/fraudeur_simbox_vbm_all.csv")

In [243]:
cols = [i.lower() for i in df_vbm.columns]
df_vbm = df_vbm.toDF(*(c for c in cols))


In [244]:
df_final = df_vbm.join(df_final, ["msisdn"] , "right_outer")


In [245]:
df_final = df_final.na.fill(str(0))
df_final = df_final.fillna(str(0))
df_final = df_final.replace('NULL',str(0))
df_final = df_final.replace('null' ,str(0))

df_final.show()
df_final.printSchema()

+-------------+--------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+------------+-----------------+-------------+-------+------------------+-------------+-----------+----------------+------------+------+-----------------+--------------+-------------------+---------------+---------+--------------------+
|       msisdn|  distributeur|statut_voix|statut_sms|statut_ff|statut_sva|statut_data|statut_om|valeur|vol_tot_data|vol_tot_voix_offnet|vol_tot_voix_onet|date_appel|nbre_sms_out|nbre_dist_sms_out|nbre_voix_out|dur_out|nbre_dist_voix_out|nbre_cell_out|nbre_sms_in|nbre_dist_sms_in|nbre_voix_in|dur_in|nbre_dist_voix_in|nbre_sms_night|nbre_dist_sms_night|nbre_voix_night|dur_night|nbre_dist_voix_night|
+-------------+--------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+------------+-----------------+--

In [250]:
df_final = df_final.withColumn("distributeur", when(col("distributeur") == "0", "SII").otherwise(df_final.distributeur))
#df_final.groupby("DISTRIBUTEUR").count().show()


cols = [i.lower() for i in df_final.columns]
df_final = df_final.toDF(*(c for c in cols))

df_final.show()
df_final.printSchema()


+-------------+--------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+------------+-----------------+-------------+-------+------------------+-------------+-----------+----------------+------------+------+-----------------+--------------+-------------------+---------------+---------+--------------------+
|       msisdn|  distributeur|statut_voix|statut_sms|statut_ff|statut_sva|statut_data|statut_om|valeur|vol_tot_data|vol_tot_voix_offnet|vol_tot_voix_onet|date_appel|nbre_sms_out|nbre_dist_sms_out|nbre_voix_out|dur_out|nbre_dist_voix_out|nbre_cell_out|nbre_sms_in|nbre_dist_sms_in|nbre_voix_in|dur_in|nbre_dist_voix_in|nbre_sms_night|nbre_dist_sms_night|nbre_voix_night|dur_night|nbre_dist_voix_night|
+-------------+--------------+-----------+----------+---------+----------+-----------+---------+------+------------+-------------------+-----------------+----------+------------+-----------------+--

In [251]:
df_final.write.format("parquet").mode("overwrite").save("/Users/youssouf/Documents/simboxv2/df_final_prediction_20210930.parquet")



In [252]:
def From_pyparquet_to_pandascsv(load_path, save_path) :
    df = pq.read_table(load_path).to_pandas()
    df.to_csv(save_path, index = False, sep = ";" )
    return os.path.isfile(save_path)

In [253]:
From_pyparquet_to_pandascsv("/Users/youssouf/Documents/simboxv2/df_final_prediction_20210930.parquet",
                           "/Users/youssouf/Documents/simboxv2/df_final_prediction_20210930.csv")

True