In [1]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import logging



spark = SparkSession.builder.master("local[1]")\
          .appName("sunshine_v2")\
          .getOrCreate()


print("******************************************* LOADING DATAFRAME ********************************************")

df = spark.read.\
       option("header","true").\
       option("delimiter",";").\
       csv("/Users/youssouf/Documents/simboxv2/final_data_for_ml_new_model.csv")

df = df.withColumnRenamed("statut_simbox_x","statut_simbox")
print("******************************************* REPLACE NULL VALUES, PRINTSCHEAMA AND SHOW ********************************************")

df = df.replace('NULL',str(0))
df = df.replace('null' ,str(0))
df = df.na.fill(str(0))
df = df.fillna(str(0))

#df = df.drop(["_c0","ech"])

print("******************************************* COLUMN TRANSFORMATION ********************************************")
liste_columns = ['MSISDN0', 'DISTRIBUTEUR', 'STATUT_VOIX', 'STATUT_SMS', 'STATUT_FF',
       'STATUT_SVA', 'STATUT_DATA', 'STATUT_OM', 'VALEUR', 'VOL_TOT_DATA',
       'VOL_TOT_VOIX_OFFNET', 'VOL_TOT_VOIX_ONET', 'statut_simbox', 'msisdn13',
       'dur_in', 'nbre_tot_call_in', 'nbre_dist_call_in', 'nbr_IN_SMS',
       'nbr_IN_VOIX', 'dur_inter_out', 'nbre_call_inter_out',
       'nbre_dist_call_inter_out', 'nbr_INTER_VOIX', 'dur_night_out',
       'nbre_call_night_out', 'nbr_NIGHT_SMS', 'nbr_NIGHT_VOIX', 'dur_out',
       'nbre_tot_call_out', 'nbre_dist_call_out', 'dist_imei', 'nbr_OUT_SMS',
       'nbr_OUT_VOIX', 'statut_simbox_y']

columns_to_non_cast = ['MSISDN0', 'DISTRIBUTEUR','msisdn13']

columns_to_cast = list(set(liste_columns) - set(columns_to_non_cast))

for cl in columns_to_cast:
    df = df.withColumn(cl, col(cl).cast(DoubleType()))

print("******************************************* VECTORISATION ********************************************")

inputcol = df.columns
inputcol.remove("MSISDN0")
inputcol.remove("statut_simbox")
inputcol.remove("msisdn13")
inputcol.remove("statut_simbox_y")
inputcol.remove("DISTRIBUTEUR")


string_indexer = StringIndexer(inputCol = "DISTRIBUTEUR", outputCol = "DISTRIBUTEURIndex")
onehotencoder = OneHotEncoder(inputCol="DISTRIBUTEURIndex", outputCol="DISTRIBUTEUR_vec")
vector_assembler = VectorAssembler(inputCols=inputcol + ["DISTRIBUTEURIndex" ,"DISTRIBUTEUR_vec"] , outputCol="features").setHandleInvalid("keep")

print("******************************************* SPLIT DATA ********************************************")

train, test = df.randomSplit([0.8, 0.2], seed=12345)

print("******************************************* UNDERSAMPLING ********************************************")

def resample(base_features, ratio, class_field, base_class):
    pos = base_features.filter(col(class_field) == base_class)
    neg = base_features.filter(col(class_field) != base_class)
    total_pos = pos.count()
    total_neg = neg.count()
    fraction = float(total_pos * ratio) / float(total_neg)
    sampled = neg.sample(False, fraction)
    return sampled.union(pos)

train_under = resample(train, 1.5 , "statut_simbox", 1)

print("******************************************* TRAIN COUNT ********************************************")

print("******************************************* MODELS ********************************************")
def compute_model(method, label, train_data, test_data, name_model):
        skill = method

        train, test = train_data, test_data

        pipeline = Pipeline(stages=[string_indexer, onehotencoder, vector_assembler, skill])

        evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                                   labelCol=label)

        paramGrid = ParamGridBuilder.addGrid(self=ParamGridBuilder ,param = skill.numTrees,values =[200, 300 , 400, 500]).build()

        numFolds = 5

        crossval = CrossValidator(
             estimator=pipeline,
             estimatorParamMaps=paramGrid,
             evaluator=evaluator,
             numFolds=numFolds)

        cvModel = crossval.fit(train)

        cvModel.write().overwrite().save("/Users/youssouf/Documents/simboxv2/" + name_model)

        trainingSummary = cvModel.bestModel.summary

        predictions = cvModel.transform(test)

        results = predictions.select(['prediction', label])

        predictionAndLabels = results.rdd

        metrics = MulticlassMetrics(predictionAndLabels)

        cm = metrics.confusionMatrix().toArray()

        accuracy = (cm[0][0] + cm[1][1]) / cm.sum()

        precision = (cm[0][0]) / (cm[0][0] + cm[1][0])

        recall = (cm[0][0]) / (cm[0][0] + cm[0][1])

        f1score = 2*((precision*recall)/(precision+recall))

        AUC = evaluator.evaluate(predictions)

        result = {"Accuracy": accuracy, "precision": precision, "recall": recall, "AUC": AUC, "F1score": f1score}

        last_result = {"Metrics" : result , "Matrice de confusion" : cm, "trainingSummary" : trainingSummary }

        return last_result

print("******************************************* COMPUTE MODELS ********************************************")

print("******************************************* RANDOM FOREST ")

rf = RandomForestClassifier(labelCol="statut_simbox", featuresCol="features", numTrees=100, impurity='gini')
print("Random Forest", compute_model(rf, "statut_simbox", train_under , test, "rfmodel_model3_under_var_dexter_cv"))


******************************************* LOADING DATAFRAME ********************************************
******************************************* REPLACE NULL VALUES, PRINTSCHEAMA AND SHOW ********************************************
******************************************* COLUMN TRANSFORMATION ********************************************
******************************************* VECTORISATION ********************************************
******************************************* SPLIT DATA ********************************************
******************************************* UNDERSAMPLING ********************************************
******************************************* TRAIN COUNT ********************************************
******************************************* MODELS ********************************************
******************************************* COMPUTE MODELS ********************************************
*******************************************

AttributeError: type object 'ParamGridBuilder' has no attribute '_param_grid'