In [2]:
from pyspark.sql import SparkSession

# Start the Spark session used by the rest of the notebook; getOrCreate()
# hands back the active session when one already exists.
spark = (
    SparkSession.builder
    .appName("RealEstateDensityPriceAnalysis")
    .getOrCreate()
)

# Load the main dataset: the first row is the header, column types are
# inferred by Spark and fields are comma-separated.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .csv("dataset_with_neighbor.csv")
)
data.printSchema()

root
 |-- id_mutation: string (nullable = true)
 |-- date_mutation: date (nullable = true)
 |-- numero_disposition: integer (nullable = true)
 |-- nature_mutation: string (nullable = true)
 |-- valeur_fonciere: double (nullable = true)
 |-- adresse_numero: integer (nullable = true)
 |-- adresse_suffixe: string (nullable = true)
 |-- adresse_nom_voie: string (nullable = true)
 |-- adresse_code_voie: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- code_commune: string (nullable = true)
 |-- nom_commune: string (nullable = true)
 |-- code_departement: string (nullable = true)
 |-- ancien_code_commune: integer (nullable = true)
 |-- ancien_nom_commune: string (nullable = true)
 |-- id_parcelle: string (nullable = true)
 |-- ancien_id_parcelle: string (nullable = true)
 |-- numero_volume: string (nullable = true)
 |-- lot1_numero: string (nullable = true)
 |-- lot1_surface_carrez: double (nullable = true)
 |-- lot2_numero: string (nullable = true)
 |-- lot2_surface

In [3]:
# Read the communes CSV. The double quote is used both as the quote character
# and as the escape character, and multiline mode lets a quoted field contain
# line breaks.
df_density = spark.read.options(
    header="true",
    inferSchema="true",
    quote="\"",
    escape="\"",
    multiline="true",
    sep=",",
).csv("communes.csv")

# Print the schema to check the columns.
df_density.printSchema()

root
 |-- id: integer (nullable = true)
 |-- code_commune_INSEE: string (nullable = true)
 |-- nom_commune_postal: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- libelle_acheminement: string (nullable = true)
 |-- ligne_5: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- code_commune: integer (nullable = true)
 |-- article: string (nullable = true)
 |-- nom_commune: string (nullable = true)
 |-- nom_commune_complet: string (nullable = true)
 |-- code_departement: string (nullable = true)
 |-- presentation: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- surface: integer (nullable = true)
 |-- population: integer (nullable = true)



In [4]:
# Preview the first 20 rows without truncating cell contents.
df_density.show(n=20, truncate=False)

+---+------------------+-----------------------+-----------+-----------------------+-------+-----------+----------+------------+-------+---------------------+-----------------------+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
from pyspark.sql.functions import col

# Compute the population density and keep only the commune code next to it.
# density = population / (surface / 100)
# NOTE(review): the division by 100 presumably converts a surface expressed in
# hectares into km² — confirm the unit used in communes.csv.
df_density = df_density.select(
    "code_commune_INSEE",
    (col("population") / (col("surface") / 100)).alias("density"),
)

df_density.show()

+------------------+------------------+
|code_commune_INSEE|           density|
+------------------+------------------+
|              1001| 49.77635782747604|
|              1002|27.856365614798694|
|              1004| 575.7230142566191|
|              1005|109.16458852867831|
|              1006|18.543046357615893|
|              1007| 83.06140611094631|
|              1008|147.10424710424712|
|              1009| 46.50499286733238|
|              1010| 37.38159675236807|
|              1011|24.419376244193764|
|              1012|15.154306771073237|
|              1013|10.177935943060497|
|              1014|141.48711449091675|
|              1015|28.748341441839894|
|              1016|26.225769669327253|
|              1017| 58.43989769820972|
|              1019|3.8737446197991394|
|              1021| 263.9492753623189|
|              1022|376.20481927710847|
|              1023| 16.27408993576017|
+------------------+------------------+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import lpad

# Left-join the population density onto the main DataFrame by commune code.
#
# Both keys are string columns, but the density table prints codes such as
# "1001" (no leading zero). If the main dataset stores the same commune as a
# 5-character code ("01001"), a plain equality would silently leave those rows
# without a density. Zero-padding both sides to 5 characters makes the
# comparison independent of that formatting; codes that are already
# 5 characters long are unchanged.
#
# The lookup is also reduced to one row per commune code so the left join can
# never multiply rows of the main dataset.
# NOTE(review): assumes duplicate codes in communes.csv (if any) carry the same
# density — confirm.
density_lookup = df_density.dropDuplicates(["code_commune_INSEE"])
same_commune = (
    lpad(data["code_commune"], 5, "0")
    == lpad(density_lookup["code_commune_INSEE"], 5, "0")
)

# The two counts must be equal: a left join against a unique key keeps exactly
# one output row per input row.
print("Nombre de lignes avant jointure : ", data.count())
data_wd = data.join(density_lookup, same_commune, how="left")
print("Nombre de lignes après jointure : ", data_wd.count())

Nombre de lignes avant jointure :  206946
Nombre de lignes après jointure :  206946


In [7]:
# Print the schema of the joined DataFrame to check its columns.
data_wd.printSchema()

root
 |-- id_mutation: string (nullable = true)
 |-- date_mutation: date (nullable = true)
 |-- numero_disposition: integer (nullable = true)
 |-- nature_mutation: string (nullable = true)
 |-- valeur_fonciere: double (nullable = true)
 |-- adresse_numero: integer (nullable = true)
 |-- adresse_suffixe: string (nullable = true)
 |-- adresse_nom_voie: string (nullable = true)
 |-- adresse_code_voie: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- code_commune: string (nullable = true)
 |-- nom_commune: string (nullable = true)
 |-- code_departement: string (nullable = true)
 |-- ancien_code_commune: integer (nullable = true)
 |-- ancien_nom_commune: string (nullable = true)
 |-- id_parcelle: string (nullable = true)
 |-- ancien_id_parcelle: string (nullable = true)
 |-- numero_volume: string (nullable = true)
 |-- lot1_numero: string (nullable = true)
 |-- lot1_surface_carrez: double (nullable = true)
 |-- lot2_numero: string (nullable = true)
 |-- lot2_surface

In [8]:
# Preview the first 20 joined rows without truncating cell contents.
data_wd.show(n=20, truncate=False)

+-----------+-------------+------------------+---------------+---------------+--------------+---------------+--------------------------+-----------------+-----------+------------+-----------------------+----------------+-------------------+------------------+--------------+------------------+-------------+-----------+-------------------+-----------+-------------------+-----------+-------------------+-----------+-------------------+-----------+-------------------+-----------+---------------+-----------+-------------------+-------------------------+-------------------+--------------+----------------------------+-----------------------+---------------+---------+---------+--------------+-------------------------------+------------------+-------------------+------------------+------------------+---------------------+------------------+------------------+
|id_mutation|date_mutation|numero_disposition|nature_mutation|valeur_fonciere|adresse_numero|adresse_suffixe|adresse_nom_voie          |ad

In [9]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
import numpy as np


# Custom error metric, applied row by row through a Spark UDF.
def avg_error(y_test, y_pred):
    """Return the absolute percentage error of ``y_pred`` relative to ``y_test``.

    The value is ``abs((y_test - y_pred) / y_test) * 100``. Despite the name,
    this is the error of a single pair of values, not an average.

    Args:
        y_test: Reference value (the denominator of the ratio), or None.
        y_pred: Value compared against the reference, or None.

    Returns:
        The error as a Python float, or None when either input is None or when
        ``y_test`` is 0 (the ratio is undefined). Returning None instead of
        raising keeps one bad row from aborting the whole Spark job; the
        corresponding cell becomes null.
    """
    if y_test is None or y_pred is None or y_test == 0:
        return None
    # Convert to a plain float so Spark can serialise the result as a double.
    return float(abs((y_test - y_pred) / y_test)) * 100

# Wrap avg_error as a Spark UDF whose result type is double.
custom_pred_udf = udf(avg_error, returnType=DoubleType())

# Add an "error" column: prix_m2 is passed as the reference value (y_test)
# and avg_neighbor_price as the value compared against it (y_pred).
error_pct = custom_pred_udf(col("prix_m2"), col("avg_neighbor_price"))
data_with_error = data_wd.withColumn("error", error_pct)

# Display the first rows of the result.
data_with_error.show()

+-----------+-------------+------------------+---------------+---------------+--------------+---------------+--------------------+-----------------+-----------+------------+--------------------+----------------+-------------------+------------------+--------------+------------------+-------------+-----------+-------------------+-----------+-------------------+-----------+-------------------+-----------+-------------------+-----------+-------------------+-----------+---------------+-----------+-------------------+-------------------------+-------------------+--------------+----------------------------+-----------------------+---------------+---------+---------+--------------+-------------------------------+------------------+-------------------+------------------+------------------+--------------------+------------------+------------------+------------------+
|id_mutation|date_mutation|numero_disposition|nature_mutation|valeur_fonciere|adresse_numero|adresse_suffixe|    adresse_nom_voie

In [10]:
# Export the enriched dataset as CSV with a header row, replacing any previous
# export at the same path.
# "inferSchema" only applies when reading a CSV and has no effect on a writer,
# so it is not set here.
# Note: Spark writes a directory named "dataset_with_density.csv" containing
# one part file per partition, not a single CSV file.
(
    data_with_error.write
    .mode("overwrite")
    .option("header", "true")
    .csv("dataset_with_density.csv")
)

In [11]:
# Stop the Spark session; a new session must be created before running any
# Spark cell again.
spark.stop()