In [66]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, ClusteringEvaluator

from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import when, col, mean

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline


spark.sparkContext.setLogLevel("WARN")
spark = SparkSession.builder \
    .appName("Analysis") \
    .getOrCreate()

### 1 Ирисы Фишера

In [29]:
df = spark.read.csv("/opt/data/iris/iris.csv", header=True, inferSchema=True)
df.show(5)
print("Schema:")
df.printSchema()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows

Schema:
root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [30]:
indexer = StringIndexer(inputCol="Species", outputCol="label")
df_indexed = indexer.fit(df).transform(df)

print("Data after StringIndexer:")
df_indexed.select("Species", "label").distinct().show()

Data after StringIndexer:
+---------------+-----+
|        Species|label|
+---------------+-----+
|    Iris-setosa|  0.0|
| Iris-virginica|  2.0|
|Iris-versicolor|  1.0|
+---------------+-----+



In [31]:
assembler = VectorAssembler(
    inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    outputCol="features"
)
data = assembler.transform(df_indexed)

# Разделение данных
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)
print(f"Train count: {train_data.count()}, Test count: {test_data.count()}")

Train count: 104, Test count: 46


In [32]:
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
print(f"Logistic Regression - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

predictions.select("Species", "label", "prediction").show(10)

Logistic Regression - Accuracy: 0.9348, F1: 0.9335
+-----------+-----+----------+
|    Species|label|prediction|
+-----------+-----+----------+
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
|Iris-setosa|  0.0|       0.0|
+-----------+-----+----------+
only showing top 10 rows



In [33]:
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)

dt_accuracy = evaluator.evaluate(dt_predictions, {evaluator.metricName: "accuracy"})
dt_f1 = evaluator.evaluate(dt_predictions, {evaluator.metricName: "f1"})
print(f"Decision Tree - Accuracy: {dt_accuracy:.4f}, F1: {dt_f1:.4f}")

Decision Tree - Accuracy: 0.9348, F1: 0.9335


In [34]:
kmeans_data = assembler.transform(df)

kmeans = KMeans(featuresCol="features", k=3, seed=42)
kmeans_model = kmeans.fit(kmeans_data)
kmeans_results = kmeans_model.transform(kmeans_data)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(kmeans_results)
print(f"K-means Silhouette score: {silhouette:.4f}")

K-means Silhouette score: 0.7342


#### 2 Public bike use data 2014-2024 (MiBici)

In [72]:
df = spark.read.csv("/opt/data/bike", header=True, inferSchema=True)
df.show(5)
print("Schema:")
df.printSchema()

+---+--------------------+-------+----------------+----------+-----------+----------+
| id|                name|   obcn|        location|  latitude|  longitude|    status|
+---+--------------------+-------+----------------+----------+-----------+----------+
|  2|(GDL-001) C. Epig...|GDL-001|POLÍGONO CENTRAL| 20.666378| -103.34882|IN_SERVICE|
|  3|(GDL-002) C. Colo...|GDL-002|POLÍGONO CENTRAL| 20.667228|   -103.366|IN_SERVICE|
|  4|(GDL-003) C. Vidr...|GDL-003|POLÍGONO CENTRAL|  20.66769|-103.368252|IN_SERVICE|
|  5|(GDL-004) C. Ghil...|GDL-004|POLÍGONO CENTRAL|  20.69175| -103.36255|IN_SERVICE|
|  6|(GDL-005) C. San ...|GDL-005|POLÍGONO CENTRAL|20.6811575|-103.339363|IN_SERVICE|
+---+--------------------+-------+----------------+----------+-----------+----------+
only showing top 5 rows

Schema:
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- obcn: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 

In [74]:
print("Пропущенные значения:")
for column in df.columns:
    null_count = df.filter(col(column).isNull()).count()
    print(f"{column}: {null_count}")

Пропущенные значения:
id: 0
name: 0
obcn: 0
location: 0
latitude: 0
longitude: 0
status: 0


In [77]:
assembler = VectorAssembler(
    inputCols=["latitude", "longitude"],
    outputCol="features"
)
cluster_data = assembler.transform(df)

kmeans = KMeans(featuresCol="features", k=5, seed=42)
kmeans_model = kmeans.fit(cluster_data)
kmeans_results = kmeans_model.transform(cluster_data)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(kmeans_results)

print("Географическая кластеризация станций:")
print(f"Silhouette Score: {silhouette:.4f}")

# Анализ кластеров
print("\nХарактеристики географических кластеров:")
kmeans_results.groupBy("prediction").agg(
    mean("latitude").alias("avg_latitude"),
    mean("longitude").alias("avg_longitude"),
    count("*").alias("station_count")
).orderBy("prediction").show()

Географическая кластеризация станций:
Silhouette Score: 0.6428

Характеристики географических кластеров:
+----------+------------------+-------------------+-------------+
|prediction|      avg_latitude|      avg_longitude|station_count|
+----------+------------------+-------------------+-------------+
|         0| 20.66331470894737|-103.40242464210525|           57|
|         1|20.678589033043483|-103.36811395478264|          115|
|         2|20.649100779999998|-103.31597733809522|           42|
|         3|20.726770622857153|-103.38809392380952|           42|
|         4|  20.6826440437931|-103.34422569310341|          116|
+----------+------------------+-------------------+-------------+



In [78]:
indexer_status = StringIndexer(inputCol="status", outputCol="label")
indexer_location = StringIndexer(inputCol="location", outputCol="location_index")

assembler_class = VectorAssembler(
    inputCols=["latitude", "longitude", "location_index"],
    outputCol="features"
)

rf = RandomForestClassifier(featuresCol="features", labelCol="label", 
                           numTrees=50, seed=42)

pipeline = Pipeline(stages=[indexer_status, indexer_location, assembler_class, rf])

train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

model = pipeline.fit(train_data)
predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                             predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print("Классификация статуса станции:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Классификация статуса станции:
Accuracy: 0.9462
F1 Score: 0.9201


In [82]:
df_with_distance = df.withColumn(
    "distance_from_center",
    sqrt(pow(col("latitude") - lit(20.666378), 2) + 
         pow(col("longitude") - lit(-103.34882), 2))
)

assembler_gmm = VectorAssembler(
    inputCols=["latitude", "longitude", "distance_from_center"],
    outputCol="raw_features"
)
df_gmm = assembler_gmm.transform(df_with_distance)

scaler = StandardScaler(inputCol="raw_features", outputCol="features", 
                       withStd=True, withMean=True)
scaler_model = scaler.fit(df_gmm)
df_scaled = scaler_model.transform(df_gmm)

gmm = GaussianMixture(featuresCol="features", k=3, seed=42)
gmm_model = gmm.fit(df_scaled)
gmm_results = gmm_model.transform(df_scaled)

gmm_summary = gmm_model.summary
print("GMM кластеризация для анализа плотности:")
print(f"Log-Likelihood: {gmm_summary.logLikelihood:.4f}")

print("\nПлотность станций по кластерам:")
gmm_results.groupBy("prediction").agg(
    count("*").alias("station_count"),
    mean("distance_from_center").alias("avg_distance_from_center"),
    mean("latitude").alias("avg_latitude"),
    mean("longitude").alias("avg_longitude")
).orderBy("prediction").show()

print("Распределение локаций по кластерам GMM:")
gmm_results.groupBy("prediction", "location").count() \
    .orderBy("prediction", "count", ascending=[True, False]) \
    .show()

GMM кластеризация для анализа плотности:
Log-Likelihood: -705.7909

Плотность станций по кластерам:
+----------+-------------+------------------------+------------------+-------------------+
|prediction|station_count|avg_distance_from_center|      avg_latitude|      avg_longitude|
+----------+-------------+------------------------+------------------+-------------------+
|         0|          144|    0.022689990010803622|20.685812681250002|-103.35147230763887|
|         1|           95|     0.05068054741645819|20.685846795789477|-103.34959180105265|
|         2|          133|     0.03525897213494478| 20.66847754518796|-103.38307706315791|
+----------+-------------+------------------------+------------------+-------------------+

Распределение локаций по кластерам GMM:
+----------+-----------------+-----+
|prediction|         location|count|
+----------+-----------------+-----+
|         0| POLÍGONO CENTRAL|  137|
|         0|TLQ-CORREDORATLAS|    4|
|         0|   ZAPOPAN CENTRO|    3|


#### 3 All Upwork Job Postings - Monthly Tracker

In [35]:
df = spark.read.csv("/opt/data/work", header=True, inferSchema=True)
df.show(5)
print("Schema:")
df.printSchema()

+--------------------+--------------------+--------------------+---------+----------+-----------+------+-------------+
|               title|                link|      published_date|is_hourly|hourly_low|hourly_high|budget|      country|
+--------------------+--------------------+--------------------+---------+----------+-----------+------+-------------+
|Experienced Media...|https://www.upwor...|2024-02-17 09:09:...|    False|      NULL|       NULL| 500.0|         NULL|
|Full Stack Developer|https://www.upwor...|2024-02-17 09:09:...|    False|      NULL|       NULL|1100.0|United States|
|     SMMA Bubble App|https://www.upwor...|2024-02-17 09:08:...|     True|      10.0|       30.0|  NULL|United States|
|Talent Hunter Spe...|https://www.upwor...|2024-02-17 09:08:...|     True|      NULL|       NULL|  NULL|United States|
|       Data Engineer|https://www.upwor...|2024-02-17 09:07:...|    False|      NULL|       NULL| 650.0|        India|
+--------------------+--------------------+-----

In [None]:
print("Пропущенные значения:")
for column in df.columns:
    null_count = df.filter(col(column).isNull()).count()
    print(f"{column}: {null_count}")

In [None]:
df_cleaned = df.dropna()

In [None]:
assembler = VectorAssembler(
    inputCols=["budget_clean", "hourly_rate", "payment_type"],
    outputCol="features",
    handleInvalid="keep"
)
cluster_data = assembler.transform(df_cleaned)

kmeans = KMeans(featuresCol="features", k=3, seed=42)
kmeans_model = kmeans.fit(cluster_data)
kmeans_results = kmeans_model.transform(cluster_data)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(kmeans_results)

print("Кластеризация вакансий по бюджету и типу оплаты:")
print(f"Silhouette Score: {silhouette:.4f}")

                                                                                

Кластеризация вакансий по бюджету и типу оплаты:
Silhouette Score: 0.9998
+----------+------------------+------------------+-------------------+------+
|prediction|        avg_budget|   avg_hourly_rate|       hourly_ratio| count|
+----------+------------------+------------------+-------------------+------+
|         0| 269.7749814074697|3.5802810344097855|0.17042124929711167|826945|
|         1| 993903.8072289156|               0.0|                0.0|    83|
|         2|410939.28571428574|               0.0|                0.0|   112|
+----------+------------------+------------------+-------------------+------+



In [None]:
df_class = df.withColumn("is_hourly_bool", 
                        when(col("is_hourly") == "True", 1).otherwise(0)) \
             .filter(col("title").isNotNull() & 
                    col("is_hourly_bool").isNotNull())

tokenizer = Tokenizer(inputCol="title", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=1000)
idf = IDF(inputCol="raw_features", outputCol="features")

lr = LogisticRegression(featuresCol="features", labelCol="is_hourly_bool")

pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, lr])

train_data, test_data = df_class.randomSplit([0.7, 0.3], seed=42)

model = pipeline.fit(train_data)
predictions = model.transform(test_data)

evaluator = BinaryClassificationEvaluator(labelCol="is_hourly_bool")
auc = evaluator.evaluate(predictions)

evaluator_f1 = BinaryClassificationEvaluator(labelCol="is_hourly_bool", metricName="areaUnderPR")
auc_pr = evaluator_f1.evaluate(predictions)

print("Классификация типа оплаты по названию:")
print(f"AUC: {auc:.4f}")
print(f"Area Under PR: {auc_pr:.4f}")

                                                                                

Классификация типа оплаты по названию:
AUC: 0.5784
Area Under PR: 0.2186
Accuracy: 0.8304


                                                                                

In [None]:
skills_keywords = ["developer", "engineer", "designer", "writer", "marketing", 
                   "data", "analysis", "python", "java", "javascript", "react",
                   "sql", "aws", "azure", "web", "mobile", "app", "software"]

df_skills = df.withColumn("clean_title", 
                         lower(regexp_replace(col("title"), "[^a-zA-Z\\s]", ""))) \
              .filter(col("clean_title").isNotNull())

for skill in skills_keywords:
    df_skills = df_skills.withColumn(f"has_{skill}", 
                                    when(col("clean_title").contains(skill), 1).otherwise(0))

feature_cols = [f"has_{skill}" for skill in skills_keywords]
assembler = VectorAssembler(
    inputCols=feature_cols, 
    outputCol="skills_features",
    handleInvalid="keep"
)

df_country = df_skills.filter(col("country").isNotNull() & 
                             (col("country") != "NULL") &
                             (col("country") != ""))

indexer = StringIndexer(inputCol="country", outputCol="country_label")
df_indexed = indexer.fit(df_country).transform(df_country)

skills_data = assembler.transform(df_indexed)

train_data, test_data = skills_data.randomSplit([0.7, 0.3], seed=42)

nb = NaiveBayes(featuresCol="skills_features", labelCol="country_label", smoothing=1.0)
nb_model = nb.fit(train_data)
nb_predictions = nb_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="country_label", predictionCol="prediction")
accuracy = evaluator.evaluate(nb_predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(nb_predictions, {evaluator.metricName: "f1"})

print("Предсказание страны по навыкам:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")



Предсказание страны по навыкам:
Accuracy: 0.4213
F1 Score: 0.2497


                                                                                

#### 4 European Soccer Database

#### 5 Used Cars price dataset

In [51]:
df = spark.read.csv("/opt/data/cars", header=True, inferSchema=True)
df.show(5)
print("Schema:")
df.printSchema()

+---------+--------+----+------+-------+---------+---------+--------------------+---------+------+---------+----------------+
|     Make|   Model|Year| Price|Mileage|Body Type|Cylinders|        Transmission|Fuel Type| Color| Location|       Condition|
+---------+--------+----+------+-------+---------+---------+--------------------+---------+------+---------+----------------+
|      kia| sorento|2013| 61250| 169543|      SUV|        4|Automatic Transmi...| Gasoline|  Grey|Abu Dhabi|Accident history|
| cadillac|     srx|2024| 30047|  51876|      SUV|        6|Automatic Transmi...| Gasoline|  Gold|Abu Dhabi| Engine repaired|
|    dodge| charger|2022| 31252| 276604|    Sedan|        8|Automatic Transmi...| Gasoline|  Blue|Abu Dhabi| Minor scratches|
|  porsche| cayenne|2019| 74335| 278497|      SUV|        6|Automatic Transmi...| Gasoline|Silver|Abu Dhabi|Repainted bumper|
|chevrolet|corvette|2018|448226| 140105|    Coupe|        8|Automatic Transmi...| Gasoline|   Red|Abu Dhabi| Engine re

In [56]:
print("Пропущенные значения до обработки:")
for column in df.columns:
    null_count = df.filter(col(column).isNull()).count()
    print(f"{column}: {null_count}")

Пропущенные значения до обработки:
Make: 0
Model: 0
Year: 0
Price: 0
Mileage: 0
Body Type: 0
Cylinders: 80
Transmission: 0
Fuel Type: 0
Color: 0
Location: 0
Condition: 0


In [55]:
df_cleaned = df.dropna()

In [59]:
assembler = VectorAssembler(
    inputCols=["Price", "Mileage", "Year"],
    outputCol="raw_features"
)
df_cluster = assembler.transform(df_cleaned)

scaler = StandardScaler(inputCol="raw_features", outputCol="features", 
                       withStd=True, withMean=True)
scaler_model = scaler.fit(df_cluster)
df_scaled = scaler_model.transform(df_cluster)

kmeans = KMeans(featuresCol="features", k=4, seed=42)
kmeans_model = kmeans.fit(df_scaled)
kmeans_results = kmeans_model.transform(df_scaled)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(kmeans_results)

print("Кластеризация автомобилей:")
print(f"Silhouette Score: {silhouette:.4f}")

Кластеризация автомобилей:
Silhouette Score: 0.4850


In [104]:
body_type_counts = df_cleaned.groupBy("Body Type").count().orderBy("count", ascending=False)
main_body_types = [row["Body Type"] for row in body_type_counts.limit(5).collect()]
df_filtered_body = df_cleaned.filter(col("Body Type").isin(main_body_types))

print(f"Данные после фильтрации типов кузова: {df_filtered_body.count()} строк")

indexer_body = StringIndexer(inputCol="Body Type", outputCol="label")
indexer_fuel = StringIndexer(inputCol="Fuel Type", outputCol="fuel_index")

assembler_class = VectorAssembler(
    inputCols=["Year", "Price", "Mileage", "fuel_index"],
    outputCol="features"
)

rf = RandomForestClassifier(featuresCol="features", labelCol="label", 
                           numTrees=50, maxBins=100, seed=42)

pipeline = Pipeline(stages=[indexer_body, indexer_fuel, assembler_class, rf])

train_data, test_data = df_filtered_body.randomSplit([0.7, 0.3], seed=42)

model = pipeline.fit(train_data)
predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                             predictionCol="prediction")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print("Классификация типа кузова:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

print("Распределение предсказаний:")
predictions.groupBy("Body Type", "prediction").count() \
    .orderBy("Body Type", "prediction") \
    .show()

Данные после фильтрации типов кузова: 8949 строк
Классификация типа кузова:
Accuracy: 0.5225
F1 Score: 0.4415
Распределение предсказаний:
+-------------+----------+-----+
|    Body Type|prediction|count|
+-------------+----------+-----+
|        Coupe|       0.0|  229|
|        Coupe|       1.0|   16|
|    Hatchback|       0.0|   71|
|    Hatchback|       1.0|   30|
|Pick Up Truck|       0.0|  132|
|Pick Up Truck|       1.0|    8|
|Pick Up Truck|       3.0|   12|
|          SUV|       0.0| 1137|
|          SUV|       1.0|  155|
|          SUV|       3.0|    2|
|        Sedan|       0.0|  595|
|        Sedan|       1.0|  208|
|        Sedan|       3.0|    2|
+-------------+----------+-----+



In [106]:
indexer_condition = StringIndexer(inputCol="Condition", outputCol="condition_index")

assembler_condition = VectorAssembler(
    inputCols=["Year", "Mileage", "condition_index"],
    outputCol="features_condition"
)

lr = LinearRegression(featuresCol="features_condition", labelCol="Price")

pipeline_condition = Pipeline(stages=[indexer_condition, assembler_condition, lr])

model_condition = pipeline_condition.fit(df_cleaned)
predictions_condition = model_condition.transform(df_cleaned)

evaluator_condition_rmse = RegressionEvaluator(labelCol="Price", 
                                              predictionCol="prediction", 
                                              metricName="rmse")
evaluator_condition_r2 = RegressionEvaluator(labelCol="Price", 
                                            predictionCol="prediction", 
                                            metricName="r2")

rmse_condition = evaluator_condition_rmse.evaluate(predictions_condition)
r2_condition = evaluator_condition_r2.evaluate(predictions_condition)

print("Анализ влияния состояния на цену:")
print(f"RMSE: {rmse_condition:.2f}")
print(f"R² Score: {r2_condition:.4f}")

print("\nСредняя цена по состояниям:")
df_cleaned.groupBy("Condition").agg(
    mean("Price").alias("avg_price"),
    count("*").alias("count")
).orderBy("avg_price", ascending=False).show()

25/09/29 21:05:31 WARN Instrumentation: [418ef15a] regParam is zero, which might cause numerical instability and overfitting.


Анализ влияния состояния на цену:
RMSE: 472678.14
R² Score: 0.0004

Средняя цена по состояниям:
+----------------+------------------+-----+
|       Condition|         avg_price|count|
+----------------+------------------+-----+
|Accident history| 262379.8944815039| 1649|
|       No damage| 253654.9388379205| 1635|
| Minor scratches|248859.99128811448| 1607|
|     Dented door|248809.92188431724| 1677|
|Repainted bumper| 233776.2171565687| 1667|
| Engine repaired|226921.49554896142| 1685|
+----------------+------------------+-----+

