In [34]:
import os
import time
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, DoubleType

In [35]:
# Crear carpeta temporal Spark
temp_path = os.path.join(os.getcwd(), 'spark-temp')
os.makedirs(temp_path, exist_ok=True)

# Definir variables de entorno
os.environ['JAVA_HOME'] = os.environ['CONDA_PREFIX'] + '\Library'
os.environ['SPARK_LOCAL_DIRS'] = temp_path

print('JAVA_HOME:', os.environ.get('JAVA_HOME'))
print('SPARK_LOCAL_DIRS:', os.environ.get('SPARK_LOCAL_DIRS'))

JAVA_HOME: C:\Users\TESTER\anaconda3\envs\btc_portfolio\Library
SPARK_LOCAL_DIRS: C:\Users\TESTER\Desktop\Laboral\GIT\btc-3-asset-portfolio-extension\notebooks\spark-temp


In [36]:
# Crear Spark Session y medir tiempo
start_time = time.time()

spark = SparkSession.builder \
.appName('btcproject') \
.config('spark.driver.memory', '8g') \
.config('spark.executor.memory', '8g') \
.config('spark.local.dir', temp_path) \
.getOrCreate()

spark.sparkContext.setLogLevel('ERROR')

end_time = time.time()
print('Spark Version:', spark.version)
print(f'Tiempo total en crear SparkSession: {round(end_time - start_time, 2)} segundos')

Spark Version: 3.5.4
Tiempo total en crear SparkSession: 0.01 segundos


# Data Cleaning
## Btc Dataset

In [37]:
df_btc = spark.read \
.option("header", True) \
.option("sep", ",") \
.option("inferSchema", True) \
.csv('../data/Bitcoin Historical Data.csv')

In [38]:
df_btc.show(20)
df_btc.printSchema()

+----------+---------+---------+---------+--------+-----+--------+
|      Date|    Price|     Open|     High|     Low| Vol.|Change %|
+----------+---------+---------+---------+--------+-----+--------+
|04/01/2025| 84,931.6| 82,548.6| 88,377.3|74,524.2|1.29M|   2.89%|
|03/01/2025| 82,548.8| 84,353.4| 94,986.5|76,677.1|2.37M|  -2.17%|
|02/01/2025| 84,381.2|102,421.3|102,770.8|78,329.8|2.40M| -17.62%|
|01/01/2025|102,424.2| 93,557.3|109,228.6|89,664.8|2.72M|   9.48%|
|12/01/2024| 93,557.2| 96,404.7|108,244.9|91,522.3|4.41M|  -2.95%|
|11/01/2024| 96,405.7| 70,278.7| 99,617.4|66,834.0|4.16M|  37.17%|
|10/01/2024| 70,281.8| 63,329.9| 73,569.4|59,075.7|2.56M|  10.96%|
|09/01/2024| 63,339.2| 58,975.7| 66,440.7|52,644.6|2.34M|   7.39%|
|08/01/2024| 58,978.6| 64,625.7| 65,587.9|49,486.9|2.56M|  -8.74%|
|07/01/2024| 64,626.0| 62,768.8| 70,000.2|53,883.4|2.06M|   2.98%|
|06/01/2024| 62,754.3| 67,533.9| 71,956.5|58,589.9|1.60M|  -7.07%|
|05/01/2024| 67,530.1| 60,665.0| 71,872.0|56,643.5|2.10M|  11.

## About the Bitcoin Dataset

This dataset contains **monthly data** for Bitcoin from **August 1, 2010** to **April 1, 2025**.

> ⚠️ **Note**: The data for **April 2025** is not final, as this project was created during the same month. Interpret that row with caution.

---

### Key Information:

- Each row represents **a full calendar month**.
- The `Date` column uses the **first day of the month** as a label (e.g. `2025-04-01` refers to data from April 2025).
- The values shown reflect **the entire month**, not just that specific day.

---

### Column Descriptions:

| Column   | Description                                                |
|----------|------------------------------------------------------------|
| `Date`   | First day of the month (used as time label)                |
| `Open`   | Price at the start of the month                            |
| `Price`  | Closing price at the end of the month                      |
| `High`   | Maximum price reached during the month                     |
| `Low`    | Minimum price reached during the month                     |
| `Change` | % change between the opening and closing price of the month|

> ℹ️ Later on, I rename the column `Price` to `Close` to make it clearer in context.


In [39]:
df_btc = df_btc.withColumn("Date", F.to_date("Date", "MM/dd/yyyy"))

df_btc = df_btc.select(#F.date_format("Date", "dd-MM-yyyy").alias("Date"), Cambia a String y no quiero
              F.col("Date"),
              F.col("Price").alias("Close"),
              F.col("Open"),
              F.col("High"),
              F.col("Low"),
              F.col("Change %").alias("Change")
             )
df_btc.show()

df_btc.printSchema()

df_btc.select(
    F.min("Date").alias("Fecha mínima"),
    F.max("Date").alias("Fecha máxima")
).show()


+----------+---------+---------+---------+--------+-------+
|      Date|    Close|     Open|     High|     Low| Change|
+----------+---------+---------+---------+--------+-------+
|2025-04-01| 84,931.6| 82,548.6| 88,377.3|74,524.2|  2.89%|
|2025-03-01| 82,548.8| 84,353.4| 94,986.5|76,677.1| -2.17%|
|2025-02-01| 84,381.2|102,421.3|102,770.8|78,329.8|-17.62%|
|2025-01-01|102,424.2| 93,557.3|109,228.6|89,664.8|  9.48%|
|2024-12-01| 93,557.2| 96,404.7|108,244.9|91,522.3| -2.95%|
|2024-11-01| 96,405.7| 70,278.7| 99,617.4|66,834.0| 37.17%|
|2024-10-01| 70,281.8| 63,329.9| 73,569.4|59,075.7| 10.96%|
|2024-09-01| 63,339.2| 58,975.7| 66,440.7|52,644.6|  7.39%|
|2024-08-01| 58,978.6| 64,625.7| 65,587.9|49,486.9| -8.74%|
|2024-07-01| 64,626.0| 62,768.8| 70,000.2|53,883.4|  2.98%|
|2024-06-01| 62,754.3| 67,533.9| 71,956.5|58,589.9| -7.07%|
|2024-05-01| 67,530.1| 60,665.0| 71,872.0|56,643.5| 11.31%|
|2024-04-01| 60,666.6| 71,329.3| 72,710.8|59,228.7|-14.95%|
|2024-03-01| 71,332.0| 61,157.3| 73,740.

In [40]:
df_btc = df_btc.withColumn(
    "block_reward",
    F.when(F.col("Date") <= F.lit("2012-11-01"), 50)
     .when((F.col("Date") > F.lit("2012-11-01")) & (F.col("Date") <= F.lit("2016-06-01")), 25)
     .when((F.col("Date") > F.lit("2016-06-01")) & (F.col("Date") <= F.lit("2020-04-01")), 12.5)
     .when((F.col("Date") > F.lit("2020-04-01")) & (F.col("Date") <= F.lit("2024-04-01")), 6.25)
     .otherwise(3.125)
)

In [41]:
df_btc.orderBy(F.col("Date").desc()).toPandas().head(13)

Unnamed: 0,Date,Close,Open,High,Low,Change,block_reward
0,2025-04-01,84931.6,82548.6,88377.3,74524.2,2.89%,3.125
1,2025-03-01,82548.8,84353.4,94986.5,76677.1,-2.17%,3.125
2,2025-02-01,84381.2,102421.3,102770.8,78329.8,-17.62%,3.125
3,2025-01-01,102424.2,93557.3,109228.6,89664.8,9.48%,3.125
4,2024-12-01,93557.2,96404.7,108244.9,91522.3,-2.95%,3.125
5,2024-11-01,96405.7,70278.7,99617.4,66834.0,37.17%,3.125
6,2024-10-01,70281.8,63329.9,73569.4,59075.7,10.96%,3.125
7,2024-09-01,63339.2,58975.7,66440.7,52644.6,7.39%,3.125
8,2024-08-01,58978.6,64625.7,65587.9,49486.9,-8.74%,3.125
9,2024-07-01,64626.0,62768.8,70000.2,53883.4,2.98%,3.125


## Block Reward Column

To account for Bitcoin’s changing monetary policy, I added a `block_reward` column that reflects the number of BTC miners received per block over time.

---

### Halving Schedule:

| Halving | Date             | Block Reward |
|---------|------------------|--------------|
| 1st     | Nov 28, 2012     | 25 BTC       |
| 2nd     | Jul 9, 2016      | 12.5 BTC     |
| 3rd     | May 11, 2020     | 6.25 BTC     |
| 4th     | Apr 20, 2024     | 3.125 BTC    |

---

### Logic:

- My dataset is monthly (each row = 1st day of the month).
- Since the 2012 and 2024 halvings occurred at the **end of the month**, I assigned those entire months (`2012-11` and `2024-04`) to the **previous reward**.
- This better reflects the fact that most of the blocks mined during those months still followed the old reward.

---

### Summary:

| Period                 | Assigned Block Reward |
|------------------------|------------------------|
| ≤ November 2012        | 50 BTC                 |
| December 2012 – June 2016 | 25 BTC             |
| July 2016 – April 2020    | 12.5 BTC            |
| May 2020 – April 2024     | 6.25 BTC            |
| ≥ May 2024               | 3.125 BTC            |

This column will help in analyzing how price and volatility behave across halving cycles.


In [42]:
df_btc.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Close: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Change: string (nullable = true)
 |-- block_reward: double (nullable = false)



In [43]:
cols_to_convert = ["Close", "Open", "High", "Low"]

for col in cols_to_convert:
    df_btc = df_btc.withColumn(col, F.regexp_replace(F.col(col), ",", "").cast(DoubleType()))

df_btc = df_btc.withColumn(
    "Change",
    F.regexp_replace(F.col("Change"), "%", "").cast(DoubleType())
)
df_btc.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Close: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Change: double (nullable = true)
 |-- block_reward: double (nullable = false)



In [44]:
df_btc.show(10)

+----------+--------+--------+--------+-------+------+------------+
|      Date|   Close|    Open|    High|    Low|Change|block_reward|
+----------+--------+--------+--------+-------+------+------------+
|2025-04-01| 84931.6| 82548.6| 88377.3|74524.2|  2.89|       3.125|
|2025-03-01| 82548.8| 84353.4| 94986.5|76677.1| -2.17|       3.125|
|2025-02-01| 84381.2|102421.3|102770.8|78329.8|-17.62|       3.125|
|2025-01-01|102424.2| 93557.3|109228.6|89664.8|  9.48|       3.125|
|2024-12-01| 93557.2| 96404.7|108244.9|91522.3| -2.95|       3.125|
|2024-11-01| 96405.7| 70278.7| 99617.4|66834.0| 37.17|       3.125|
|2024-10-01| 70281.8| 63329.9| 73569.4|59075.7| 10.96|       3.125|
|2024-09-01| 63339.2| 58975.7| 66440.7|52644.6|  7.39|       3.125|
|2024-08-01| 58978.6| 64625.7| 65587.9|49486.9| -8.74|       3.125|
|2024-07-01| 64626.0| 62768.8| 70000.2|53883.4|  2.98|       3.125|
+----------+--------+--------+--------+-------+------+------------+
only showing top 10 rows



In [45]:
df_extended = spark.read \
    .option("header", True) \
    .option("sep", ",") \
    .option("inferSchema", True) \
    .csv('../data/final_df_extended.csv')

df_extended = df_extended.withColumn("Date", 
    F.to_date(F.concat(F.col("month"), F.lit("-01")), "yyyy-MM-dd"))

df_extended = df_extended.drop("month")


In [46]:
df_extended.show(10)

Py4JJavaError: An error occurred while calling o565.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 51.0 failed 1 times, most recent failure: Lost task 0.0 in stage 51.0 (TID 45) (DESKTOP-1JVP2OE executor driver): org.apache.spark.SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER] You may get a different result due to the upgrading to Spark >= 3.0:
Fail to parse '2014-09-01 00:00:00-01' in the new parser. You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0, or set to "CORRECTED" and treat it as an invalid datetime string.
	at org.apache.spark.sql.errors.ExecutionErrors.failToParseDateTimeInNewParserError(ExecutionErrors.scala:54)
	at org.apache.spark.sql.errors.ExecutionErrors.failToParseDateTimeInNewParserError$(ExecutionErrors.scala:48)
	at org.apache.spark.sql.errors.ExecutionErrors$.failToParseDateTimeInNewParserError(ExecutionErrors.scala:218)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:142)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:135)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:195)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.time.format.DateTimeParseException: Text '2014-09-01 00:00:00-01' could not be parsed, unparsed text found at index 10
	at java.base/java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:2049)
	at java.base/java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1874)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:193)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at jdk.internal.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER] You may get a different result due to the upgrading to Spark >= 3.0:
Fail to parse '2014-09-01 00:00:00-01' in the new parser. You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0, or set to "CORRECTED" and treat it as an invalid datetime string.
	at org.apache.spark.sql.errors.ExecutionErrors.failToParseDateTimeInNewParserError(ExecutionErrors.scala:54)
	at org.apache.spark.sql.errors.ExecutionErrors.failToParseDateTimeInNewParserError$(ExecutionErrors.scala:48)
	at org.apache.spark.sql.errors.ExecutionErrors$.failToParseDateTimeInNewParserError(ExecutionErrors.scala:218)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:142)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:135)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:195)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.time.format.DateTimeParseException: Text '2014-09-01 00:00:00-01' could not be parsed, unparsed text found at index 10
	at java.base/java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:2049)
	at java.base/java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1874)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:193)
	... 20 more


In [101]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import Window

# Fechas de los halvings
halvings = {
    "2012-11-01": 25,
    "2016-07-01": 12.5,
    "2020-05-01": 6.25,
    "2024-04-01": 3.125
}

# Ventanas que quieres analizar
months_after = [1, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 38, 46]

results = []

for halving_date, reward in halvings.items():
    halving_close = df_btc.filter(F.col("Date") == halving_date).select("Close").collect()[0][0]

    for m in months_after:
        target_date = pd.to_datetime(halving_date) + pd.DateOffset(months=m)
        target_date = target_date.strftime("%Y-%m-01")  # formatear como string yyyy-MM-01

        df_target = df_btc.filter(F.col("Date") == F.lit(target_date))

        if df_target.count() > 0:
            target_close = df_target.select("Close").collect()[0][0]
            ret = (target_close / halving_close - 1) * 100
            results.append([halving_date, reward, m, ret])
        else:
            results.append([halving_date, reward, m, None])  # No hay data todavía

# Crear pandas dataframe
df_performance = pd.DataFrame(results, columns=["halving_date", "block_reward", "months_after", "return_%"])

display(df_performance)


Unnamed: 0,halving_date,block_reward,months_after,return_%
0,2012-11-01,25.0,1,7.142857
1,2012-11-01,25.0,3,165.079365
2,2012-11-01,25.0,6,922.222222
3,2012-11-01,25.0,9,1019.047619
4,2012-11-01,25.0,12,9469.047619
5,2012-11-01,25.0,15,4454.761905
6,2012-11-01,25.0,18,4883.333333
7,2012-11-01,25.0,21,3723.809524
8,2012-11-01,25.0,24,2875.396825
9,2012-11-01,25.0,27,1916.666667


In [108]:
import pandas as pd

# Precio base (Close justo antes del halving - marzo 2024)
precio_base = 60666.60

# Ratio de reducción de retornos entre ciclo 2020 y 2024
reduction_ratio = 294.49 / 40.00  # ≈ 7.36

# Retornos reales ciclo 2020 desde mes 1 hasta 46
returns_2020 = {
    1: -3.38,
    3: 23.16,
    6: 108.34,
    9: 377.68,
    12: 294.49,
    15: 398.48,
    18: 501.63,
    21: 356.79,
    24: 236.27,
    27: 111.99,
    30: 81.54,
    38: 209.18,
    46: 654.45
}

# Aplicar reducción para predecir retornos 2024
retornos_2024 = {k: round(v / reduction_ratio, 2) for k, v in returns_2020.items()}

# Calcular precio final estimado
resultados = []
for meses, retorno in retornos_2024.items():
    precio_estimado = round(precio_base * (1 + retorno / 100), 2)
    resultados.append({
        'months_after': meses,
        'return_%': retorno,
        'projected_price_usd': precio_estimado
    })

# Crear DataFrame
df_prediccion_2024 = pd.DataFrame(resultados).sort_values('months_after').reset_index(drop=True)

# Mostrar tabla final
display(df_prediccion_2024)


Unnamed: 0,months_after,return_%,projected_price_usd
0,1,-0.46,60387.53
1,3,3.15,62577.6
2,6,14.72,69596.72
3,9,51.3,91788.57
4,12,40.0,84933.24
5,15,54.12,93499.36
6,18,68.14,102004.82
7,21,48.46,90065.63
8,24,32.09,80134.51
9,27,15.21,69893.99
