In [1]:
import numpy as np
import pandas as pd


In [2]:
data_df = pd.read_csv('Merged_Data_202101_202303_v3.csv')
data_df = data_df.iloc[: , 1:]
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3412875 entries, 0 to 3412874
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   station_id            int64  
 1   Net_Flow              int64  
 2   Split_hour            int64  
 3   is_holiday            bool   
 4   feels_like            float64
 5   weather_Clear         int64  
 6   weather_Clouds        int64  
 7   weather_Drizzle       int64  
 8   weather_Fog           int64  
 9   weather_Haze          int64  
 10  weather_Mist          int64  
 11  weather_Rain          int64  
 12  weather_Smoke         int64  
 13  weather_Snow          int64  
 14  weather_Thunderstorm  int64  
 15  day_of_week           int64  
dtypes: bool(1), float64(1), int64(14)
memory usage: 393.8 MB


In [3]:
data_df[:5]

Unnamed: 0,station_id,Net_Flow,Split_hour,is_holiday,feels_like,weather_Clear,weather_Clouds,weather_Drizzle,weather_Fog,weather_Haze,weather_Mist,weather_Rain,weather_Smoke,weather_Snow,weather_Thunderstorm,day_of_week
0,1,-1,22,False,-3.42,0,1,0,0,0,0,0,0,0,0,3
1,1,-1,21,False,-5.97,0,0,0,0,0,0,0,0,1,0,3
2,1,-1,16,False,-4.41,0,0,0,0,0,0,1,0,0,0,2
3,1,-1,16,False,-1.97,0,0,0,0,0,0,1,0,0,0,3
4,1,-1,15,False,-0.96,0,1,0,0,0,0,0,0,0,0,6


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = data_df[['station_id', 'Split_hour', 'day_of_week', 'is_holiday', 'feels_like']]
y = data_df['Net_Flow']


#X = pd.get_dummies(X, columns=['station_id', 'Split_hour', 'day_of_week', 'is_holiday', 'feels_like'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


In [9]:
from sklearn.metrics import mean_squared_error
import time

n_estimators_range = [10, 20, 100]  
max_depth_range = [5, 10, 15]    

best_mse = float('inf')
best_n = 0
best_depth = 0

for n in n_estimators_range:
    for depth in max_depth_range:
        start_time = time.time()
        model = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        time_inter = time.time() - start_time
        print(f"n_estimators: {n}, max_depth: {depth}, MSE: {mse: .4f}, Time: {time_inter: .4f}s")

        if mse < best_mse:
            best_mse = mse
            best_n = n
            best_depth = depth

print(f"Best MSE: {best_mse} with n_estimators = {best_n} and max_depth = {best_depth}")


n_estimators: 10, max_depth: 5, MSE:  7.0777, Time:  11.1475s
n_estimators: 10, max_depth: 10, MSE:  6.4625, Time:  20.6535s
n_estimators: 10, max_depth: 15, MSE:  6.1656, Time:  28.9572s
n_estimators: 20, max_depth: 5, MSE:  7.0779, Time:  22.2771s
n_estimators: 20, max_depth: 10, MSE:  6.4590, Time:  41.2716s
n_estimators: 20, max_depth: 15, MSE:  6.1401, Time:  57.8288s
n_estimators: 100, max_depth: 5, MSE:  7.0775, Time:  111.7889s
n_estimators: 100, max_depth: 10, MSE:  6.4559, Time:  206.7005s
n_estimators: 100, max_depth: 15, MSE:  6.1205, Time:  290.3190s
Best MSE: 6.1205495863304815 with n_estimators = 100 and max_depth = 15


In [None]:

next_24_hours = pd.date_range(start=merged_df.index[-1], periods=24, freq='H')

next_24_hours_data = {
    'station_id': [1]*24,  
    'hour': next_24_hours.hour,
    'day_of_week': next_24_hours.dayofweek,
    'is_holiday': next_24_hours.isin([pd.to_datetime(date) for date in holidays])
}
next_24_hours_df = pd.DataFrame(next_24_hours_data)

all_categories = pd.get_dummies(X_train).columns


next_24_hours_df = pd.get_dummies(next_24_hours_df)

next_24_hours_df = next_24_hours_df.reindex(columns=all_categories, fill_value=0)

predictions = model.predict(next_24_hours_df)

print(predictions)

[-0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017
 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017
 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017
 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017 -0.04601017]


In [None]:
#data_df.to_csv('Merged_Data_202101_202303_v3.csv')

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("RandomForestTraining") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "16g") \
    .config("spark.memory.fraction", "0.9") \
    .getOrCreate()


24/04/19 20:31:07 WARN Utils: Your hostname, yuzhuorandeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.4.42 instead (on interface en0)
24/04/19 20:31:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/19 20:31:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = spark.read.csv('Merged_Data_202101_202303_v3.csv', header=True, inferSchema=True)
df = df.select("station_id", "Split_hour", "day_of_week", "is_holiday", "feels_like", "Net_Flow")
df.printSchema()

24/04/19 20:31:19 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

root
 |-- station_id: integer (nullable = true)
 |-- Split_hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- is_holiday: boolean (nullable = true)
 |-- feels_like: double (nullable = true)
 |-- Net_Flow: integer (nullable = true)



                                                                                

In [3]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

df = df.withColumn("is_holiday", col("is_holiday").cast("string"))
df = df.withColumn("Split_hour", col("Split_hour").cast("string"))

# First, ensure any categorical features are indexed if they're not already
indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index")
    for column in ["station_id", "Split_hour", "day_of_week", "is_holiday"]
]
# Now create a VectorAssembler to combine all features into one vector
assembler = VectorAssembler(
    inputCols=["station_id_index", "Split_hour_index", "day_of_week_index", "is_holiday_index", "feels_like"],
    outputCol="features"
)

pipeline = Pipeline(stages=indexers + [assembler])

model = pipeline.fit(df)
transformed_df = model.transform(df)

(train_data, test_data) = transformed_df.randomSplit([0.6, 0.4], seed=42)


                                                                                

In [4]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
# Possible values for numTrees and maxDepth
numTrees_list = [30]
maxDepth_list = [5, 10, 15]

# Initialize the evaluator
evaluator_mse = RegressionEvaluator(labelCol="Net_Flow", predictionCol="prediction", metricName="mse")

import pandas as pd
results = pd.DataFrame(columns=["numTrees", "maxDepth", "MSE", "Time"])

for num in numTrees_list:
    for depth in maxDepth_list:
        start_time = time.time()
        rf = RandomForestRegressor(featuresCol="features", labelCol="Net_Flow", numTrees=num, maxDepth=depth, maxBins=1000, seed=42)
        rf_model = rf.fit(train_data)
        
        # Make predictions
        predictions = rf_model.transform(test_data)
        
        # Evaluate the model
        mse = evaluator_mse.evaluate(predictions)
        time_inter = time.time() - start_time
        print(f"numTrees: {num}, maxDepth: {depth}, MSE: {mse: .4f}, Time: {time_inter: .4f}s")
        # Print and collect the results
        new_row = pd.DataFrame({"numTrees": [num], "maxDepth": [depth], "MSE": [mse], "Time": [time_inter]})
        results = pd.concat([results, new_row], ignore_index=True)

# Display results
#print(results)


  results = pd.concat([results, new_row], ignore_index=True)


numTrees: 30, maxDepth: 5, MSE:  6.8612, Time:  30.4737s


24/04/19 20:32:29 WARN DAGScheduler: Broadcasting large task binary with size 1374.0 KiB
24/04/19 20:32:35 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
24/04/19 20:32:42 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/04/19 20:32:50 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
24/04/19 20:33:04 WARN DAGScheduler: Broadcasting large task binary with size 11.2 MiB
24/04/19 20:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1333.8 KiB
                                                                                

numTrees: 30, maxDepth: 10, MSE:  6.1127, Time:  92.3623s


24/04/19 20:34:18 WARN DAGScheduler: Broadcasting large task binary with size 1374.0 KiB
24/04/19 20:34:25 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
24/04/19 20:34:34 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/04/19 20:34:44 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
24/04/19 20:34:57 WARN DAGScheduler: Broadcasting large task binary with size 11.2 MiB
24/04/19 20:35:18 WARN DAGScheduler: Broadcasting large task binary with size 1333.8 KiB
24/04/19 20:35:22 WARN DAGScheduler: Broadcasting large task binary with size 13.6 MiB
24/04/19 20:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1803.1 KiB
24/04/19 20:35:39 WARN DAGScheduler: Broadcasting large task binary with size 12.8 MiB
24/04/19 20:35:50 WARN DAGScheduler: Broadcasting large task binary with size 1811.1 KiB
24/04/19 20:35:53 WARN DAGScheduler: Broadcasting large task binary with size 12.8 MiB
24/04/19 20:36:00 WARN DAGScheduler: B

numTrees: 30, maxDepth: 15, MSE:  5.8480, Time:  420.5069s


                                                                                

### Spark with weather data

In [15]:
results

Unnamed: 0,numTrees,maxDepth,MSE,Time
0,10,5,6.908905,23.80379
1,10,10,6.159883,34.212571
2,10,15,5.934817,104.68024
3,20,5,6.894684,26.5304
4,20,10,6.144721,56.503823
5,20,15,5.867577,239.560308
6,100,5,6.854832,103.332939


In [6]:
results

Unnamed: 0,numTrees,maxDepth,MSE,Time
0,30,5,6.861225,30.473653
1,30,10,6.112688,92.362313
2,30,15,5.847989,420.506886


### Spark without weather data

In [None]:
print(results)##without weather

   numTrees maxDepth       MSE
0         5        3  7.155212
1         5        5  6.779841
2         5       10  6.269571
3         5       15  6.215765
4        10        3  7.148010
5        10        5  6.866187
6        10       10  6.261236
7        10       15  6.197606
8        20        3  7.095159
9        20        5  6.787787
10       20       10  6.257980
11       20       15  6.187825


In [8]:
#spark.stop()

#