In [1]:
import os

In [2]:
# Step one directory up from wherever the kernel was started and display the
# resulting working directory.
# Caveat: this is relative to the current directory, so every additional run
# of this cell climbs one more level.
os.chdir(os.pardir)
os.getcwd()

'/Users/macbookpro/Documents/predict_publications/publications_prediction'

In [3]:
from pathlib import Path

import pandas as pd

# The CSV files live in the 'data' folder of the project root. The earlier
# os.chdir cell makes the project root the working directory, so resolving the
# folder relative to it keeps the notebook runnable on any machine instead of
# only on the one whose absolute home path was hard-coded here.
DATA_DIR = Path.cwd() / "data"
if not DATA_DIR.is_dir():
    # Fail early with a readable message rather than a read_csv traceback.
    raise FileNotFoundError(
        f"Data directory not found at {DATA_DIR}; "
        "the working directory must be the project root."
    )

test_data = pd.read_csv(DATA_DIR / "test_data.csv")
train_data = pd.read_csv(DATA_DIR / "train_data.csv")
validation_data = pd.read_csv(DATA_DIR / "validation_data.csv")

In [4]:
# Interpret 'timestamp' as seconds since the epoch and keep the hour of day.
train_data['date'] = pd.to_datetime(train_data['timestamp'], unit='s')
train_data['hour'] = train_data['date'].dt.hour

# Every engagement / text-statistic column is averaged within a group.
mean_features = [
    'likescount',
    'commentscount',
    'symbols_cnt',
    'words_cnt',
    'hashtags_cnt',
    'mentions_cnt',
    'links_cnt',
    'emoji_cnt',
]
agg_columns = dict.fromkeys(mean_features, 'mean')

# One output row per (timestamp, lon, lat, point, hour) combination.
group_keys = ['timestamp', 'lon', 'lat', 'point', 'hour']
grouped_data = train_data.groupby(group_keys).agg(agg_columns).reset_index()
grouped_data.head()


Unnamed: 0,timestamp,lon,lat,point,hour,likescount,commentscount,symbols_cnt,words_cnt,hashtags_cnt,mentions_cnt,links_cnt,emoji_cnt
0,1546300800,0.0,0.0,0101000020E61000000000000000000000000000000000...,0,31.666667,1.666667,51.333333,2.0,2.0,0.0,0.0,0.0
1,1546300800,30.136232,60.000054,0101000020E6100000B8E59619E0223E40ABB649C80100...,0,52.0,1.0,28.0,0.5,2.0,0.0,0.0,0.5
2,1546300800,30.138478,59.835705,0101000020E610000077D0A94773233E4097654065F8EA...,0,32.0,0.333333,46.0,2.333333,3.0,0.0,0.0,1.333333
3,1546300800,30.142969,60.023627,0101000020E6100000F5A5CFA399243E400B9A5B330603...,0,77.666667,3.333333,34.666667,2.666667,0.666667,0.0,0.0,1.666667
4,1546300800,30.142969,60.030359,0101000020E6100000F5A5CFA399243E40854A58CAE203...,0,19.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Number of publications in each group. The counts are attached by key with a
# merge instead of being pasted in positionally, so the result does not depend
# on two separate groupby calls happening to produce the same row order.
count_keys = ['timestamp', 'lon', 'lat', 'point', 'hour']
publication_counts = (
    train_data.groupby(count_keys)
    .size()
    .rename('publication_count')
    .reset_index()
)

grouped_data = (
    grouped_data
    # Dropping a previous result first keeps the cell safe to re-run.
    .drop(columns='publication_count', errors='ignore')
    # validate= raises if either side has duplicated key combinations.
    .merge(publication_counts, on=count_keys, how='left', validate='one_to_one')
)
grouped_data.head()

Unnamed: 0,timestamp,lon,lat,point,hour,likescount,commentscount,symbols_cnt,words_cnt,hashtags_cnt,mentions_cnt,links_cnt,emoji_cnt,publication_count
0,1546300800,0.0,0.0,0101000020E61000000000000000000000000000000000...,0,31.666667,1.666667,51.333333,2.0,2.0,0.0,0.0,0.0,3
1,1546300800,30.136232,60.000054,0101000020E6100000B8E59619E0223E40ABB649C80100...,0,52.0,1.0,28.0,0.5,2.0,0.0,0.0,0.5,2
2,1546300800,30.138478,59.835705,0101000020E610000077D0A94773233E4097654065F8EA...,0,32.0,0.333333,46.0,2.333333,3.0,0.0,0.0,1.333333,3
3,1546300800,30.142969,60.023627,0101000020E6100000F5A5CFA399243E400B9A5B330603...,0,77.666667,3.333333,34.666667,2.666667,0.666667,0.0,0.0,1.666667,3
4,1546300800,30.142969,60.030359,0101000020E6100000F5A5CFA399243E40854A58CAE203...,0,19.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [6]:
# The target is the per-group publication count. 'timestamp' is left out of the
# features because it is strongly correlated with the other time features and
# may cause data leakage; 'point' is left out as well.
target_column = 'publication_count'
y_train = grouped_data[target_column]
X_train = grouped_data.drop(columns=[target_column, 'timestamp', 'point'])

In [7]:
# The raw test file stores a unix timestamp (seconds) in its 'hour' column.
# Derive the calendar features from it exactly once: the guard stops a re-run
# of this cell from re-interpreting the already converted hour-of-day values
# as timestamps. (Assumes the raw file has no 'dayofweek' column of its own.)
if 'dayofweek' not in test_data.columns:
    test_dates = pd.to_datetime(test_data['hour'], unit='s')
    test_data = test_data.assign(
        hour=test_dates.dt.hour,
        day=test_dates.dt.day,
        dayofweek=test_dates.dt.dayofweek,
        month=test_dates.dt.month,
    )

# Features that exist only in the training data and have to be recreated for
# the test rows.
features_to_create = ['likescount', 'commentscount', 'symbols_cnt', 'words_cnt',
                      'hashtags_cnt', 'mentions_cnt', 'links_cnt', 'emoji_cnt']

# Median of every feature per 'point', computed on the training data. Grouping
# on the column directly avoids temporarily re-indexing both frames in place.
aggregated_data = train_data.groupby('point')[features_to_create].median()

# Attach the per-point medians to the test rows. Any feature columns left over
# from a previous run are dropped first so the join never collides with them.
test_data = (
    test_data
    .drop(columns=features_to_create, errors='ignore')
    .join(aggregated_data, on='point', how='left')
)

# Test points that never occur in the training data come out of the left join
# as NaN, which would flow through scaling into the network. Fall back to
# the overall training median for those rows.
test_data[features_to_create] = test_data[features_to_create].fillna(
    train_data[features_to_create].median()
)

# Same feature columns, in the same order, as the training matrix.
X_test = test_data[X_train.columns]
y_test = test_data['sum']


In [8]:
# Modelling

In [9]:
from keras.models import Sequential, load_model
from keras.layers import SimpleRNN, Dense, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from numpy.lib.stride_tricks import as_strided
import matplotlib.pyplot as plt
import numpy as np

In [10]:
# ---- [1. Data Scaling] ----
def scale_data(X_train, X_test):
    """
    Min-max scale the training and test features.

    The scaler is fitted on X_train only (its range is mapped to [0, 1]) and
    the same transformation is then applied to X_test.

    Returns a tuple (scaled X_train, scaled X_test, fitted scaler).
    """
    minmax = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
    return minmax.transform(X_train), minmax.transform(X_test), minmax

In [11]:
# ---- [2. Creating a Windowed Dataset] ----
def create_windowed_dataset(data, window_size):
    """
    Slice a sequence of rows into overlapping windows and their next-row targets.

    Window i consists of rows i .. i + window_size - 1 and its target is row
    i + window_size, so len(data) - window_size pairs are produced.

    Parameters:
    - data: array-like whose first axis is the sequence axis.
    - window_size: number of consecutive rows per window (must be >= 1).

    Returns:
    - X: array of shape (n_windows, window_size, *data.shape[1:])
    - y: array of shape (n_windows, *data.shape[1:])

    When data has window_size rows or fewer, correctly shaped empty arrays are
    returned so callers can still rely on the number of dimensions.

    Raises:
    - ValueError if window_size is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    data = np.asarray(data)
    trailing_shape = data.shape[1:]
    n_windows = len(data) - window_size

    if n_windows <= 0:
        empty_X = np.empty((0, window_size) + trailing_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + trailing_shape, dtype=data.dtype)
        return empty_X, empty_y

    # Row i of the index matrix holds the positions i, i + 1, ..., i + window_size - 1,
    # so a single fancy-indexing step gathers every window at once.
    window_index = np.arange(n_windows)[:, None] + np.arange(window_size)[None, :]
    X = data[window_index]
    # Copy so the targets never alias the caller's array.
    y = data[window_size:].copy()
    return X, y

In [12]:
# ---- [3. Model Building] ----
def build_rnn_model(input_shape, dropout_rate, l2_reg):
    """
    Assemble and compile the recurrent regression network.

    Architecture: two SimpleRNN layers of 50 ReLU units, each followed by
    dropout, and a single-unit Dense output. Both recurrent layers carry an
    L2 kernel penalty. The model is compiled with the 'adam' optimizer and
    mean squared error loss.

    Parameters:
    - input_shape: shape of one sample, passed to the first recurrent layer.
    - dropout_rate: rate used by both Dropout layers.
    - l2_reg: L2 factor for the recurrent layers' kernels.
    """
    network = Sequential([
        # The first recurrent layer returns the full sequence for the next one.
        SimpleRNN(50, activation='relu', return_sequences=True,
                  kernel_regularizer=l2(l2_reg), input_shape=input_shape),
        Dropout(dropout_rate),
        SimpleRNN(50, activation='relu', kernel_regularizer=l2(l2_reg)),
        Dropout(dropout_rate),
        Dense(1),
    ])
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

In [13]:
# ---- [4. Model Training] ----
def train_model(model, X_train, y_train, n_splits=5, epochs=50, verbose=0):
    """
    Fit the model fold by fold using scikit-learn's TimeSeriesSplit.

    For every split, model.fit is called on the training part with the
    held-out part supplied as validation data. The same model object is
    fitted on each fold in turn and is returned at the end.

    Parameters:
    - model: object exposing a Keras-style fit(X, y, epochs=..., validation_data=..., verbose=...).
    - X_train, y_train: array-likes with the same number of samples.
    - n_splits: number of TimeSeriesSplit folds (default 5).
    - epochs: epochs per fold (default 50).
    - verbose: verbosity forwarded to fit (default 0).

    Raises:
    - ValueError if X_train and y_train differ in length.
    """
    # The split yields positional indices; plain arrays guarantee positional
    # indexing even when pandas objects (label-based []) are passed in.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train and y_train must have the same length, "
            f"got {len(X_train)} and {len(y_train)}"
        )

    splitter = TimeSeriesSplit(n_splits=n_splits)
    for fit_idx, val_idx in splitter.split(X_train):
        model.fit(
            X_train[fit_idx],
            y_train[fit_idx],
            epochs=epochs,
            validation_data=(X_train[val_idx], y_train[val_idx]),
            verbose=verbose,
        )
    return model

In [14]:
# ---- [5. Model Evaluation] ----
def evaluate_model(y_true, y_pred):
    """
    Evaluate the model's performance.

    Both inputs are flattened to 1-D first, so a column-vector prediction of
    shape (n, 1) and a target of shape (n,) or (n, 1) are compared element by
    element instead of being broadcast against each other.

    Parameters:
    - y_true: array-like of actual values.
    - y_pred: array-like of predicted values with the same number of elements.

    Returns:
    - RMSE
    - Relative Error: mean of |y_pred - y_true| / (y_pred + epsilon); note the
      denominator is the prediction, not the actual value.

    Raises:
    - ValueError if the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    residuals = y_pred - y_true
    rmse = np.sqrt(np.mean(residuals ** 2))

    # epsilon keeps the division defined when a prediction is exactly zero.
    epsilon = 1e-10
    avg_relative_error = np.mean(np.abs(residuals) / (y_pred + epsilon))
    return rmse, avg_relative_error

In [15]:
# ---- [6. Visualization of Loss] ----
def plot_loss(model):
    """
    Draw the 'loss' and 'val_loss' curves stored in model.history.history.

    The x axis is the position within each recorded list (labelled 'Epoch').
    """
    recorded = model.history.history
    curves = (('loss', 'Train Loss'), ('val_loss', 'Validation Loss'))
    for history_key, legend_label in curves:
        plt.plot(recorded[history_key], label=legend_label)
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

In [16]:
# ---- [7. Visualization of Predicted vs Actual] ----
def plot_predictions(y_true, y_pred):
    """
    Overlay the actual and the predicted values on one 15x6 figure.

    Actual values are drawn in blue; predictions in semi-transparent red on
    top of them.
    """
    plt.figure(figsize=(15, 6))
    line_specs = (
        (y_true, {'label': 'Actual', 'color': 'blue'}),
        (y_pred, {'label': 'Predicted', 'color': 'red', 'alpha': 0.7}),
    )
    for values, line_kwargs in line_specs:
        plt.plot(values, **line_kwargs)
    plt.title('Actual vs Predicted Values')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

# --- Main Execution ---

In [18]:
# Number of consecutive rows fed to the network per sample.
window_size = 10

X_train_scaled, X_test_scaled, _ = scale_data(X_train, X_test)

# Only the input windows are taken from the windowing helper. Its second
# return value is the next *feature row*, which is not what the single-output
# network is meant to predict, so it is discarded.
X_train_windows, _next_train_rows = create_windowed_dataset(X_train_scaled, window_size)
X_test_windows, _next_test_rows = create_windowed_dataset(X_test_scaled, window_size)

# The target of window i (rows i .. i + window_size - 1) is the publication
# count of the row that follows it, i.e. row i + window_size.
y_train_windows = np.asarray(y_train)[window_size:]
y_test_windows = np.asarray(y_test)[window_size:]

# Inputs and targets must line up one-to-one.
assert len(X_train_windows) == len(y_train_windows)
assert len(X_test_windows) == len(y_test_windows)

In [19]:

# One sample is a window of `window_size` rows with one value per scaled feature.
rnn_input_shape = (window_size, X_train_scaled.shape[1])
model = build_rnn_model(rnn_input_shape, dropout_rate=0.2, l2_reg=0.001)
trained_model = train_model(model, X_train_windows, y_train_windows)


In [None]:
# Predict on the test windows and score the predictions against their targets.
y_pred = trained_model.predict(X_test_windows)
metrics = evaluate_model(y_test_windows, y_pred)
rmse, relative_error = metrics

print(f"RMSE: {rmse}")
print(f"Relative Error: {relative_error}")

# Loss curves first, then actual vs predicted values.
plot_loss(trained_model)
plot_predictions(y_test_windows, y_pred)