In [1]:
import os

In [None]:
os.chdir("../")
%pwd

In [2]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the train / validation / test CSV splits.
# Set the PUBLICATIONS_DATA_DIR environment variable to load the data from
# another location; the fallback is the path this notebook was originally
# written against, so behaviour is unchanged when the variable is not set.
DATA_DIR = Path(os.environ.get(
    'PUBLICATIONS_DATA_DIR',
    '/Users/macbookpro/Documents/predict_publications/publications_prediction/data',
))

test_data = pd.read_csv(DATA_DIR / 'test_data.csv')
train_data = pd.read_csv(DATA_DIR / 'train_data.csv')
validation_data = pd.read_csv(DATA_DIR / 'validation_data.csv')

In [3]:
# Interpret 'timestamp' as seconds since the epoch, store the resulting
# datetime in 'date', and keep its hour-of-day in 'hour'.
train_data['date'] = pd.to_datetime(train_data['timestamp'], unit='s')
train_data['hour'] = train_data['date'].dt.hour

# Every per-publication metric is reduced with the same statistic: the mean.
metric_columns = [
    'likescount',
    'commentscount',
    'symbols_cnt',
    'words_cnt',
    'hashtags_cnt',
    'mentions_cnt',
    'links_cnt',
    'emoji_cnt',
]
agg_columns = dict.fromkeys(metric_columns, 'mean')

# One output row per distinct (timestamp, lon, lat, point, hour) combination.
group_keys = ['timestamp', 'lon', 'lat', 'point', 'hour']
grouped_data = train_data.groupby(group_keys).agg(agg_columns).reset_index()
grouped_data.head()


Unnamed: 0,timestamp,lon,lat,point,hour,likescount,commentscount,symbols_cnt,words_cnt,hashtags_cnt,mentions_cnt,links_cnt,emoji_cnt
0,1546300800,0.0,0.0,0101000020E61000000000000000000000000000000000...,0,31.666667,1.666667,51.333333,2.0,2.0,0.0,0.0,0.0
1,1546300800,30.136232,60.000054,0101000020E6100000B8E59619E0223E40ABB649C80100...,0,52.0,1.0,28.0,0.5,2.0,0.0,0.0,0.5
2,1546300800,30.138478,59.835705,0101000020E610000077D0A94773233E4097654065F8EA...,0,32.0,0.333333,46.0,2.333333,3.0,0.0,0.0,1.333333
3,1546300800,30.142969,60.023627,0101000020E6100000F5A5CFA399243E400B9A5B330603...,0,77.666667,3.333333,34.666667,2.666667,0.666667,0.0,0.0,1.666667
4,1546300800,30.142969,60.030359,0101000020E6100000F5A5CFA399243E40854A58CAE203...,0,19.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Count the publications in every (timestamp, lon, lat, point, hour) group and
# attach the counts to grouped_data.
# The counts are merged on the grouping keys instead of being assigned
# positionally through `.values`: a positional assignment is only correct
# while two independent groupby calls happen to emit their rows in the same
# order, and nothing would flag a mismatch.
count_keys = ['timestamp', 'lon', 'lat', 'point', 'hour']
publication_counts = (
    train_data.groupby(count_keys)
    .size()
    .rename('publication_count')
    .reset_index()
)
grouped_data = (
    grouped_data
    # Drop a count column left by a previous run so the cell stays re-runnable.
    .drop(columns='publication_count', errors='ignore')
    # validate raises if either side has duplicate key combinations.
    .merge(publication_counts, on=count_keys, how='left', validate='one_to_one')
)
grouped_data.head()

Unnamed: 0,timestamp,lon,lat,point,hour,likescount,commentscount,symbols_cnt,words_cnt,hashtags_cnt,mentions_cnt,links_cnt,emoji_cnt,publication_count
0,1546300800,0.0,0.0,0101000020E61000000000000000000000000000000000...,0,31.666667,1.666667,51.333333,2.0,2.0,0.0,0.0,0.0,3
1,1546300800,30.136232,60.000054,0101000020E6100000B8E59619E0223E40ABB649C80100...,0,52.0,1.0,28.0,0.5,2.0,0.0,0.0,0.5,2
2,1546300800,30.138478,59.835705,0101000020E610000077D0A94773233E4097654065F8EA...,0,32.0,0.333333,46.0,2.333333,3.0,0.0,0.0,1.333333,3
3,1546300800,30.142969,60.023627,0101000020E6100000F5A5CFA399243E400B9A5B330603...,0,77.666667,3.333333,34.666667,2.666667,0.666667,0.0,0.0,1.666667,3
4,1546300800,30.142969,60.030359,0101000020E6100000F5A5CFA399243E40854A58CAE203...,0,19.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
# Target: the number of publications in each aggregated row.
target_column = 'publication_count'
y_train = grouped_data[target_column]

# Predictors: every remaining column except the raw 'timestamp' (strongly
# correlated with the other time features, so it may leak information) and
# 'point'.
X_train = grouped_data.drop(columns=[target_column, 'timestamp', 'point'])

In [6]:
# In test_data the 'hour' column arrives as a Unix timestamp in seconds.
# Park the parsed datetime in a temporary 'date' column and remove the raw
# column, so that the hour-of-day can take over the 'hour' name below.
# NOTE(review): test_data is modified in place; re-running this cell without
# reloading the CSV would treat the already-extracted hour-of-day values as
# timestamps.
test_data['date'] = pd.to_datetime(test_data['hour'], unit='s')
test_data.drop(columns=['hour'], inplace=True)

# Calendar features, each read from the matching attribute of the `.dt` accessor.
for calendar_part in ('hour', 'day', 'dayofweek', 'month'):
    test_data[calendar_part] = getattr(test_data['date'].dt, calendar_part)

# The temporary datetime column has served its purpose.
test_data.drop(columns=['date'], inplace=True)

# Index both frames by 'point' for the per-point lookup that follows.
for frame in (train_data, test_data):
    frame.set_index('point', inplace=True)

# Metrics that exist in the training data and get copied onto the test rows.
features_to_create = [
    'likescount',
    'commentscount',
    'symbols_cnt',
    'words_cnt',
    'hashtags_cnt',
    'mentions_cnt',
    'links_cnt',
    'emoji_cnt',
]

# Per-point median of every metric, computed from the training data.
aggregated_data = train_data[features_to_create].groupby(level='point').median()

# Left join: every test row is kept and receives the medians of its point.
test_data = test_data.join(aggregated_data, on='point', how='left')

# Turn 'point' back into an ordinary column on both frames.
for frame in (train_data, test_data):
    frame.reset_index(inplace=True)

# Target is 'sum'; the predictors follow X_train's columns, in X_train's order.
y_test = test_data['sum']
X_test = test_data.drop(columns=['sum', 'point', 'error'])[X_train.columns]


# Modelling

In [9]:
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np

# NOTE(review): y_train is grouped_data['publication_count'], i.e. one value
# per (timestamp, lon, lat, point, hour) group rather than a single hourly
# series with a datetime index -- confirm that a 24-step season is meaningful
# for this input before relying on the numbers below.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# the saved run did not complete.
SEASONAL_PERIOD = 24  # season length passed to both models; intended as hours per day

# 1. Additive decomposition of the training target, plotted for inspection.
decomposition = sm.tsa.seasonal_decompose(
    y_train,
    model='additive',
    period=SEASONAL_PERIOD,
)
fig = decomposition.plot()

# 2. Exponential smoothing with additive trend and additive seasonality.
model = ExponentialSmoothing(
    y_train,
    trend='add',
    seasonal='add',
    seasonal_periods=SEASONAL_PERIOD,
)
fit = model.fit()

# 3. One forecast step for every row of the test target.
forecast_values = fit.forecast(steps=len(y_test))

# 4. Score the forecast against the test target (MSE and its square root).
mse_es = mean_squared_error(y_test, forecast_values)
rmse_es = np.sqrt(mse_es)

mse_es, rmse_es



  return err.T @ err


KeyboardInterrupt: 