In [None]:
import pandas as pd
from pandas import DataFrame
from matplotlib import pyplot
from statsmodels.tsa.arima.model import ARIMA

# Load the Delhi daily climate training data; the path is relative to the
# notebook's working directory.
df = pd.read_csv('data/DailyDelhiClimateTrain.csv', low_memory=False)

# Only the final expression of a cell is displayed automatically, so every
# intermediate result must be printed explicitly. The summary statistics were
# previously computed and then silently discarded.
print(df.head())
print(df.describe())
# print number of rows
print("number of rows: " + str(df.shape[0]))


In [None]:
# Quick visual overview: pandas' default line plot of the DataFrame.
df.plot.line()
pyplot.show()

In [None]:
# Build a date-indexed temperature series WITHOUT mutating `df`.
# The previous in-place set_index removed the 'date' column from the shared
# DataFrame, so re-running this cell, or running the later cell that reads
# df['date'], raised a KeyError.
meantemp_by_date = (
    df.assign(date=pd.to_datetime(df['date']))
    .set_index('date')['meantemp']
)

# Fit an ARIMA model with order (p=5, d=1, q=0) on the full series
model = ARIMA(meantemp_by_date, order=(5,1,0))
model_fit = model.fit()
# summary of fit model
print(model_fit.summary())

# Residual diagnostics: wrap the residuals in a DataFrame so the pandas
# plotting and describe helpers can be used directly
residuals = DataFrame(model_fit.resid)
# line plot of residuals
residuals.plot()
pyplot.show()
# density plot of residuals
residuals.plot(kind='kde')
pyplot.show()
# summary stats of residuals
print(residuals.describe())

# Create model to predict temperature

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression
import shap

# Expects `df` (loaded in the first cell) to provide a 'meantemp' column and
# the observation dates, either as a 'date' column or as the index.

# Recover the timestamps wherever they currently live. The earlier ARIMA cell
# moves 'date' into the index, so unconditionally reading df['date'] raised a
# KeyError when the notebook was run top to bottom.
if 'date' in df.columns:
    # Ensure 'date' column is of datetime type
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].values
else:
    dates = pd.to_datetime(df.index).values

# Split into train and test sets (first 95% of observations for training)
X = df['meantemp'].values
labels = ['date', 'meantemp', 'humidity', 'wind_speed', 'meanpressure']
size = int(len(X) * 0.95)
train, test = X[:size], X[size:]
train_dates, test_dates = dates[:size], dates[size:]

history = list(train)
predictions = []

# Walk-forward validation: refit on everything seen so far, forecast one step
# ahead, then reveal the true observation before moving to the next step.
for t in range(len(test)):
    model = ARIMA(history, order=(5,1,0))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

# Evaluate forecasts visually: observed test values vs. one-step predictions
plt.figure(figsize=(12, 6))
plt.plot(test_dates, test, label='Test', color='blue')
plt.plot(test_dates, predictions, color='red', label='Predictions')
# Add description to plot
plt.title('Mean Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature')
plt.legend()
plt.show()

# Create and fit the final ARIMA model on the complete series
model = ARIMA(X, order=(5,1,0))
model_fit = model.fit()
# Generate lagged features for SHAP
def create_lagged_features(series, labels, lag=5):
    """Turn a sequence into sliding-window samples.

    For every position ``i`` from ``lag`` up to ``len(series) - 1`` the window
    ``series[i-lag:i]`` becomes one feature row and ``series[i]`` becomes the
    matching target. ``labels`` is sliced with the same ``[i-lag:i]`` bounds.

    NOTE(review): the label slices only line up with the value windows when
    ``labels`` has one entry per element of ``series`` -- confirm at the call
    site.

    Args:
        series: Indexable, sliceable sequence of values.
        labels: Sliceable sequence sliced in parallel with ``series``.
        lag: Window length, i.e. number of preceding values per sample.

    Returns:
        Tuple ``(X, y, feature_labels)``: an array of windows, an array of
        targets, and a list holding one label slice per sample.
    """
    positions = range(lag, len(series))
    windows = [series[pos - lag:pos] for pos in positions]
    targets = [series[pos] for pos in positions]
    label_windows = [labels[pos - lag:pos] for pos in positions]
    return np.array(windows), np.array(targets), label_windows

lag = 5
X_lagged, y_lagged, feature_labels = create_lagged_features(X, labels, lag=lag)

# Fit a simple linear regression model on lagged features
lr_model = LinearRegression()
lr_model.fit(X_lagged, y_lagged)

# Create SHAP explainer, using the lagged samples themselves as background data
explainer = shap.LinearExplainer(lr_model, X_lagged)
shap_values = explainer.shap_values(X_lagged)

# Every column of X_lagged is a past value of the same 'meantemp' series:
# column j holds the observation (lag - j) steps before the target. Name the
# features accordingly. The previous names were taken from the DataFrame's
# column list ('date', 'humidity', ...), which mislabelled the plot.
feature_names = ['meantemp (t-%d)' % steps_back for steps_back in range(lag, 0, -1)]

# Plot SHAP values
shap.summary_plot(shap_values, X_lagged, feature_names=feature_names)



# evaluate with SHAP