<a href="https://colab.research.google.com/github/comparativechrono/Rephasing-of-Seasonal-Birth-Rates-in-the-United-Kingdom-/blob/main/Figures/Figure_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Time series analysis

Install required packages

In [1]:
# Install only the packages that are not already available, so that re-running
# this cell does not download and rebuild them again.
required_packages <- c("wesanderson", "ggrepel")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



## Read in data for analysis

In [2]:
# Load both time-series tables directly from the GitHub repository.
# `data` holds the 1953-74 file and `data2` the 1976-2014 file; each CSV has a
# header row. Later cells read the `light`, `temp` and `BR` columns.
data <- read.csv(
  file = "https://raw.githubusercontent.com/comparativechrono/Rephasing-of-Seasonal-Birth-Rates-in-the-United-Kingdom-/main/time-series/1953-74.csv",
  header = TRUE
)
data2 <- read.csv(
  file = "https://raw.githubusercontent.com/comparativechrono/Rephasing-of-Seasonal-Birth-Rates-in-the-United-Kingdom-/main/time-series/1976-2014.csv",
  header = TRUE
)


## Cross correlation analysis

In [3]:
library(ggplot2)
library(wesanderson)
library(ggrepel)

# Cross-correlation significance bound, 1 / sqrt(n), computed separately for
# each data set from its number of rows.
# NOTE(review): stats::plot.acf draws its default 95% band at
# qnorm(0.975) / sqrt(n) (about 1.96 / sqrt(n)); confirm that the narrower
# 1 / sqrt(n) band used here is intended.
n1 <- nrow(x = data)
n2 <- nrow(x = data2)
sig_level1 <- 1 / sqrt(n1)
sig_level2 <- 1 / sqrt(n2)

# Cross-correlate one explanatory column with the birth-rate column and return
# the non-positive-lag half of the result as a data frame.
#
# Arguments:
#   data          Data frame holding both series.
#   variable_name Name of the column passed to ccf() as `x`.
#   BR_name       Name of the column passed to ccf() as `y`.
#   lag_max       Largest lag requested from ccf() (default 24).
#   peak_window   Two lags bounding the range searched for the peak
#                 correlation (default c(-24, -6)).
#
# Returns a data frame with one row per lag from -lag_max (or fewer, see
# below) to 0 and the columns:
#   lag, acf   the lag and its cross-correlation,
#   Variable   `variable_name`, repeated on every row,
#   max_ccf    the largest acf inside `peak_window`, repeated on every row,
#   max_lag    the lag at which `max_ccf` occurs, repeated on every row.
# max_ccf and max_lag are NA when no returned lag falls inside `peak_window`.
calculate_ccf <- function(data, variable_name, BR_name,
                          lag_max = 24, peak_window = c(-24, -6)) {
  ccf_values <- ccf(data[[variable_name]], data[[BR_name]],
                    lag.max = lag_max, plot = FALSE)
  acf_values <- as.vector(ccf_values$acf)

  # ccf() silently caps lag.max at n - 1 for short series, so the number of
  # lags is derived from what was actually returned (2 * n_lags + 1 values,
  # ordered from -n_lags to +n_lags) instead of being assumed to be 24.
  n_lags <- (length(acf_values) - 1L) %/% 2L
  ccf_data <- data.frame(
    lag = (-n_lags):0L,
    acf = acf_values[seq_len(n_lags + 1L)],
    Variable = variable_name
  )

  # Locate the peak correlation inside the search window only.
  in_window <- ccf_data$lag >= min(peak_window) &
    ccf_data$lag <= max(peak_window)
  if (any(in_window)) {
    window_acf <- ccf_data$acf[in_window]
    max_ccf <- max(window_acf)
    max_lag <- ccf_data$lag[in_window][which.max(window_acf)]
  } else {
    # Nothing to search: report missing values rather than failing on a
    # zero-length column assignment.
    max_ccf <- NA_real_
    max_lag <- NA_integer_
  }

  ccf_data$max_ccf <- max_ccf
  ccf_data$max_lag <- max_lag
  ccf_data
}

# Step 1: Cross-correlate light and temperature with the birth rate (BR) in
# each of the two data sets
ccf_data1_light <- calculate_ccf(data, variable_name = "light", BR_name = "BR")
ccf_data1_temp <- calculate_ccf(data, variable_name = "temp", BR_name = "BR")
ccf_data2_light <- calculate_ccf(data2, variable_name = "light", BR_name = "BR")
ccf_data2_temp <- calculate_ccf(data2, variable_name = "temp", BR_name = "BR")

# Step 2: Label every table with the period it covers, then stack the four
# tables into one long data frame for plotting
ccf_data1_light[["data_set"]] <- "1953 - 1974"
ccf_data1_temp[["data_set"]] <- "1953 - 1974"
ccf_data2_light[["data_set"]] <- "1976 - 2014"
ccf_data2_temp[["data_set"]] <- "1976 - 2014"

ccf_data <- do.call(
  rbind,
  list(ccf_data1_light, ccf_data1_temp, ccf_data2_light, ccf_data2_temp)
)

# Attach the significance threshold that belongs to each row's data set
ccf_data$sig_level <- unname(
  c("1953 - 1974" = sig_level1, "1976 - 2014" = sig_level2)[ccf_data$data_set]
)

# Size of the inline figure in the notebook (same 8 x 5 as the saved file)
options(repr.plot.width = 8, repr.plot.height = 5)

# Step 3: Create the plot
# One panel per data set; light and temp are drawn as separate coloured lines.
# Layers are added in drawing order: reference lines first, then the curves,
# then the highlighted peak of each curve, then the significance band.
my_plot <- ggplot(ccf_data, aes(x = lag, y = acf, color = Variable)) +
  # Zero line and vertical guides at lags -9 and -18
  geom_hline(
    yintercept = 0,
    linetype = "dotted", color = "grey", linewidth = 1
  ) +
  geom_vline(
    xintercept = c(-9, -18),
    linetype = "dotted", color = "grey", linewidth = 1
  ) +
  # Cross-correlation curves
  geom_line(linewidth = 1) +
  geom_point() +
  # Enlarge and label the row where each curve reaches its peak (max_lag)
  geom_point(
    data = subset(ccf_data, lag == max_lag),
    aes(x = lag, y = acf),
    size = 2
  ) +
  geom_text_repel(
    data = subset(ccf_data, lag == max_lag),
    aes(x = lag, y = acf, label = round(max_ccf, 3)),
    vjust = -1,
    size = 5.5,
    family = "Arial",
    nudge_y = 0.05,
    show.legend = FALSE
  ) +
  # Lower and upper significance thresholds (per data set, hence per panel)
  geom_hline(aes(yintercept = -sig_level), linetype = "dashed", color = "blue") +
  geom_hline(aes(yintercept = sig_level), linetype = "dashed", color = "blue") +
  # Panels, axes and colours
  facet_wrap(vars(data_set), ncol = 2) +
  scale_x_continuous(breaks = c(0, -9, -18), labels = c("0", "-9", "-18")) +
  scale_color_manual(values = wesanderson::wes_palette("Zissou1")[c(4, 1)]) +
  xlab("Lag (months)") +
  ylab("Cross-correlation") +
  # Base theme first, then the overrides below
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1),
    plot.background = element_rect(fill = "white"),
    plot.title = element_text(hjust = 0.5, size = 14, family = "Arial"),
    plot.title.position = "plot",
    strip.text = element_text(color = "black", size = 14, family = "Arial"),
    axis.line = element_line(linewidth = 0),
    axis.ticks = element_line(linewidth = 1),
    axis.text = element_text(color = "black", size = 14, family = "Arial"),
    axis.title = element_text(color = "black", size = 14, family = "Arial"),
    legend.title = element_text(family = "Arial", size = 14),
    legend.text = element_text(family = "Arial", size = 12)
  )

# Write the figure to disk
ggsave(
  filename = "my_plot.png",
  plot = my_plot,
  width = 8,
  height = 5,
  dpi = 300
)



## Linear modelling

In [4]:
library(MASS)

# Find the best lagged linear model of the birth rate for one data set.
#
# `data` must provide the columns `light`, `temp` and `BR`. The response BR is
# regressed on every non-empty subset of the 9-, 10- and 11-step lags of light,
# and separately on every non-empty subset of the same lags of temp (light and
# temp are never mixed in one model). The candidate with the highest adjusted
# R-squared wins; on a tie the earliest candidate is kept.
#
# Returns the summary.lm object of the winning model. Stops with an error if
# no candidate yields a usable adjusted R-squared.
find_best_model <- function(data) {
  # Convert the columns to time series objects so that they can be lagged
  light.ts <- ts(data$light)
  temp.ts <- ts(data$temp)
  BR.ts <- ts(data$BR)

  # A negative k delays the series, so at time t each lagged series holds the
  # value observed k steps earlier. stats::lag is named explicitly because
  # dplyr::lag masks it (with different semantics) when dplyr is attached.
  lightlag9 <- stats::lag(light.ts, -9)
  lightlag10 <- stats::lag(light.ts, -10)
  lightlag11 <- stats::lag(light.ts, -11)

  templag9 <- stats::lag(temp.ts, -9)
  templag10 <- stats::lag(temp.ts, -10)
  templag11 <- stats::lag(temp.ts, -11)

  # Keep only the time points at which every series has a value
  alldata <- ts.intersect(BR.ts, lightlag9, lightlag10, lightlag11,
                          templag9, templag10, templag11)

  # Candidate predictor sets: every non-empty subset of the three lags,
  # for light and for temp separately
  lag_combinations <- list(
    c("lightlag9"),
    c("lightlag10"),
    c("lightlag11"),
    c("lightlag9", "lightlag10"),
    c("lightlag9", "lightlag11"),
    c("lightlag10", "lightlag11"),
    c("lightlag9", "lightlag10", "lightlag11"),
    c("templag9"),
    c("templag10"),
    c("templag11"),
    c("templag9", "templag10"),
    c("templag9", "templag11"),
    c("templag10", "templag11"),
    c("templag9", "templag10", "templag11")
  )

  # Start from -Inf rather than 0: adjusted R-squared can be negative, and a
  # floor of 0 would leave best_model as NULL when every candidate fits poorly.
  best_model <- NULL
  highest_adj_r_squared <- -Inf

  for (lags in lag_combinations) {
    # Build the formula for the current predictor set
    formula <- as.formula(paste("BR.ts ~", paste(lags, collapse = " + ")))

    # Fit the model and read off its adjusted R-squared
    model <- lm(formula, data = alldata)
    adj_r_squared <- summary(model)$adj.r.squared

    # Strict ">" keeps the earliest candidate on ties; NA/NaN scores are
    # skipped instead of breaking the comparison
    if (!is.na(adj_r_squared) && adj_r_squared > highest_adj_r_squared) {
      best_model <- model
      highest_adj_r_squared <- adj_r_squared
    }
  }

  if (is.null(best_model)) {
    stop("No candidate model produced a usable adjusted R-squared.",
         call. = FALSE)
  }

  # Return the summary of the best model
  summary(best_model)
}

# Run the model search on each of the two data sets loaded earlier
best_model_data1 <- find_best_model(data = data)
best_model_data2 <- find_best_model(data = data2)

# Show each winning model's summary under its own heading
print("Summary of the best model for data1:")
print(best_model_data1)

print("Summary of the best model for data2:")
print(best_model_data2)


[1] "Summary of the best model for data1:"

Call:
lm(formula = formula, data = alldata)

Residuals:
       Min         1Q     Median         3Q        Max 
-0.0015175 -0.0004227  0.0001014  0.0004552  0.0010924 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.548e-03  8.946e-05  62.018  < 2e-16 ***
lightlag10  1.600e-06  9.880e-07   1.619 0.106649    
lightlag11  3.497e-06  9.866e-07   3.545 0.000469 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.0005788 on 250 degrees of freedom
Multiple R-squared:  0.1342,	Adjusted R-squared:  0.1272 
F-statistic: 19.37 on 2 and 250 DF,  p-value: 1.512e-08

[1] "Summary of the best model for data2:"

Call:
lm(formula = formula, data = alldata)

Residuals:
       Min         1Q     Median         3Q        Max 
-7.218e-04 -1.820e-04  1.633e-05  1.794e-04  5.683e-04 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.337e-03  3.305e-

If you are unfamiliar with linear models then please read on for a breakdown of the output from the code block above:

### Call
```
Call:
lm(formula = formula, data = alldata)
```
This section simply repeats the function call that was used to fit the model, with the formula and data arguments specified.

### Residuals
```
Residuals:
       Min         1Q     Median         3Q        Max
-0.0015175 -0.0004227  0.0001014  0.0004552  0.0010924
```
Here, the "Residuals" section provides a summary of the residuals (the differences between the observed and predicted values). The summary includes the minimum, 1st quartile (25th percentile), median (50th percentile), 3rd quartile (75th percentile), and maximum residual values. You ideally want the residuals to be symmetrically distributed around zero.

### Coefficients
```
Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.548e-03  8.946e-05  62.018  < 2e-16 ***
lightlag10  1.600e-06  9.880e-07   1.619 0.106649    
lightlag11  3.497e-06  9.866e-07   3.545 0.000469 ***
```
The "Coefficients" section contains the estimates of the parameters (intercept and the slopes), their standard errors, t-values, and p-values. Here are the details for each column:

1. **Estimate**: The estimated value of the coefficients.
   - **(Intercept)**: 5.548e-03 is the estimated value of the intercept.
   - **lightlag10**: 1.600e-06 is the estimated effect of the `lightlag10` variable on `BR.ts`.
   - **lightlag11**: 3.497e-06 is the estimated effect of the `lightlag11` variable on `BR.ts`.
   
2. **Std. Error**: The standard error of each estimated coefficient. It measures the uncertainty in the estimate; smaller values indicate a more precisely estimated coefficient.
   
3. **t value**: The t-value is the ratio of the estimate to its standard error. Larger absolute values of the t-statistic indicate higher significance.
   
4. **Pr(>|t|)**: The p-value associated with the t-statistic. It tests the null hypothesis that the coefficient is equal to zero (no effect). A low p-value (< 0.05) indicates you can reject the null hypothesis. Here:
   - For **lightlag10**, the p-value is 0.106649, which is not significant at a 0.05 level.
   - For **lightlag11**, the p-value is 0.000469, which is significant at the 0.05 level (the `***` marks p < 0.001).

### Additional Statistics
```
Residual standard error: 0.0005788 on 250 degrees of freedom
Multiple R-squared:  0.1342,	Adjusted R-squared:  0.1272
F-statistic: 19.37 on 2 and 250 DF,  p-value: 1.512e-08
```
- **Residual standard error**: The estimated standard deviation of the residuals (here 0.0005788, on 250 residual degrees of freedom), expressed in the same units as `BR.ts`; lower values indicate a closer fit.
   
- **Multiple R-squared**: This is a statistical measure of how close the data are to the fitted regression line; a value of 0.1342 means about 13.42% of the variability in `BR.ts` can be explained by the model.
   
- **Adjusted R-squared**: It is a corrected version of R-squared that has been adjusted for the number of predictors in the model; it's more appropriate for comparing different models.
   
- **F-statistic and p-value**: These are used to test the overall significance of the model. A low p-value here (1.512e-08) suggests that the model is statistically significant (at least one of the predictor variables has a non-zero coefficient).