<a href="https://colab.research.google.com/github/francji1/01NAEX/blob/main/code/01NAEX_Lecture11_Longitudinal_data_under_construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#

http://www.sortie-nd.org/lme/R%20Tutorials/Pinheiro%20-%20Mixed%20Models%20in%20S.pdf

https://www.crumplab.com/psyc7709_2019/book/docs/a-tutorial-for-using-the-lme-function-from-the-nlme-package-.html

In [None]:
! pip install rpy2

In [None]:
import rpy2
import numpy as np
import pandas as pd

# Seed NumPy's global generator so every draw below is reproducible
np.random.seed(42)

# Study design: subjects, counties and repeated measurements per subject
n_subjects = 100
n_counties = 10
n_times = 5

# Fixed-effect slopes
beta_gender = 1.5
beta_age = -0.05

# Random-effect standard deviations and the intercept/slope correlation
sigma_subject_intercept = 1.0
sigma_subject_slope = 0.5
rho_subject = 0.3  # Correlation between intercept and slope
sigma_county = 0.7

# AR(1) residual process: autocorrelation and innovation standard deviation
phi = 0.6
sigma_epsilon = 1.0

# Subject-level covariates (drawn in the order gender, age, county)
subject_ids = np.arange(n_subjects)
genders = np.random.binomial(1, 0.5, size=n_subjects)  # 0 or 1
ages = np.random.normal(50, 10, size=n_subjects)  # Mean age 50, SD 10
counties = np.random.choice(np.arange(n_counties), size=n_subjects)

# Long format: each subject-level value is repeated once per time point
data = pd.DataFrame({
    column_name: np.repeat(subject_values, n_times)
    for column_name, subject_values in (
        ('subject_id', subject_ids),
        ('gender', genders),
        ('age', ages),
        ('county', counties),
    )
})

# Time index 0..n_times-1, restarted for every subject
data['time'] = np.tile(np.arange(n_times), n_subjects)

# Correlated random intercept and slope per subject
from scipy.stats import multivariate_normal

cov_intercept_slope = rho_subject * sigma_subject_intercept * sigma_subject_slope
cov_subject = np.array([
    [sigma_subject_intercept**2, cov_intercept_slope],
    [cov_intercept_slope, sigma_subject_slope**2],
])

subject_random_effects = multivariate_normal.rvs(mean=[0, 0], cov=cov_subject, size=n_subjects)

# Column 0 holds the intercept deviations, column 1 the slope deviations
data['subject_intercept'] = np.repeat(subject_random_effects[:, 0], n_times)
data['subject_slope'] = np.repeat(subject_random_effects[:, 1], n_times)

# County-level random intercepts, looked up through each row's county index
county_random_effects = np.random.normal(0, sigma_county, size=n_counties)
county_lookup = dict(zip(np.arange(n_counties), county_random_effects))
data['county_intercept'] = data['county'].map(county_lookup)
# Simulate AR(1) residuals for each subject
def simulate_ar1_errors(phi, sigma, size):
    """Simulate one stationary AR(1) error series with NumPy's global RNG.

    The first value is drawn from N(0, sigma^2 / (1 - phi^2)); every later
    value is ``phi * previous + N(0, sigma^2)``.

    Parameters
    ----------
    phi : float
        Autocorrelation coefficient; must satisfy ``|phi| < 1``.
    sigma : float
        Standard deviation of the innovations.
    size : int
        Number of time points to generate; ``0`` yields an empty array.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length ``size``.

    Raises
    ------
    ValueError
        If ``|phi| >= 1``, where the scale of the first draw is undefined
        (division by zero or square root of a negative number).
    """
    if abs(phi) >= 1:
        raise ValueError("phi must satisfy |phi| < 1 for a stationary AR(1) process")

    epsilons = np.zeros(size)
    if size == 0:
        # Nothing to fill; indexing element 0 below would raise IndexError.
        return epsilons

    # Start from the stationary distribution so no burn-in is required.
    epsilons[0] = np.random.normal(0, sigma / np.sqrt(1 - phi**2))
    for t in range(1, size):
        epsilons[t] = phi * epsilons[t - 1] + np.random.normal(0, sigma)
    return epsilons

# Residual column, filled subject by subject with an independent AR(1) series
data['epsilon'] = 0.0

for subject in subject_ids:
    subject_rows = data['subject_id'].eq(subject)
    data.loc[subject_rows, 'epsilon'] = simulate_ar1_errors(
        phi, sigma_epsilon, subject_rows.sum()
    )

# Response: fixed effects + subject intercept/slope + county intercept + residual.
# The terms are accumulated left to right in the listed order.
response_terms = [
    beta_gender * data['gender'],
    beta_age * data['age'],
    data['subject_intercept'],
    data['subject_slope'] * data['time'],
    data['county_intercept'],
    data['epsilon'],
]
response = response_terms[0]
for term in response_terms[1:]:
    response = response + term
data['y'] = response

# Center age for better model convergence
data['age_centered'] = data['age'].sub(data['age'].mean())


In [None]:
# Preview the first 10 rows of the simulated long-format data
data.head(10)

In [None]:
# Summary statistics of the simulated columns
data.describe()

In [None]:
# Store the grouping/categorical columns as strings
for categorical_column in ('subject_id', 'county', 'gender'):
    data[categorical_column] = data[categorical_column].astype(str)


In [None]:
# Load  packages
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects import Formula
from rpy2.robjects.packages import importr
from rpy2.robjects import DataFrame as RDataFrame, StrVector, FloatVector, IntVector
from rpy2.robjects.vectors import ListVector

# Activate the automatic conversion between R and pandas DataFrames
# NOTE(review): this switches the conversion on globally for all later rpy2 calls;
# a later cell calls pandas2ri.deactivate() before fitting on an R-native data
# frame -- confirm the intended scope of the conversion.
pandas2ri.activate()


In [None]:
# Define a function to convert a pandas DataFrame to an R-compatible DataFrame
def pandas_to_r(data):
    """Build an rpy2 data frame from a pandas DataFrame, one column at a time.

    Columns with ``object`` or ``category`` dtype become a ``StrVector`` (values
    cast to ``str``), integer columns become an ``IntVector`` and every other
    column is passed to ``FloatVector``.
    """
    def to_r_vector(series):
        # The string/category test must run first: the integer test below is
        # only reached for the remaining dtypes.
        if series.dtype == 'object' or series.dtype.name == 'category':
            return StrVector(series.astype(str))
        if np.issubdtype(series.dtype, np.integer):
            return IntVector(series)
        return FloatVector(series)

    r_columns = {name: to_r_vector(data[name]) for name in data.columns}
    return RDataFrame(ListVector(r_columns))

# Convert the DataFrame to an R-compatible data frame
r_data = pandas_to_r(data)

# Bind the same object under the name `r_data` in R's global environment:
# later cells evaluate R code strings such as "str(r_data)", and R resolves
# that name on its own side, not in the Python namespace.
robjects.globalenv['r_data'] = r_data


In [None]:
#r_data

In [None]:
from rpy2.robjects.packages import importr
from rpy2.robjects import r

# Import base and nlme packages
# (importr returns a Python handle through which the R package's functions are called)
base = importr('base')
nlme = importr('nlme')

In [None]:
# Fixed-effects part: response y explained by gender and centered age
model_formula = Formula('y ~ gender + age_centered')
# Random-effects part: intercept and time slope varying by subject
random_effects = Formula('~ time | subject_id')
# AR(1) within-subject correlation, ordered by time
ar1_form = Formula('~ time | subject_id')
correlation = nlme.corAR1(form=ar1_form)


In [None]:
# Show the structure of r_data as R sees it. The code string is evaluated by R,
# so the name `r_data` has to exist in R's global environment.
print(r("str(r_data)"))

In [None]:
# Column-wise summary of r_data, computed on the R side
print(r("summary(r_data)"))

In [None]:
# Cast `time` to numeric inside R, then report the column classes.
# NOTE(review): the assignment runs on the object named r_data in R's global
# environment; confirm it is the same object later passed to lme() from Python.
r("r_data$time <- as.numeric(r_data$time)")
print(r("class(r_data$time)"))       # Should return "numeric" or "integer"
print(r("class(r_data$subject_id)"))  # Expected "factor"; a StrVector column may report "character" instead -- TODO confirm

In [None]:
# Baseline fit on the simulated data: random intercept and time slope per
# subject, no residual correlation structure.
# The formulas come from the cell that defines `model_formula` and
# `random_effects`; the names `fixed_formula` / `random_formula` are only
# created much later (Orthodont section), so using them here raised NameError
# on a fresh kernel.
lme_model = nlme.lme(
    fixed=model_formula,
    random=random_effects,
    data=r_data,
    method='REML'
)

In [None]:
# Get the summary
summary = base.summary(lme_model)
print(summary)

# Extract fixed effects (coefficient table stored by summary.lme as `tTable`)
fixed_effects = summary.rx2('tTable')
print('Fixed Effects:')
print(fixed_effects)

# Extract random effects standard deviations.
# The summary object has no 'stdDev' component; nlme's VarCorr() is the
# accessor that reports variances, standard deviations and correlations of the
# random effects together with the residual.
random_effects_std = nlme.VarCorr(lme_model)
print('Random Effects Standard Deviations:')
print(random_effects_std)


In [None]:
stats = importr('stats')

# Define the model formula
model_formula = robjects.Formula('y ~ gender + age_centered')
# Define random effects formula
random_effects = robjects.Formula('~ time | subject_id')
# Define correlation structure: AR(1) within subject, ordered by time
correlation_formula = nlme.corAR1(form=Formula('~ time | subject_id'))


# Fit the model.
# Pass the objects defined just above: `fixed_formula` / `random_formula` do not
# exist yet at this point of the notebook (NameError on a fresh kernel), and
# once the Orthodont section has run they would refer to a different model.
lme_model = nlme.lme(
    fixed = model_formula,
    random = random_effects,
    correlation = correlation_formula,
    data = r_data,
    method = "REML"
)

# https://stackoverflow.com/questions/74596213/explanation-of-random-term-syntax-in-nlmelme

In [None]:
# Get the summary
summary = base.summary(lme_model)
print(summary)

# Extract fixed effects (coefficient table stored by summary.lme as `tTable`)
fixed_effects = summary.rx2('tTable')
print('Fixed Effects:')
print(fixed_effects)

# Extract random effects standard deviations.
# The summary object has no 'stdDev' component; VarCorr() is the nlme accessor
# for the random-effects variances / standard deviations.
random_effects_std = nlme.VarCorr(lme_model)
print('Random Effects Standard Deviations:')
print(random_effects_std)

# Extract autocorrelation parameter.
# The fitted corStruct is a parameter vector, not a list, so it has no 'coef'
# element to index. coef(..., unconstrained=FALSE) returns Phi on its natural
# (-1, 1) scale instead of the internal unconstrained parameterization.
correlation_structure = lme_model.rx2('modelStruct').rx2('corStruct')
phi_estimate = stats.coef(correlation_structure, unconstrained=False)[0]
print('Estimated AR(1) Autocorrelation Coefficient (Phi):', phi_estimate)

In [None]:
# Load the rpy2 IPython extension, which provides the %%R cell magic used below
%load_ext rpy2.ipython

In [None]:
%%R
# Attach nlme and lattice, then load the Orthodont data set
library(nlme)
library(lattice)
data(Orthodont)

# Fit the linear mixed-effects model with AR(1) correlation structure:
# fixed effects age * Sex, random intercept and age slope per Subject
m_lme <- lme(distance ~ age * Sex,
             random = ~1 + age | Subject,
             correlation = corAR1(form = ~1 | Subject),
             data = Orthodont)

# Display the summary of the model
summary(m_lme)


In [None]:
%%R
# Plot the fitted values versus residuals
# (Pearson residuals, one panel per Sex, horizontal reference line at 0)
plot(m_lme, resid(., type="p") ~ fitted(.) | Sex, abline = 0)

In [None]:
# Import the R packages
nlme = importr('nlme')
lattice = importr('lattice')

# Load the Orthodont data from R into Python
data_env = robjects.globalenv
robjects.r('data(Orthodont)')  # runs R's data() so that Orthodont can be looked up in the global environment
orthodont = data_env['Orthodont']

# Convert the R data frame to a pandas DataFrame
# NOTE(review): pandas2ri.activate() was called earlier, so `orthodont` may
# already be a pandas object here -- confirm this explicit conversion is needed.
orthodont_df = pandas2ri.rpy2py(orthodont)

# Display the first few rows of the data
print(orthodont_df.head())


In [None]:
# Import the R packages
nlme = importr('nlme')
lattice = importr('lattice')

# Load the Orthodont data from R into Python
robjects.r('data(Orthodont)')
# Evaluating the bare name returns the data set (converted according to the
# conversion rules active at this point)
orthodont_df = robjects.r('Orthodont')

# Use localconverter to convert the R data frame to pandas DataFrame
# NOTE(review): the disabled lines below refer to `orthodont_r`, which is not
# defined anywhere in the visible notebook.
#with localconverter(robjects.default_converter + pandas2ri.converter):
#    orthodont_df = robjects.conversion.rpy2py(orthodont_r)


# Display the first few rows of the data
print(orthodont_df.head())


In [None]:
# Define the model formula and random effects
model_formula = Formula('distance ~ age * Sex')
random_effects = Formula('~1 + age | Subject')
correlation_structure = nlme.corAR1(form=Formula('~1 | Subject'))

# Load Orthodont on the R side and preview its rows.
# (head(data(Orthodont)) only printed the data set's name, because data()
# returns that name as a string rather than the data frame.)
r('data(Orthodont)')
r('print(head(Orthodont))')

# Bind the data set to a Python name; `Orthodont` was previously used below
# without ever being defined in Python (NameError on a fresh kernel).
Orthodont = robjects.globalenv['Orthodont']

# Fit the linear mixed-effects model
m_lme = nlme.lme(fixed=model_formula,
                 data=Orthodont,
                 random=random_effects,
                 correlation=correlation_structure,
                 method='REML')

# Get the summary of the model
summary = robjects.r.summary(m_lme)
print(summary)


In [None]:
import rpy2.robjects as robjects
from rpy2.robjects import Formula
from rpy2.robjects.packages import importr

# Import the R packages
nlme = importr('nlme')
lattice = importr('lattice')

# Load the Orthodont data into R's global environment
robjects.r('data(Orthodont)')

# Define the model formula and random effects:
# fixed effects age * Sex, random intercept and age slope per Subject,
# AR(1) correlation within Subject
model_formula = Formula('distance ~ age * Sex')
random_effects = Formula('~1 + age | Subject')
correlation_structure = nlme.corAR1(form=Formula('~1 | Subject'))

# Access the Orthodont data from R's global environment
# NOTE(review): with pandas conversion still active this lookup may hand back a
# pandas object rather than the R data frame -- confirm.
Orthodont = robjects.globalenv['Orthodont']

# Fit the linear mixed-effects model using the data directly from R
m_lme = nlme.lme(fixed=model_formula,
                 data=Orthodont,
                 random=random_effects,
                 correlation=correlation_structure,
                 method='REML')

# Get the summary of the model
summary = robjects.r.summary(m_lme)
print(summary)


In [None]:
# Show the structure of the Orthodont data set as R sees it
print(robjects.r('str(Orthodont)'))


In [None]:
from rpy2.robjects import pandas2ri
# Deactivate automatic conversion (rpy2 works with pandas df, but Orthodont is R df)
# so that objects fetched from R below stay R objects.
pandas2ri.deactivate()

# Load the Orthodont data into R's global environment
robjects.r('data(Orthodont)')

# Define the model components:
# fixed effects age * Sex, random intercept and age slope per Subject,
# AR(1) correlation within Subject
fixed_formula = Formula('distance ~ age * Sex')
random_formula = Formula('~ 1 + age | Subject')
correlation_structure = nlme.corAR1(form=Formula('~ 1 | Subject'))

# Access the Orthodont data
Orthodont = robjects.globalenv['Orthodont']

# Fit the linear mixed-effects model
m_lme = nlme.lme(
    fixed=fixed_formula,
    random=random_formula,
    correlation=correlation_structure,
    data=Orthodont,
    method='REML'
)


In [None]:
# Get the summary of the model by calling R's summary() on the fitted object
summary = robjects.r.summary(m_lme)
print(summary)

In [None]:
%%R
# Reference fit run directly in R (same model as the earlier %%R cell), for
# comparison with the rpy2 result above
library(nlme)
library(lattice)
data(Orthodont)

# Fit the linear mixed-effects model with AR(1) correlation structure:
# fixed effects age * Sex, random intercept and age slope per Subject
m_lme <- lme(distance ~ age * Sex,
             random = ~1 + age | Subject,
             correlation = corAR1(form = ~1 | Subject),
             data = Orthodont)

# Display the summary of the model
summary(m_lme)



### Limitations of Python Packages
* statsmodels MixedLM: Supports mixed-effects models with random intercepts and slopes. Does not natively support specifying autocorrelation structures (like AR(1)) in the residuals.

* statsmodels GLSAR: Supports modeling autocorrelation in residuals.
Does not support random effects.

* nlmixed: An emerging package that supports such models but is still under development.

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Orthodont data set, read from the Rdatasets CSV mirror
data_url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/nlme/Orthodont.csv'

# Read the file and discard the exported row-name column in one step
data = pd.read_csv(data_url).drop(columns=['rownames'])


In [None]:
# Display the loaded Orthodont data
data

In [None]:
# Treat the grouping variable and Sex as categorical
for factor_column in ('Subject', 'Sex'):
    data[factor_column] = data[factor_column].astype('category')

# Mixed model: fixed effects age * Sex, with a random intercept and a random
# age slope for every Subject (re_formula="~ age").
# This model has no AR(1) structure for the residuals.
mixed_options = dict(groups=data["Subject"], re_formula="~ age")
model = smf.mixedlm("distance ~ age * Sex", data, **mixed_options)

# Estimate by REML
result = model.fit(reml=True)

# Print the summary
print(result.summary())


In [None]:
import statsmodels.api as sm
import numpy as np

# Sort data by Subject and age
data_sorted = data.sort_values(['Subject', 'age'])

# Create dummy variables for Sex.
# dtype=float: recent pandas versions emit boolean dummies by default; mixed
# with the numeric columns they turn the design matrix into object dtype, which
# statsmodels refuses ("Pandas data cast to numpy dtype of object").
data_sorted = pd.get_dummies(data_sorted, columns=['Sex'], drop_first=True, dtype=float)

# Create the interaction term
data_sorted['age_Sex_Male'] = data_sorted['age'] * data_sorted['Sex_Male']

# Define the design matrix
exog = sm.add_constant(data_sorted[['age', 'Sex_Male', 'age_Sex_Male']])
endog = data_sorted['distance']

# Initialize the GLSAR model (integer rho = order of the AR process)
# NOTE(review): GLSAR receives the stacked rows as one series, so the AR(1)
# filter also links the last row of a subject to the first row of the next --
# confirm this is acceptable for the comparison.
model_glsar = sm.GLSAR(endog, exog, rho=1)

# Iteratively estimate rho: fit, re-estimate rho from the residuals with
# Yule-Walker, rebuild the model with the new rho
for i in range(10):
    results_glsar = model_glsar.fit()
    rho, sigma = sm.regression.linear_model.yule_walker(
        results_glsar.resid, order=1, method='mle')
    model_glsar = sm.GLSAR(endog, exog, rho=rho)

# Fit the model with the final rho
results_glsar = model_glsar.fit()

# Print the summary
print(results_glsar.summary())


In [None]:
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod import families
from statsmodels.genmod.cov_struct import Autoregressive

# Make sure both factors are categorical, then derive their integer codes
for factor_column in ('Sex', 'Subject'):
    data[factor_column] = data[factor_column].astype('category')
for factor_column in ('Sex', 'Subject'):
    data[f'{factor_column}_code'] = data[factor_column].cat.codes

# Interaction between age and the Sex code, plus an explicit intercept column
data['age_Sex'] = data['age'] * data['Sex_code']
data['Intercept'] = 1.0

# Response and design matrix
design_columns = ['Intercept', 'age', 'Sex_code', 'age_Sex']
endog = data['distance']
exog = data[design_columns]

# Clusters are subjects; age plays the role of the time variable
groups = data['Subject_code']
time = data['age']

# AR(1) working correlation within each cluster
cov_struct = Autoregressive()

# Gaussian GEE with the AR(1) working correlation
model = GEE(
    endog,
    exog,
    groups=groups,
    time=time,
    cov_struct=cov_struct,
    family=families.Gaussian(),
)

# Fit the model
result = model.fit()

# Print the summary
print(result.summary())

In [None]:
!pip install nlmixed


In [None]:
from nlmixed import NLMixed

# NOTE(review): the NLMixed interface used in this cell (formula syntax,
# init_params keys, `group` argument) could not be checked against published
# documentation -- verify before relying on this cell.

# Define the model formula
formula = 'distance ~ b0 + b1 * age + b2 * Sex_Male + b3 * age_Sex_Male + u0[Subject] + u1[Subject] * age'

# Prepare data
data_sorted['Sex_Male'] = data_sorted['Sex_Male'].astype(int)
# Subject labels are strings such as "M01", so a direct astype(int) raises
# ValueError; encode them as integer category codes instead.
data_sorted['Subject'] = data_sorted['Subject'].astype('category').cat.codes.astype(int)

# Define parameters and initial values
init_params = {'b0': 0, 'b1': 0, 'b2': 0, 'b3': 0, 'Var(u0)': 1, 'Var(u1)': 1, 'Cov(u0,u1)': 0}

# Fit the model
model = NLMixed(formula, data_sorted, init_params, group='Subject')

result = model.fit()

# Print the summary
print(result.summary())


In [None]:
import pandas as pd
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr

# Activate pandas conversion
# (an earlier cell called pandas2ri.deactivate(), so it is switched back on here)
pandas2ri.activate()

# Load the nlme package in R
nlme = importr('nlme')

# Load the Orthodont dataset
r('data(Orthodont)')
# Evaluate the bare name to fetch the data set into Python
df = r('Orthodont')

df


In [None]:
# Rename the columns to the names used below and fix their dtypes:
# IDs as strings, time and measurement as floats
df = (
    df.rename(columns={"distance": "measurement", "age": "time", "Subject": "id"})
      .astype({'id': str, 'time': float, 'measurement': float})
)


In [None]:
pip install leaspy


In [None]:
from leaspy.io.data import LongitudinalDataset
from leaspy.models import LinearModel
from leaspy.inference import MaximumLikelihoodEstimator

# NOTE(review): confirm that `LongitudinalDataset`, `LinearModel` and
# `MaximumLikelihoodEstimator` (and the call signatures used below) exist in the
# installed leaspy version -- TODO verify against the leaspy documentation.

# Create a longitudinal dataset from the renamed frame
# (columns "id", "time" and "measurement")
dataset = LongitudinalDataset.from_dataframe(
    dataframe=df,
    id_column="id",
    time_column="time",
    measurement_columns=["measurement"]
)

# Define a Leaspy model (e.g., Linear growth model)
model = LinearModel(dataset.dimension)

# Initialize an estimator
estimator = MaximumLikelihoodEstimator(model, dataset)

# Fit the model
fitted_model = estimator.estimate()


In [None]:
# Predict trajectories for all individuals
# NOTE(review): the code below assumes `predictions` can be indexed by subject
# id and exposes 'time' and 'prediction' entries -- TODO confirm.
predictions = fitted_model.predict(dataset)

# Visualize one individual’s trajectory
import matplotlib.pyplot as plt

individual_id = "M01"  # Replace with an actual ID from the dataset
individual_data = df[df['id'] == individual_id]

# Observed points versus the predicted curve for the selected individual
plt.scatter(individual_data['time'], individual_data['measurement'], label="Observed")
plt.plot(predictions[individual_id]['time'], predictions[individual_id]['prediction'], label="Predicted")
plt.xlabel("Time")
plt.ylabel("Measurement")
plt.legend()
plt.show()
