In [71]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset from the link
url = 'https://raw.githubusercontent.com/pointOfive/stat130chat130/refs/heads/main/CP/CSCS_data_anon.csv'
df = pd.read_csv(url, low_memory=False)

# Filter relevant columns and clean data
filtered_df = df[['WELLNESS_life_satisfaction', 'DEMO_age', 'DEMO_gender', 'FINANCES_money_concerned']]
filtered_df = filtered_df.replace('Presented but no response', pd.NA)
filtered_df = filtered_df.replace(
    'Non-binary (including those identifying as Two Spirit, agender, genderfluid, genderqueer, or with another term)',
    'Non-binary'
)
filtered_df = filtered_df.dropna()

# One-hot encode categorical variables
filtered_df = pd.get_dummies(filtered_df, columns=['DEMO_gender', 'FINANCES_money_concerned'])

# Create the interaction term between 'DEMO_age' and 'FINANCES_money_concerned'
filtered_df['DEMO_age_FINANCES_interaction'] = filtered_df['DEMO_age'] * filtered_df['FINANCES_money_concerned_Somewhat']
filtered_df['DEMO_age_FINANCES_Very_Little'] = filtered_df['DEMO_age'] * filtered_df['FINANCES_money_concerned_Not at all']
filtered_df['DEMO_age_FINANCES_Very_Well'] = filtered_df['DEMO_age'] * filtered_df['FINANCES_money_concerned_Very Well']

# Convert all boolean columns to 0 and 1 (True becomes 1, False becomes 0)
# This will convert the interaction term into numerical columns
filtered_df = filtered_df.applymap(lambda x: 1 if x is True else (0 if x is False else x))

# Split data into training and testing sets
X = filtered_df.drop('WELLNESS_life_satisfaction', axis=1)
y = filtered_df['WELLNESS_life_satisfaction']

# Add a constant to the model (this is necessary for statsmodels to calculate the intercept)
X_with_const = sm.add_constant(X)

# Fit the model using statsmodels OLS regression
model = sm.OLS(y, X_with_const)
results = model.fit()

# Train the model using scikit-learn for the original RMSE and R² metrics (optional)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

# Train the linear regression model
sklearn_model = LinearRegression()
sklearn_model.fit(X_train, y_train)

# Make predictions on both training and test sets
y_train_pred = sklearn_model.predict(X_train)
y_test_pred = sklearn_model.predict(X_test)

# Evaluate the model on training and test sets
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)

rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)

print(f'Testing RMSE: {rmse_test}')
print(f'Testing R²: {r2_test}')

# Print the summary of the model, which includes p-values and other statistics
print(results.summary())

Testing RMSE: 2.210255538889815
Testing R²: 0.09115617757965822
                                OLS Regression Results                                
Dep. Variable:     WELLNESS_life_satisfaction   R-squared:                       0.078
Model:                                    OLS   Adj. R-squared:                  0.073
Method:                         Least Squares   F-statistic:                     15.78
Date:                        Thu, 28 Nov 2024   Prob (F-statistic):           1.55e-27
Time:                                01:47:44   Log-Likelihood:                -4208.8
No. Observations:                        1887   AIC:                             8440.
Df Residuals:                            1876   BIC:                             8501.
Df Model:                                  10                                         
Covariance Type:                    nonrobust                                         
                                           coef    std err        

  filtered_df = filtered_df.applymap(lambda x: 1 if x is True else (0 if x is False else x))


I made a multiple variable linear regression model to model WELLNESS_life_satisfaction based on variables 'DEMO_age', 'DEMO_gender', 'FINANCES_money_concerned'. I also used an OLS regression model to find P values for each independent variable.