In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [126]:
# https://www.statsmodels.org/stable/index.html
import statsmodels.api as sm

In [127]:
import warnings
warnings.filterwarnings("ignore")

In [128]:
from functools import partial

In [129]:
from dotenv import load_dotenv
from pathlib import Path
import os
import sys

# Load environment variables
env_path = Path("../../.env-live")
load_dotenv(dotenv_path=env_path, override=True)

# Print all relevant environment variables for debugging
print("Environment Variables Configuration:")
print(f"S3 Region: {os.getenv('JRJ_MODEL_REGISTRY_S3_REGION')}")
print(f"S3 Endpoint: {os.getenv('JRJ_MODEL_REGISTRY_S3_ENDPOINT')}")
print(f"S3 Bucket: {os.getenv('JRJ_MODEL_REGISTRY_S3_BUCKET_NAME')}")
print(f"MongoDB Connection: {os.getenv('JRJ_MODEL_REGISTRY_MONGODB_CONNECTION_STRING')}")
print(f"MongoDB Database: {os.getenv('JRJ_MODEL_REGISTRY_MONGODB_DATABASE')}")
print(f"MongoDB Collection: {os.getenv('JRJ_MODEL_REGISTRY_MONGODB_COLLECTION')}")

Environment Variables Configuration:
S3 Region: ca-central-1
S3 Endpoint: s3.ca-central-1.wasabisys.com/273-g1
S3 Bucket: 273-g1
MongoDB Connection: mongodb://localhost:27017
MongoDB Database: model_registry
MongoDB Collection: models


In [130]:
# Verify all required variables exist
required_vars = [
    "JRJ_MODEL_REGISTRY_S3_REGION",
    "JRJ_MODEL_REGISTRY_S3_ENDPOINT",
    "JRJ_MODEL_REGISTRY_S3_BUCKET_NAME",
    "JRJ_MODEL_REGISTRY_MONGODB_CONNECTION_STRING",
    "JRJ_MODEL_REGISTRY_MONGODB_DATABASE",
    "JRJ_MODEL_REGISTRY_MONGODB_COLLECTION"
]

missing_vars = [var for var in required_vars if not os.getenv(var)]
if missing_vars:
    print(f"\n❌ MISSING ENVIRONMENT VARIABLES: {', '.join(missing_vars)}")
    
    # ===== 添加缺失的 MongoDB 配置 =====
    print("\n⚠️ Adding temporary MongoDB configuration for testing...")
    
    # 设置默认的本地 MongoDB 配置
    os.environ["JRJ_MODEL_REGISTRY_MONGODB_CONNECTION_STRING"] = "mongodb://localhost:27017"
    os.environ["JRJ_MODEL_REGISTRY_MONGODB_DATABASE"] = "model_registry"
    os.environ["JRJ_MODEL_REGISTRY_MONGODB_COLLECTION"] = "models"
    
    print("✅ Added default MongoDB configuration:")
    print(f"  Connection: {os.getenv('JRJ_MODEL_REGISTRY_MONGODB_CONNECTION_STRING')}")
    print(f"  Database: {os.getenv('JRJ_MODEL_REGISTRY_MONGODB_DATABASE')}")
    print(f"  Collection: {os.getenv('JRJ_MODEL_REGISTRY_MONGODB_COLLECTION')}")

In [131]:
# Clear module cache to force reload
if 'jrjModelRegistry.jrjModelRegistry' in sys.modules:
    del sys.modules['jrjModelRegistry.jrjModelRegistry']

from jrjModelRegistry.jrjModelRegistry import registerAJrjModel, jrjModelRegistryConfig

In [132]:
jrjModelRegistryConfig.update({
    # S3 Configuration
    "s3Region": os.getenv("JRJ_MODEL_REGISTRY_S3_REGION", "ca-central-1"),
    "s3Endpoint": os.getenv("JRJ_MODEL_REGISTRY_S3_ENDPOINT", ""),
    "s3KeyId": os.getenv("JRJ_MODEL_REGISTRY_S3_KEY_ID", ""),
    "s3KeySecret": os.getenv("JRJ_MODEL_REGISTRY_S3_KEY_SECRET", ""),
    "s3BucketName": os.getenv("JRJ_MODEL_REGISTRY_S3_BUCKET_NAME", ""),
    
    # MongoDB Configuration
    "mongodbConnectionString": os.getenv("JRJ_MODEL_REGISTRY_MONGODB_CONNECTION_STRING", ""),
    "mongodbDatabase": os.getenv("JRJ_MODEL_REGISTRY_MONGODB_DATABASE", ""),
    "mongodbCollection": os.getenv("JRJ_MODEL_REGISTRY_MONGODB_COLLECTION", "")
})

In [133]:
print("\nUpdated Package Configuration:")
print(f"S3 Region: {jrjModelRegistryConfig.get('s3Region')}")
print(f"S3 Endpoint: {jrjModelRegistryConfig.get('s3Endpoint')}")
print(f"S3 Bucket: {jrjModelRegistryConfig.get('s3BucketName')}")
print(f"MongoDB Database: {jrjModelRegistryConfig.get('mongodbDatabase')}")
print(f"MongoDB Collection: {jrjModelRegistryConfig.get('mongodbCollection')}")


Updated Package Configuration:
S3 Region: ca-central-1
S3 Endpoint: s3.ca-central-1.wasabisys.com/273-g1
S3 Bucket: 273-g1
MongoDB Database: model_registry
MongoDB Collection: models


In [134]:
salaryDf = pd.read_csv("https://www.dropbox.com/scl/fi/xwirjv3wflfl94qckcbqw/salary_cleaned.csv?rlkey=8w9zgs8psc6g775hb2b7uvv74&st=vj4q42nr&dl=1")
salaryDf.head()

Unnamed: 0,Salary,Age,StockOptionLevel,YearsAtCompany,EducationLevel,EnvironmentSatisfaction,TrainingOpportunitiesWithinYear,TrainingOpportunitiesTaken,SelfRating,ManagerRating
0,102059,30,1,10,5,3.0,3.0,0.0,3.0,3.0
1,102059,30,1,10,5,4.0,3.0,1.0,3.0,2.0
2,102059,30,1,10,5,5.0,3.0,0.0,5.0,5.0
3,102059,30,1,10,5,1.0,3.0,1.0,5.0,4.0
4,102059,30,1,10,5,3.0,1.0,0.0,4.0,3.0


In [135]:
salaryDf.shape

(6899, 10)

In [136]:
salaryDf.describe()

Unnamed: 0,Salary,Age,StockOptionLevel,YearsAtCompany,EnvironmentSatisfaction,TrainingOpportunitiesWithinYear,TrainingOpportunitiesTaken,SelfRating,ManagerRating
count,6899.0,6899.0,6899.0,6899.0,6899.0,6899.0,6899.0,6899.0,6899.0
mean,110898.374112,30.604146,0.725467,5.578055,3.876069,2.012611,1.016814,3.984491,3.460357
std,98427.862382,7.986542,0.839724,3.410087,0.92789,0.808937,0.937141,0.805114,0.95156
min,20387.0,18.0,0.0,0.0,1.0,1.0,0.0,3.0,2.0
25%,44646.0,25.0,0.0,3.0,3.0,1.0,0.0,3.0,3.0
50%,74458.0,28.0,1.0,6.0,4.0,2.0,1.0,4.0,3.0
75%,137219.5,36.0,1.0,9.0,5.0,3.0,2.0,5.0,4.0
max,547204.0,51.0,3.0,10.0,5.0,3.0,3.0,5.0,5.0


In [137]:
sm.add_constant(salaryDf[[
    "Salary", 
    "Age", 
    "StockOptionLevel", 
    "YearsAtCompany", 
    "EnvironmentSatisfaction", 
    "TrainingOpportunitiesWithinYear", 
    "TrainingOpportunitiesTaken", 
    "SelfRating", 
    "ManagerRating"
]])


Unnamed: 0,const,Salary,Age,StockOptionLevel,YearsAtCompany,EnvironmentSatisfaction,TrainingOpportunitiesWithinYear,TrainingOpportunitiesTaken,SelfRating,ManagerRating
0,1.0,102059,30,1,10,3.0,3.0,0.0,3.0,3.0
1,1.0,102059,30,1,10,4.0,3.0,1.0,3.0,2.0
2,1.0,102059,30,1,10,5.0,3.0,0.0,5.0,5.0
3,1.0,102059,30,1,10,1.0,3.0,1.0,5.0,4.0
4,1.0,102059,30,1,10,3.0,1.0,0.0,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...
6894,1.0,38508,20,0,0,4.0,2.0,1.0,4.0,3.0
6895,1.0,92995,27,3,0,4.0,2.0,1.0,4.0,3.0
6896,1.0,63375,21,0,0,4.0,2.0,1.0,4.0,3.0
6897,1.0,46521,21,0,0,4.0,2.0,1.0,4.0,3.0


In [138]:
X = salaryDf[[
    "Age", 
    "StockOptionLevel", 
    "YearsAtCompany", 
    "EnvironmentSatisfaction", 
    "TrainingOpportunitiesWithinYear", 
    "TrainingOpportunitiesTaken", 
    "SelfRating", 
    "ManagerRating"
]]

y = salaryDf["Salary"]

X_const = sm.add_constant(X)
salaryRegModel1Fit = sm.OLS(y, X_const).fit()

print(salaryRegModel1.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.187
Model:                            OLS   Adj. R-squared:                  0.186
Method:                 Least Squares   F-statistic:                     198.4
Date:                Thu, 26 Jun 2025   Prob (F-statistic):          3.03e-303
Time:                        14:02:13   Log-Likelihood:                -88392.
No. Observations:                6899   AIC:                         1.768e+05
Df Residuals:                    6890   BIC:                         1.769e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

In [139]:
salaryDf.head()

Unnamed: 0,Salary,Age,StockOptionLevel,YearsAtCompany,EducationLevel,EnvironmentSatisfaction,TrainingOpportunitiesWithinYear,TrainingOpportunitiesTaken,SelfRating,ManagerRating
0,102059,30,1,10,5,3.0,3.0,0.0,3.0,3.0
1,102059,30,1,10,5,4.0,3.0,1.0,3.0,2.0
2,102059,30,1,10,5,5.0,3.0,0.0,5.0,5.0
3,102059,30,1,10,5,1.0,3.0,1.0,5.0,4.0
4,102059,30,1,10,5,3.0,1.0,0.0,4.0,3.0


In [140]:
from functools import partial

# [YOUR MODEL DEFINITION - KEEP UNCHANGED]
salaryRegModel1SampleData = {
    "Age": [30, 30, 30],
    "StockOptionLevel": [1, 1, 1],  
    "YearsAtCompany": [5, 5, 5],
    'EnvironmentSatisfaction': [3.0, 5.0, 3.0],
    'TrainingOpportunitiesWithinYear': [3.0, 3.0, 1.0],
    'TrainingOpportunitiesTaken': [0.0, 0.0, 0.0],
    'SelfRating': [3.0, 5.0, 4.0],
    'ManagerRating': [3.0, 5.0, 3.0]
}

def salaryRegModel1Transformer(dataForTransfer=None):
    import pandas as pd
    import statsmodels.api as sm
    df = pd.DataFrame(dataForTransfer)
    df = sm.add_constant(df[[
        'Age',
        'StockOptionLevel',
        'YearsAtCompany',
        'EnvironmentSatisfaction',
        'TrainingOpportunitiesWithinYear',
        'TrainingOpportunitiesTaken',
        'SelfRating',
        'ManagerRating'
    ]])
    return df

def salaryRegModel1MainPredictor(self, transformedData):
    return self.predict(transformedData)

# Attach methods to model
salaryRegModel1Fit.mainPredictor = partial(salaryRegModel1MainPredictor, salaryRegModel1Fit)
salaryRegModel1Fit.transformer = salaryRegModel1Transformer
salaryRegModel1Fit.sampleInput = salaryRegModel1SampleData

In [141]:
salaryRegModel1Fit.version = "1.0.0"

# 8. Test MongoDB connection
print("\nTesting MongoDB connection...")
try:
    from pymongo import MongoClient
    from urllib.parse import quote_plus
    
    # Handle special characters in password
    conn_str = jrjModelRegistryConfig.get("mongodbConnectionString")
    if '@' in conn_str:
        user_pass, host_part = conn_str.split('@', 1)
        user_pass = quote_plus(user_pass)
        safe_conn_str = f"{user_pass}@{host_part}"
    else:
        safe_conn_str = conn_str
        
    client = MongoClient(safe_conn_str, serverSelectionTimeoutMS=5000)
    db = client[jrjModelRegistryConfig.get("mongodbDatabase")]
    collection = db[jrjModelRegistryConfig.get("mongodbCollection")]
    collection.find_one()  # Simple operation to test connection
    print(f"✅ MongoDB connection successful! Collection: {collection.name}")
    print(f"Document count: {collection.estimated_document_count()}")
except Exception as e:
    print(f"❌ MongoDB connection failed: {str(e)}")


Testing MongoDB connection...
❌ MongoDB connection failed: localhost:27017: [Errno 61] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 5.0s, Topology Description: <TopologyDescription id: 685ce2655af710991c737deb, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 61] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


In [143]:
salaryRegModel1Fit.transformer = salaryRegModel1Transformer

In [144]:
test1 = salaryRegModel1Fit.transformer(salaryRegModel1SampleData)
test1

Unnamed: 0,Age,StockOptionLevel,YearsAtCompany,EnvironmentSatisfaction,TrainingOpportunitiesWithinYear,TrainingOpportunitiesTaken,SelfRating,ManagerRating
0,30,1,5,3.0,3.0,0.0,3.0,3.0
1,30,1,5,5.0,3.0,0.0,5.0,5.0
2,30,1,5,3.0,1.0,0.0,4.0,3.0


In [145]:
salaryRegModel1Fit.mainPredictor(test1)

ValueError: shapes (3,8) and (9,) not aligned: 8 (dim 1) != 9 (dim 0)