In [3]:
%%pyspark
blob_account_name = "psychologyinsights"
blob_container_name = "psychologyinsightsfiles"
from pyspark.sql import SparkSession

sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
blob_sas_token = token_library.getConnectionString("PsyBlobStorage")

spark.conf.set(
    'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
    blob_sas_token)
df = spark.read.load('wasbs://insightfilesystem@insightaccount.blob.core.windows.net/mypersonality_final.csv', format='csv'
, header=True,quote='\"',escape='\"'
)
display(df.limit(10))

StatementMeta(insightspool, 3, 2, Finished, Available)

SynapseWidget(Synapse.DataFrame, b96c5440-a4fc-4581-b69c-57c1d7caec5d)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import joblib
from azure.storage.blob import BlobServiceClient

df = df.toPandas()
feature_col = 'STATUS'
target_cols = ['sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN']

# Train and save the best model for each target column
for trait in target_cols:
    # Extract features and labels
    X = df[feature_col].values
    y = df[trait].values

    # Feature extraction
    vectorizer = TfidfVectorizer(stop_words='english', strip_accents='ascii')
    X_tfidf = vectorizer.fit_transform(X)

    # Define the ensemble learning model
    ensemble = GradientBoostingRegressor()

    # Define the parameter grid for grid search
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [3, 4, 5]
    }

    # Perform grid search with cross-validation to find the best model and parameters
    grid_search = GridSearchCV(ensemble, param_grid, cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(X_tfidf, y)

    # Get the best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Train the best model
    best_model.fit(X_tfidf, y)

    # Save the best model to local storage
    model_name = f"best_model_{trait}.joblib"
    joblib.dump(best_model, model_name)
    blob_service_client = BlobServiceClient(account_url=f"https://{blob_account_name}.blob.core.windows.net",
                                        credential=blob_sas_token)

    blob_client = blob_service_client.get_blob_client(container=blob_container_name, blob=model_name)
    with open(model_name, "rb") as file:
        blob_client.upload_blob(file)

    print("Model file uploaded to Blob storage.")

StatementMeta(insightspool, 3, 12, Finished, Available)

Model file uploaded to Blob storage.
Model file uploaded to Blob storage.
Model file uploaded to Blob storage.
Model file uploaded to Blob storage.
Model file uploaded to Blob storage.
