# Logistic regression submission

## Notebook setup

### Imports

In [1]:
import joblib
import pandas as pd
import urllib.request
from pathlib import Path


### Run configuration

In [2]:
# Environment switch: True when running on Kaggle, False anywhere else.
# The cells below branch on this flag to choose their file paths.
KAGGLE: bool = False

## 1. Asset loading

In [3]:
# Resolve the test-data and model locations for the current environment
if KAGGLE:

    # Kaggle paths - data is in /kaggle/input/
    test_df_path = '/kaggle/input/playground-series-s5e12/test.csv'
    model_path = ''  # Fill in with Kaggle dataset path

    # Fail early with a clear message instead of an obscure error from
    # joblib.load('') when the placeholder above was never filled in
    if not model_path:
        raise ValueError(
            'model_path is empty: set it to the Kaggle dataset path of the model file'
        )

else:
    # Otherwise, load from GitHub
    test_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_test.csv'
    model_url = 'https://github.com/gperdrizet/diabetes-prediction/raw/refs/heads/main/models/logistic_regression.joblib'

    # Download the model into the current working directory; the clean-up
    # cell at the end of the notebook deletes this file again
    model_path = Path('logistic_regression.joblib')
    urllib.request.urlretrieve(model_url, model_path)
    print(f'Model downloaded to {model_path}')

# Load the testing dataset
# NOTE(review): the recorded output shows an 'Unnamed: 0' column for the
# GitHub-hosted CSV, presumably a saved index - confirm the model ignores it
test_df = pd.read_csv(test_df_path)

# Load the model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only point model_url / model_path at a source you trust
model = joblib.load(model_path)

# Display first few rows of testing data
test_df.head()


Model downloaded to logistic_regression.joblib


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,Female,White,Highschool,Middle,Former,Employed,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,Female,White,Highschool,Middle,Never,Unemployed,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,...,184,Male,White,Highschool,Low,Never,Employed,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,...,128,Male,White,Graduate,Middle,Former,Employed,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,...,133,Male,White,Graduate,Low,Current,Unemployed,0,0,0


In [4]:
# Display the loaded model so its steps and parameters can be inspected
model

0,1,2
,steps,"[('preprocessor', ...), ('poly', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,iqr_multiplier,np.float64(2.4993214225467555)

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,"[['No formal', 'Highschool', ...], ['Low', 'Lower-Middle', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,degree,1
,interaction_only,True
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,18
,copy,True
,whiten,True
,svd_solver,'randomized'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(0.1304563673289262)
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


## 2. Inference

In [5]:
# Build the submission frame: one row per test record, holding its id and
# the predicted class label, both cast to int.
# NOTE(review): predict() emits hard class labels; if the competition is
# scored on probabilities (e.g. ROC AUC), predict_proba may be what is
# wanted - confirm against the competition rules
predictions_df = (
    test_df[['id']]
    .astype({'id': int})
    .assign(diagnosed_diabetes=model.predict(test_df).astype(int))
)

# Summarize columns, dtypes and non-null counts as a sanity check
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   id                  300000 non-null  int64
 1   diagnosed_diabetes  300000 non-null  int64
dtypes: int64(2)
memory usage: 4.6 MB


## 3. Save submission file

In [6]:
# Choose where the submission CSV goes for the current environment
if not KAGGLE:
    # Outside Kaggle: write into ../data, creating the directory when missing
    data_dir = Path('../data')
    data_dir.mkdir(parents=True, exist_ok=True)
    submission_path = data_dir / 'logistic_regression_submission.csv'

else:
    # On Kaggle: write submission.csv into the working directory
    submission_path = Path('submission.csv')

# Write the predictions without the DataFrame index and report the location
predictions_df.to_csv(submission_path, index=False)
print(f'Submission saved to: {submission_path}')


Submission saved to: ../data/logistic_regression_submission.csv


## 4. Clean up

In [7]:
# Off Kaggle the model was downloaded by this notebook, so remove that
# local copy again; on Kaggle nothing was downloaded and nothing is deleted
if not KAGGLE:
    if model_path.exists():
        model_path.unlink()
        print(f'Cleaned up temporary model file: {model_path}')

Cleaned up temporary model file: logistic_regression.joblib
