In [34]:
import joblib
import pandas as pd
import numpy as np
import json
from pprint import pprint


In [35]:
# Load the persisted artifacts produced by the training code.
# NOTE: joblib.load unpickles its input, so only point these paths at
# trusted, project-generated files.

# Fitted estimators (printed in the following cells).
reg_model, class_model = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../predict/models/reg_model.pkl",
        "../predict/models/class_model.pkl",
    )
)

# Feature-name lists: model inputs, numeric targets, categorical targets.
input_features, num_out_features, cat_out_features = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../predict/features/input_features.pkl",
        "../predict/features/num_out_features.pkl",
        "../predict/features/cat_out_features.pkl",
    )
)

In [36]:
# Show the loaded regression estimator followed by its Python type.
print(reg_model, type(reg_model), sep="\n")

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'high_school_gpa', 'sat_score', 'university_ranking',
       'university_gpa', 'internships', 'projects', 'certifications',
       'soft_skills_score', 'networking_score', 'job_offers'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['gender', 'field_of_study', 'current_job_level'], dtype='object'))])),
                ('pca', PCA(n_components=0.95)),
                ('regressor',
                 MultiOutputRegressor(esti

In [37]:
# Show the loaded classification estimator followed by its Python type.
print(class_model, type(class_model), sep="\n")

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'high_school_gpa', 'sat_score', 'university_ranking',
       'university_gpa', 'internships', 'projects', 'certifications',
       'soft_skills_score', 'networking_score', 'job_offers'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['gender', 'field_of_study', 'current_job_level'], dtype='object'))])),
                ('pca', PCA(n_components=0.95)),
                ('classifier', RandomForestClassifier(random_state=1134)

In [38]:
# Names of the model inputs, in the order used to build the input frame below.
print(input_features, type(input_features), sep="\n")

['age', 'gender', 'high_school_gpa', 'sat_score', 'university_ranking', 'university_gpa', 'field_of_study', 'internships', 'projects', 'certifications', 'soft_skills_score', 'networking_score', 'job_offers', 'current_job_level']
<class 'list'>


In [39]:
# Names of the numeric targets that the regression output is mapped onto later.
print(num_out_features, type(num_out_features), sep="\n")

['starting_salary', 'career_satisfaction', 'years_to_promotion', 'work_life_balance']
<class 'list'>


In [40]:
# Names of the categorical targets that the classifier output is mapped onto later.
print(cat_out_features, type(cat_out_features), sep="\n")

['entrepreneurship']
<class 'list'>


In [41]:
# One hand-written sample record, keyed by input feature name.
# Values are cast to their declared types in the next cell.
data = dict(
    [
        ("age", 23),
        ("gender", "Male"),
        ("high_school_gpa", 0.8),
        ("sat_score", 0.914),
        ("university_ranking", 0.23),
        ("university_gpa", 0.85),
        ("field_of_study", "Computer Science"),
        ("internships", 2),
        ("projects", 5),
        ("certifications", 3),
        ("soft_skills_score", 7.8),
        ("networking_score", 6.5),
        ("job_offers", 1),
        ("current_job_level", "Entry"),
    ]
)

In [None]:
# Read the declared type name of every input feature. JSON is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform default.
with open("../predict/features/feature_types.json", encoding="utf-8") as f:
    feature_types = json.load(f)

# Maps the type names used in feature_types.json to Python constructors.
# NOTE(review): bool("False") is True, so string-encoded booleans would need a
# dedicated parser; none of the sample values above go through `bool`.
type_map = {
    "int": int,
    "float": float,
    "str": str,
    "bool": bool
}

# Fail early with a readable message instead of a bare KeyError mid-loop.
missing_features = [feature for feature in input_features if feature not in data]
if missing_features:
    raise KeyError(f"`data` is missing required input features: {missing_features}")

# Cast each raw value to its declared type, following the order of
# `input_features` so the values line up with the DataFrame columns.
processed_data = []
for feature in input_features:
    value = data[feature]
    data_type = type_map[feature_types[feature]]
    print(data_type)
    processed_data.append(data_type(value))

# Single-row frame that is passed to both models' predict() below.
input_df = pd.DataFrame([processed_data], columns=input_features)


<class 'int'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'float'>
<class 'float'>
<class 'int'>
<class 'str'>


In [43]:
# Print the column dtypes and non-null counts of the single-row input frame,
# presumably to check that the casts in the previous cell took effect.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 1 non-null      int64  
 1   gender              1 non-null      object 
 2   high_school_gpa     1 non-null      float64
 3   sat_score           1 non-null      float64
 4   university_ranking  1 non-null      float64
 5   university_gpa      1 non-null      float64
 6   field_of_study      1 non-null      object 
 7   internships         1 non-null      int64  
 8   projects            1 non-null      int64  
 9   certifications      1 non-null      int64  
 10  soft_skills_score   1 non-null      float64
 11  networking_score    1 non-null      float64
 12  job_offers          1 non-null      int64  
 13  current_job_level   1 non-null      object 
dtypes: float64(6), int64(5), object(3)
memory usage: 240.0+ bytes


In [44]:
# Predict the numeric targets for the single-row input frame.
# The result is a 2-D array with one row per input row; the columns are
# presumably ordered like `num_out_features` (that is how they are mapped later).
reg_pred = reg_model.predict(input_df)
print(reg_pred, type(reg_pred), sep="\n")

[[5.06942208e+04 5.61075000e+00 3.01625000e+00 5.48675000e+00]]
<class 'numpy.ndarray'>


In [45]:
# Predict the categorical target(s) for the single-row input frame.
class_labels = class_model.predict(input_df)
# Force a single-row 2-D layout so it can be indexed like `reg_pred`.
# NOTE(review): this assumes exactly one input row - with more rows and a
# single target the reshape would fold the rows into columns; confirm.
class_pred = class_labels.reshape(1, -1)
print(class_pred, type(class_pred), sep="\n")

[['No']]
<class 'numpy.ndarray'>


In [46]:
# Collect the predictions for the single input row into one dict keyed by
# target name.
res = {}

# Numeric targets: column i of the regression output belongs to
# num_out_features[i].
for i, feature in enumerate(num_out_features):
    res[feature] = reg_pred[0][i]

# Categorical targets: `class_pred` was reshaped to (1, n_outputs), so the
# target index selects a COLUMN. The previous `class_pred[i]` selected a row,
# which only worked by accident for a single categorical target and raised
# IndexError for a second one. The column slice is still a one-element array,
# which clean() later unwraps with `.item()`.
for i, feature in enumerate(cat_out_features):
    res[feature] = class_pred[:, i]

pprint(res)


{'career_satisfaction': np.float64(5.61075),
 'entrepreneurship': array(['No'], dtype=object),
 'starting_salary': np.float64(50694.220813800304),
 'work_life_balance': np.float64(5.48675),
 'years_to_promotion': np.float64(3.01625)}


In [47]:
def clean(obj):
    """Convert NumPy values in a result dict to plain Python values, in place.

    Every value that exposes a callable ``.item()`` (NumPy scalars and
    one-element arrays) is unwrapped to its native Python equivalent; values
    that are already native are kept as they are. Floats are then rounded to
    the nearest integer with the built-in ``round``.

    Calling the function again on an already-cleaned dict is a no-op, so the
    notebook cell that uses it can be re-run safely.

    Args:
        obj: Mapping of output name to predicted value. It is modified in place.

    Returns:
        The same ``obj`` instance, to allow ``res = clean(res)``.

    Raises:
        ValueError: If a value is an array with more than one element
            (raised by ``.item()``), or a float is NaN/inf (raised by ``round``).
    """
    # Only existing keys are reassigned, so the dict never changes size while
    # it is being iterated.
    for key, value in obj.items():
        unwrap = getattr(value, "item", None)
        # Unwrap exactly once; plain str/int/float values have no `.item()`.
        native = unwrap() if callable(unwrap) else value
        obj[key] = round(native) if isinstance(native, float) else native
    return obj

In [48]:
# clean() converts the values of `res` in place, so `res` itself is printed
# afterwards to show the plain-Python result.
clean(res)
pprint(res)

{'career_satisfaction': 6,
 'entrepreneurship': 'No',
 'starting_salary': 50694,
 'work_life_balance': 5,
 'years_to_promotion': 3}
