In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import statsmodels.api as sm
import inflection
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('career_recommendation1.csv')
df

Unnamed: 0,Name,Gender,Age,O,C,E,A,N,Skill_1,Skill_2,Skill_3,Education,Interest_1,Interest_2,Interest_3,CareerGoals
0,Rodney Nelson,Female,28,100,71,64,61,55,SQL,Project Management,SQL,Master degree,Travel,Travel,DIY,Business
1,Jeffrey Atkins,Female,26,91,78,60,56,52,Java,Bilingual,Java,Bachelor degree,DIY,Art,Travel,Engineer
2,Diana Carrillo,Male,23,97,83,80,90,68,Bilingual,Bilingual,Microsoft Office,PhD,Art,Teaching,DIY,Teacher
3,Christopher Rodriguez,Female,53,80,100,84,72,72,Project Management,Bilingual,Bilingual,Bachelor degree,Art,Art,Programming,Event Planner
4,John Meyers,Female,31,89,63,75,97,97,Project Management,Project Management,Project Management,Bachelor degree,Teaching,Programming,Programming,Teacher
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Michele Hood,Female,22,77,94,56,52,66,Java,Project Management,Java,PhD,Programming,Teaching,Programming,Teacher
996,Joseph Patterson,Female,22,88,65,51,92,85,Microsoft Office,Financial Planning,Financial Planning,PhD,Art,DIY,Teaching,Accountant
997,Nicholas Campbell,Female,60,84,58,68,91,85,Financial Planning,Bilingual,Financial Planning,Bachelor degree,Teaching,DIY,Teaching,Teacher
998,Jeremiah Miller,Female,21,61,77,97,53,57,SQL,Financial Planning,Project Management,PhD,Teaching,DIY,Programming,Teacher


In [3]:
df.shape

(1000, 16)

In [4]:
df.dtypes

Name           object
Gender         object
Age             int64
O               int64
C               int64
E               int64
A               int64
N               int64
Skill_1        object
Skill_2        object
Skill_3        object
Education      object
Interest_1     object
Interest_2     object
Interest_3     object
CareerGoals    object
dtype: object

In [5]:
print(df.Gender.unique())
print(df.Skill_1.unique())
print(df.Education.unique())
print(df.Interest_1.unique())
print(df.CareerGoals.unique())

['Female' 'Male']
['SQL' 'Java' 'Bilingual' 'Project Management' 'Financial Planning'
 'Microsoft Office']
['Master degree' 'Bachelor degree' 'PhD']
['Travel' 'DIY' 'Art' 'Teaching' 'Programming']
['Business' 'Engineer' 'Teacher' 'Event Planner' 'Accountant' 'Programmer']


Label Encoder to encode categorical variables
One hot encoding will increase dimensionality. Though with label encoding, the nominal categorical features that have more than 2 values may be treated as ordinal features.

In [6]:
df = df.drop('Name', axis=1)

encoder = LabelEncoder()

#label_cat = df.drop(['Age','O','C','E','A','N'], axis=1)

#for i in label_cat:
#    df[i] = encoder.fit_transform(df[i])
    
df

Unnamed: 0,Gender,Age,O,C,E,A,N,Skill_1,Skill_2,Skill_3,Education,Interest_1,Interest_2,Interest_3,CareerGoals
0,Female,28,100,71,64,61,55,SQL,Project Management,SQL,Master degree,Travel,Travel,DIY,Business
1,Female,26,91,78,60,56,52,Java,Bilingual,Java,Bachelor degree,DIY,Art,Travel,Engineer
2,Male,23,97,83,80,90,68,Bilingual,Bilingual,Microsoft Office,PhD,Art,Teaching,DIY,Teacher
3,Female,53,80,100,84,72,72,Project Management,Bilingual,Bilingual,Bachelor degree,Art,Art,Programming,Event Planner
4,Female,31,89,63,75,97,97,Project Management,Project Management,Project Management,Bachelor degree,Teaching,Programming,Programming,Teacher
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Female,22,77,94,56,52,66,Java,Project Management,Java,PhD,Programming,Teaching,Programming,Teacher
996,Female,22,88,65,51,92,85,Microsoft Office,Financial Planning,Financial Planning,PhD,Art,DIY,Teaching,Accountant
997,Female,60,84,58,68,91,85,Financial Planning,Bilingual,Financial Planning,Bachelor degree,Teaching,DIY,Teaching,Teacher
998,Female,21,61,77,97,53,57,SQL,Financial Planning,Project Management,PhD,Teaching,DIY,Programming,Teacher


In [7]:
for i in df:
    df['CareerGoals'] = encoder.fit_transform(df['CareerGoals'])

In [8]:
df = pd.get_dummies(df)

In [9]:
df

Unnamed: 0,Age,O,C,E,A,N,CareerGoals,Gender_Female,Gender_Male,Skill_1_Bilingual,...,Interest_2_Art,Interest_2_DIY,Interest_2_Programming,Interest_2_Teaching,Interest_2_Travel,Interest_3_Art,Interest_3_DIY,Interest_3_Programming,Interest_3_Teaching,Interest_3_Travel
0,28,100,71,64,61,55,1,1,0,0,...,0,0,0,0,1,0,1,0,0,0
1,26,91,78,60,56,52,2,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,23,97,83,80,90,68,5,0,1,1,...,0,0,0,1,0,0,1,0,0,0
3,53,80,100,84,72,72,3,1,0,0,...,1,0,0,0,0,0,0,1,0,0
4,31,89,63,75,97,97,5,1,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22,77,94,56,52,66,5,1,0,0,...,0,0,0,1,0,0,0,1,0,0
996,22,88,65,51,92,85,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
997,60,84,58,68,91,85,5,1,0,0,...,0,1,0,0,0,0,0,0,1,0
998,21,61,77,97,53,57,5,1,0,0,...,0,1,0,0,0,0,0,1,0,0


In [10]:
num_scale = ['Age','O','C','E','A','N']
X2 = df[num_scale]
df2 = df.drop(columns = num_scale)

scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0,1))
scaler_minmax = scaler_minmax.fit_transform(X2)
minMax_df = pd.DataFrame(scaler_minmax, columns=num_scale)

In [11]:
df2 = pd.DataFrame(df2)
df = pd.concat([minMax_df, df2], axis=1)
X = df.drop(['CareerGoals'], axis=1)
y2 = df['CareerGoals']
y2 = pd.DataFrame(y2)
df = pd.concat([X, y2], axis=1)

df

Unnamed: 0,Age,O,C,E,A,N,Gender_Female,Gender_Male,Skill_1_Bilingual,Skill_1_Financial Planning,...,Interest_2_DIY,Interest_2_Programming,Interest_2_Teaching,Interest_2_Travel,Interest_3_Art,Interest_3_DIY,Interest_3_Programming,Interest_3_Teaching,Interest_3_Travel,CareerGoals
0,0.212766,1.00,0.42,0.28,0.22,0.10,1,0,0,0,...,0,0,0,1,0,1,0,0,0,1
1,0.170213,0.82,0.56,0.20,0.12,0.04,1,0,0,0,...,0,0,0,0,0,0,0,0,1,2
2,0.106383,0.94,0.66,0.60,0.80,0.36,0,1,1,0,...,0,0,1,0,0,1,0,0,0,5
3,0.744681,0.60,1.00,0.68,0.44,0.44,1,0,0,0,...,0,0,0,0,0,0,1,0,0,3
4,0.276596,0.78,0.26,0.50,0.94,0.94,1,0,0,0,...,0,1,0,0,0,0,1,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.085106,0.54,0.88,0.12,0.04,0.32,1,0,0,0,...,0,0,1,0,0,0,1,0,0,5
996,0.085106,0.76,0.30,0.02,0.84,0.70,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
997,0.893617,0.68,0.16,0.36,0.82,0.70,1,0,0,1,...,1,0,0,0,0,0,0,1,0,5
998,0.063830,0.22,0.54,0.94,0.06,0.14,1,0,0,0,...,1,0,0,0,0,0,1,0,0,5


Since the data isn't normally distributed, MinMaxScaler is used to perform feature scaling to scale the numerical variables in the range [0,1].

In [12]:
X = df.drop('CareerGoals', axis=1)
y = np.array(df['CareerGoals'])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)

In [13]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred)

In [14]:
print("Logistic Regression Model Evaluation")

print(f"Accuracy Score: {lr_accuracy * 100:.2f}%\n")

print("Classification Report: \n")
report = classification_report(y_test, y_pred, output_dict=True)
lr_class_report = pd.DataFrame(report).transpose()
print(lr_class_report)

Logistic Regression Model Evaluation
Accuracy Score: 86.67%

Classification Report: 

              precision    recall  f1-score     support
0              0.985915  0.958904  0.972222   73.000000
1              0.736842  0.700000  0.717949   20.000000
2              0.893617  0.913043  0.903226   46.000000
3              0.625000  0.810811  0.705882   37.000000
4              0.894737  0.772727  0.829268   44.000000
5              0.909091  0.875000  0.891720   80.000000
accuracy       0.866667  0.866667  0.866667    0.866667
macro avg      0.840867  0.838414  0.836711  300.000000
weighted avg   0.876786  0.866667  0.869409  300.000000


In [15]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred)

In [16]:
print("Decision Tree Model Evaluation")

print(f"Accuracy Score: {dt_accuracy * 100:.2f}%\n")

print("Classification Report: \n")
report = classification_report(y_test, y_pred, output_dict=True)
dt_class_report = pd.DataFrame(report).transpose()
print(dt_class_report)

Decision Tree Model Evaluation
Accuracy Score: 73.67%

Classification Report: 

              precision    recall  f1-score     support
0              0.880000  0.904110  0.891892   73.000000
1              0.611111  0.550000  0.578947   20.000000
2              0.769231  0.869565  0.816327   46.000000
3              0.437500  0.567568  0.494118   37.000000
4              0.558824  0.431818  0.487179   44.000000
5              0.876712  0.800000  0.836601   80.000000
accuracy       0.736667  0.736667  0.736667    0.736667
macro avg      0.688896  0.687177  0.684177  300.000000
weighted avg   0.742532  0.736667  0.736281  300.000000


In [17]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)

In [18]:
print("Random Forest Model Evaluation")

print(f"Accuracy Score: {rf_accuracy * 100:.2f}%\n")

print("Classification Report: \n")
report = classification_report(y_test, y_pred, output_dict=True, digits=2)
rf_class_report = pd.DataFrame(report).transpose()
print(rf_class_report)

Random Forest Model Evaluation
Accuracy Score: 80.00%

Classification Report: 

              precision    recall  f1-score  support
0              0.806818  0.972603  0.881988     73.0
1              0.785714  0.550000  0.647059     20.0
2              0.976190  0.891304  0.931818     46.0
3              0.568627  0.783784  0.659091     37.0
4              0.916667  0.500000  0.647059     44.0
5              0.814815  0.825000  0.819876     80.0
accuracy       0.800000  0.800000  0.800000      0.8
macro avg      0.811472  0.753782  0.764482    300.0
weighted avg   0.820248  0.800000  0.795456    300.0


In [19]:
#param_grid={'C':[0.001,0.01,0.1,1,10,100], 'gamma':[0.001,0.01,0.1,1,10,100]}
#svm=RandomizedSearchCV(SVC(),param_grid,cv=5)
#svm.fit(X_train,y_train)

#y_pred = svm.predict(X_test)

#svm_accuracy = accuracy_score(y_test, y_pred)

from sklearn import svm

#Create a svm Classifier
svm = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
svm.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = svm.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred)


In [20]:
print("Support Vector Machine Model Evaluation")

print(f"Accuracy Score: {svm_accuracy * 100:.2f}%\n")


print("Classification Report: \n")
report = classification_report(y_test, y_pred, output_dict=True)
svm_class_report = pd.DataFrame(report).transpose()
print(svm_class_report)

Support Vector Machine Model Evaluation
Accuracy Score: 84.33%

Classification Report: 

              precision    recall  f1-score     support
0              0.907895  0.945205  0.926174   73.000000
1              0.750000  0.750000  0.750000   20.000000
2              0.872340  0.891304  0.881720   46.000000
3              0.583333  0.756757  0.658824   37.000000
4              0.885714  0.704545  0.784810   44.000000
5              0.932432  0.862500  0.896104   80.000000
accuracy       0.843333  0.843333  0.843333    0.843333
macro avg      0.821953  0.818385  0.816272  300.000000
weighted avg   0.855178  0.843333  0.845888  300.000000


In [21]:
knn_accuracy = []

for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    y_pred = knn.predict(X_test)
    knn_accuracy.append(metrics.accuracy_score(y_test, y_pred))

print("Maximum accuracy: ",max(knn_accuracy),"at K =",knn_accuracy.index(max(knn_accuracy)))

Maximum accuracy:  0.68 at K = 30


In [22]:
print("KNN Model Evaluation")

print(f"Accuracy Score: {max(knn_accuracy) * 100:.2f}%\n")

print("Classification Report: \n")
report = classification_report(y_test, y_pred, output_dict=True)
knn_class_report = pd.DataFrame(report).transpose()
print(knn_class_report)

KNN Model Evaluation
Accuracy Score: 68.00%

Classification Report: 

              precision    recall  f1-score     support
0              0.571429  0.876712  0.691892   73.000000
1              0.000000  0.000000  0.000000   20.000000
2              0.891304  0.891304  0.891304   46.000000
3              0.586207  0.459459  0.515152   37.000000
4              0.833333  0.227273  0.357143   44.000000
5              0.700000  0.875000  0.777778   80.000000
accuracy       0.673333  0.673333  0.673333    0.673333
macro avg      0.597046  0.554958  0.538878  300.000000
weighted avg   0.656902  0.673333  0.628351  300.000000


In [23]:
from sklearn.naive_bayes import GaussianNB

# Initialize naive bayes classifier
gnb = GaussianNB()

# Train naive bayes classifier
NB = gnb.fit(X_train,y_train) 

#Predict the response for test dataset
y_pred = gnb.predict(X_test)

gnb_accuracy = accuracy_score(y_test, y_pred)


In [24]:
print("Naive Bayes")

print(f"Accuracy Score: {gnb_accuracy * 100:.2f}%\n")

print("Classification Report: \n")
report = classification_report(y_test, y_pred, output_dict=True)
gnb_class_report = pd.DataFrame(report).transpose()
print(gnb_class_report)

Naive Bayes
Accuracy Score: 60.33%

Classification Report: 

              precision    recall  f1-score     support
0              0.971429  0.931507  0.951049   73.000000
1              0.200000  0.750000  0.315789   20.000000
2              1.000000  0.717391  0.835443   46.000000
3              0.347826  0.432432  0.385542   37.000000
4              0.161290  0.113636  0.133333   44.000000
5              0.977778  0.550000  0.704000   80.000000
accuracy       0.603333  0.603333  0.603333    0.603333
macro avg      0.609720  0.582494  0.554193  300.000000
weighted avg   0.730343  0.603333  0.635415  300.000000


In [25]:
models = pd.DataFrame({
    'Model': ['Logistic Regression','Decision Tree',
              'Random Forest','SVM','KNN','Naive Bayes'],
    'Accuracy': [lr_accuracy, dt_accuracy, rf_accuracy,
                 svm_accuracy, max(knn_accuracy), gnb_accuracy],
    'Recall': [lr_class_report['recall']['1'],dt_class_report['recall']['1'],
               rf_class_report['recall']['1'],svm_class_report['recall']['1'],
               knn_class_report['recall']['1'],gnb_class_report['recall']['1']],
    'Precision': [lr_class_report['precision']['1'],dt_class_report['precision']['1'],
                  rf_class_report['precision']['1'],svm_class_report['precision']['1'],
                  knn_class_report['precision']['1'],gnb_class_report['precision']['1']],
    'F1-score': [lr_class_report['f1-score']['1'],dt_class_report['f1-score']['1'],
                 rf_class_report['f1-score']['1'],svm_class_report['f1-score']['1'],
                 knn_class_report['f1-score']['1'],gnb_class_report['f1-score']['1']]}).set_index('Model')

In [26]:
models.style.highlight_max(color = 'lightyellow', axis = 0)

Unnamed: 0_level_0,Accuracy,Recall,Precision,F1-score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0.866667,0.7,0.736842,0.717949
Decision Tree,0.736667,0.55,0.611111,0.578947
Random Forest,0.8,0.55,0.785714,0.647059
SVM,0.843333,0.75,0.75,0.75
KNN,0.68,0.0,0.0,0.0
Naive Bayes,0.603333,0.75,0.2,0.315789


In [1]:
import tensorflow as tf

# Build and train the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, input_shape=(X_train.shape[1],))
])
model.compile(optimizer='adam',loss='mean_squared_error')

model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))

# Save the model
# tf.saved_model.save(model, '/path/to/model')
model.save("my_model")

SyntaxError: invalid non-printable character U+00A0 (1586127294.py, line 15)