<a href="https://colab.research.google.com/github/danielbehargithub/LinkedIn_Salary/blob/main/unused/Feature_Importance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
import pandas as pd
import numpy as np

# Generate synthetic data
# A fixed seed makes the synthetic frame (and everything derived from it)
# identical on every "Restart Kernel -> Run All".
SEED = 42
N_SAMPLES = 1000
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    # integers() excludes the upper bound, so experience ranges over 3..9
    "Experience_Years": rng.integers(3, 10, size=N_SAMPLES),
    "Python": rng.choice([0, 1], size=N_SAMPLES, p=[0.5, 0.5]),
    "SQL": rng.choice([0, 1], size=N_SAMPLES, p=[0.5, 0.5]),
    "Education_Level": rng.choice([1, 2, 3], size=N_SAMPLES, p=[0.3, 0.5, 0.2]),
    # Job titles are drawn uniformly and independently of the other columns
    "Job_Title": rng.choice(["Data Scientist", "Software Engineer", "ML Engineer"], size=N_SAMPLES)
})

# Function to compute feature importances for a specific job title
def compute_feature_importances_for_job(job_title, data):
    """Estimate how much each feature helps tell one job title from the rest.

    Rows whose ``Job_Title`` equals ``job_title`` are labelled 1 and every
    other row 0. The negative rows are resampled with replacement down/up to
    the number of positive rows, a random forest is fit on the balanced set,
    and its ``feature_importances_`` are returned.

    Args:
        job_title: Value of the ``Job_Title`` column to use as the positive class.
        data: DataFrame containing the columns ``Experience_Years``,
            ``Python``, ``SQL``, ``Education_Level`` and ``Job_Title``.
            The frame is not modified.

    Returns:
        dict mapping each feature column name to its importance as a plain
        ``float`` rounded to two decimals.

    Raises:
        ValueError: If ``data`` contains no row for ``job_title`` or no row
            for any other title, since a balanced two-class set cannot be built.
    """
    feature_columns = ["Experience_Years", "Python", "SQL", "Education_Level"]

    # Assign labels on a copy: 1 for the target job, 0 for other jobs.
    # Writing the column into `data` itself would change the caller's frame
    # as a side effect of every call.
    labeled = data.assign(Label=np.where(data["Job_Title"] == job_title, 1, 0))

    # Split positive and negative examples
    positive_data = labeled[labeled["Label"] == 1]
    negative_data = labeled[labeled["Label"] == 0]
    if positive_data.empty or negative_data.empty:
        raise ValueError(
            f"Cannot build a balanced data set for {job_title!r}: "
            f"{len(positive_data)} positive and {len(negative_data)} negative rows"
        )

    # Balance the dataset: draw as many negatives as there are positives
    negative_data_balanced = resample(negative_data,
                                      replace=True,
                                      n_samples=len(positive_data),
                                      random_state=42)
    balanced_data = pd.concat([positive_data, negative_data_balanced])

    # Select features and labels
    X = balanced_data[feature_columns]
    y = balanced_data["Label"]

    # Train the model
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)

    # Convert to built-in floats so the dict prints as plain numbers
    return {
        feature: round(float(importance), 2)
        for feature, importance in zip(feature_columns, model.feature_importances_)
    }

# Every distinct job title that occurs in the data
job_titles = data["Job_Title"].unique()

# Fit one model per title and keep its importances keyed by that title
all_feature_importances = dict()
for job in job_titles:
    print(f"Computing feature importances for: {job}")
    feature_importances = compute_feature_importances_for_job(job_title=job, data=data)
    all_feature_importances.update({job: feature_importances})
    # Importances first, then a separator rule on its own line
    print(feature_importances, "-" * 40, sep="\n")

# Display the feature importances for all job titles
# print("Feature Importances per Job Title:")
# for job, importances in all_feature_importances.items():
#     print(f"{job}:")
#     for feature, importance in importances.items():
#         print(f"  {feature}: {importance:.2f}")
#     print("-" * 40)



Computing feature importances for: Software Engineer
{'Experience_Years': 0.56, 'Python': 0.11, 'SQL': 0.12, 'Education_Level': 0.21}
----------------------------------------
Computing feature importances for: Data Scientist
{'Experience_Years': 0.55, 'Python': 0.1, 'SQL': 0.13, 'Education_Level': 0.22}
----------------------------------------
Computing feature importances for: ML Engineer
{'Experience_Years': 0.54, 'Python': 0.12, 'SQL': 0.11, 'Education_Level': 0.23}
----------------------------------------


In [27]:
# Function to calculate normalized candidate score
def calculate_candidate_score(candidate, weights, max_feature_values):
    """Score a candidate as a weighted fraction of the best attainable profile.

    Each weight is divided by the sum of all weights, the candidate's feature
    values are combined with those weights, and the result is divided by the
    score of a candidate who holds the maximum value for every feature.

    Args:
        candidate: dict mapping feature name to the candidate's value.
        weights: dict mapping feature name to its weight; only features
            listed here contribute to the score.
        max_feature_values: dict mapping feature name to the largest value
            that feature is allowed to contribute.

    Returns:
        The score rounded to two decimals. With non-negative weights it lies
        in [0, 1].

    Raises:
        ValueError: If the weights sum to zero (including an empty mapping)
            or the maximum attainable score is zero, because the score is
            undefined in both cases.
        KeyError: If a weighted feature is missing from ``candidate`` or
            ``max_feature_values``.
    """
    # Normalize weights to ensure the sum equals 1
    total_weight = sum(weights.values())
    if total_weight == 0:
        raise ValueError("weights must contain at least one non-zero weight")
    normalized_weights = {key: value / total_weight for key, value in weights.items()}

    # Calculate the max possible score
    max_possible_score = sum(
        normalized_weights[feature] * max_feature_values[feature]
        for feature in normalized_weights
    )
    if max_possible_score == 0:
        raise ValueError("max_feature_values give a maximum possible score of zero")

    # Calculate the candidate's score. Each value is clamped to
    # [0, max_feature_values[feature]] so an out-of-range input (e.g. more
    # experience than the stated maximum) cannot push the result above 1.
    candidate_score = sum(
        min(max(candidate[feature], 0), max_feature_values[feature]) * normalized_weights[feature]
        for feature in normalized_weights
    )

    # Normalize candidate score to range [0, 1]
    normalized_score = candidate_score / max_possible_score
    return round(normalized_score, 2)

# Example: scoring a new candidate for "Data Scientist"
new_candidate = {"Experience_Years": 5, "Python": 1, "SQL": 1, "Education_Level": 2}

# Name the target title explicitly. Reading the loop variable `job` left over
# from an earlier cell would score whichever title that loop happened to
# visit last, which depends on row order and on cell execution order.
target_job = "Data Scientist"

# Feature importance weights for "Data Scientist"
weights = all_feature_importances[target_job]

# Maximum values for each feature
# NOTE(review): the synthetic Experience_Years column tops out at 9
# (randint upper bound is exclusive); 10 is kept here as the scoring ceiling.
max_feature_values = {"Experience_Years": 10, "Python": 1, "SQL": 1, "Education_Level": 3}

# Call the updated function with all required parameters
candidate_score = calculate_candidate_score(new_candidate, weights, max_feature_values)

# Print the candidate's normalized score
print(f"Candidate Score for {target_job}: {candidate_score}")


Candidate Score for Data Scientist: 0.54
