In [None]:
import pandas as pd

In [None]:
# Load the raw dataset; the relative path is resolved against the notebook's
# current working directory.
data = pd.read_csv("Employee Salary prediction.csv")

In [None]:
# Show the (rows, columns) dimensions of the freshly loaded frame.
data.shape

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

In [None]:
# Columns removed up front; none of them is referenced again in this notebook.
columns_to_drop = [
    'posting_date',
    'application_deadline',
    'company_name',
    'required_skills',
    'employee_residence',
    'job_description_length',
    'benefits_score',
    'job_id',
    'salary_currency',
]

# A single drop call removes them all and keeps the remaining column order.
data = data.drop(columns=columns_to_drop)

In [None]:
# Map company locations to regions
def map_company_region(country):
    """Return the region bucket for a company's country name.

    Countries listed under Asia, Europe or America are mapped to that
    region's name. Every other value is reported as 'Other'.
    """
    region_members = (
        ('Asia', ('India', 'China', 'Japan', 'Singapore', 'Israel', 'South Korea')),
        ('Europe', ('Germany', 'Denmark', 'France', 'Austria', 'Sweden', 'Ireland',
                    'Switzerland', 'Finland', 'Netherlands', 'United Kingdom',
                    'Norway')),
        ('America', ('United States', 'Canada')),
    )

    for region, countries in region_members:
        if country in countries:
            return region

    # Australia has no region of its own here; like any unlisted country it
    # falls through to 'Other'.
    return 'Other'

# Replace each country in company_location with its region bucket.
company_regions = data['company_location'].apply(map_company_region)
data['company_location'] = company_regions

# Sanity check: number of rows per region after the mapping.
region_counts = data['company_location'].value_counts()
print(region_counts)


In [None]:
# map job titles to broader categories
def map_job_title(title):
    """Collapse a free-form job title into one broad category.

    The title is matched case-insensitively by substring against an ordered
    list of keywords; the first group with a hit decides the category.
    Titles that match nothing are reported as 'Other'.
    """
    lowered = title.lower()

    # Order matters: earlier groups take precedence, e.g. a title containing
    # both 'research' and 'scientist' is classified as 'Scientist'.
    keyword_groups = (
        (('engineer',), 'Engineer'),
        (('scientist',), 'Scientist'),
        (('analyst',), 'Analyst'),
        (('manager', 'head'), 'Manager'),
        (('consultant', 'specialist'), 'Consultant'),
        (('architect',), 'Architect'),
        (('research',), 'Research'),
    )

    for keywords, category in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'

# Overwrite job_title with its broad category.
job_categories = data['job_title'].apply(map_job_title)
data['job_title'] = job_categories

# Sanity check: number of rows per job category.
job_category_counts = data['job_title'].value_counts()
print(job_category_counts)


In [None]:
# Re-check the frame's dimensions after the column drops and category
# remapping done in the cells above.
data.shape

In [None]:

# (1) Clean column names (strip stray whitespace around header names)
data.columns = data.columns.str.strip()

# (2) Define salary bin edges in USD (5 bins => 6 edges).
# The last edge is open-ended: with a fixed cap, any salary above the cap
# would fall outside every bin and pd.cut would turn it into NaN instead of
# placing it in the '240k+' bucket.
salary_bins = [0, 70000, 100000, 140000, 240000, float('inf')]

# (3) Define labels for each bin (must be 5 labels, one per bin).
# Each label spells out the edges of its own bin.
salary_labels = [
    '0–70k',
    '70k–100k',
    '100k–140k',
    '140k–240k',
    '240k+'
]

# (4) Create the categorical target column
if 'salary_usd' in data.columns:
    # Bins are right-closed; include_lowest=True also puts a salary of
    # exactly 0 into the first bin.
    data['salary_range'] = pd.cut(
        data['salary_usd'],
        bins=salary_bins,
        labels=salary_labels,
        include_lowest=True
    )
    # Drop the raw salary so it cannot leak into the features.
    data = data.drop(columns=['salary_usd'])

    # (5) Display class distribution, in bin order
    print(data['salary_range'].value_counts().sort_index())
else:
    # Reached when the column is absent, e.g. when this cell is re-run after
    # salary_usd has already been dropped.
    print("Error: 'salary_usd' column not found.")


In [None]:
# Keep only rows with a usable target class:
#  - exclude the top '240k+' bucket, and
#  - exclude rows with no bucket at all (pd.cut leaves NaN for salaries that
#    are missing or outside the bin edges); NaN passes a plain `!=` test and
#    an unlabeled row cannot serve as a classification target.
# .copy() makes the result an independent frame, so the per-column
# assignments in later cells do not trigger SettingWithCopyWarning.
has_usable_class = data['salary_range'].notna() & (data['salary_range'] != '240k+')
data = data[has_usable_class].copy()


In [None]:
import matplotlib.pyplot as plt   #visualization


In [None]:
# 'industry' is not yet numeric at this point (it is label-encoded in a later
# cell), so a box plot is not meaningful for it. Show how many rows each
# industry has as a bar chart instead.
industry_counts = data['industry'].value_counts()

fig, ax = plt.subplots()
ax.bar(industry_counts.index.astype(str), industry_counts.values)
ax.set_xlabel('Industry')
ax.set_ylabel('Number of rows')
ax.set_title('Rows per industry')
ax.tick_params(axis='x', labelrotation=45)
plt.show()
print(industry_counts)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature columns that are converted to integer codes.
columns_to_encode = [
    'job_title',
    'experience_level',
    'company_location',
    'company_size',
    'employment_type',
    'remote_ratio',
    'education_required',
    'industry',
]

# Keep one fitted encoder per column. Refitting a single shared encoder
# overwrites its classes_ each time, which loses the value -> code mapping
# needed to encode new rows or decode the codes later.
encoders = {}
for column in columns_to_encode:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    encoders[column] = encoder

In [None]:
# Preview the first 5 rows after label encoding.
data.head(5)

In [None]:
# Target is the binned salary class; features are every other column.
target_column = 'salary_range'
y = data[target_column]
x = data.drop(columns=[target_column])
x

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# One seed for the split and for the stochastic estimators, so a full re-run
# reproduces the same accuracies.
SEED = 32

# Hold out 25% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "GradientBoosting": GradientBoostingClassifier(random_state=SEED)
}

# Models whose fit depends on feature scale (distance-, margin- or
# gradient-based); the tree ensembles do not need scaling.
scale_sensitive = {"LogisticRegression", "SVM", "KNN"}

# Model name -> accuracy on the held-out test set.
results = {}

for name, model in models.items():
    # Only scale for models that need it. The scaler is part of the pipeline,
    # so it is fitted on the training split only and reused for prediction.
    steps = []
    if name in scale_sensitive:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    pipe = Pipeline(steps)

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt

# One bar per model, bar height = test accuracy stored in `results`.
model_names = list(results)
accuracies = list(results.values())

fig, ax = plt.subplots()
ax.bar(model_names, accuracies, color='skyblue')
ax.set_ylabel('Accuracy Score')
ax.set_title('Model Comparison')
# Tilt the model names so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import joblib

# One seed for the split and for the stochastic estimators; without it the
# accuracies, and therefore which model gets saved, can change between runs.
SEED = 32

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)

# Define models
base_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "GradientBoosting": GradientBoostingClassifier(random_state=SEED)
}

# Scale-sensitive models are wrapped together with a StandardScaler so the
# saved object applies the same scaling at prediction time; tree ensembles
# are used as-is. Every entry still exposes fit/predict.
scale_sensitive = {"LogisticRegression", "SVM", "KNN"}
models = {
    name: make_pipeline(StandardScaler(), estimator) if name in scale_sensitive else estimator
    for name, estimator in base_models.items()
}

# Model name -> accuracy on the held-out test set.
results = {}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name}: {acc:.4f}")

# Get best model
# NOTE(review): the winner is picked on the same test split used to report
# its accuracy, so the printed figure is an optimistic estimate.
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\n✅ Best model: {best_model_name} with accuracy {results[best_model_name]:.4f}")

# Save the best model
# The saved object expects the same label-encoded feature columns as `x`.
joblib.dump(best_model, "best_model.pkl")
print("✅ Saved best model as best_model.pkl")
