<a href="https://colab.research.google.com/github/chandankr014/AmbitionBox-DataScrap/blob/main/Salary_Pred_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from bisect import bisect_right

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


In [2]:
# Load the salary dataset. The path is relative to the notebook's working
# directory; change DATA_PATH to point at a different CSV file.
DATA_PATH = "new_pred.csv"
data = pd.read_csv(DATA_PATH)

# Preview the frame (pandas abbreviates the display of long frames).
data

Unnamed: 0,Job Profile,Company,ExperienceRequired,AvgSalary
0,Scientist,reliance-nippon-life-insurance,5.5,581665
1,Branch Manager,dcb-bank,8.0,604018
2,Team Lead Operations,itc,5.0,347114
3,QC Officer,lifestyle,7.5,408028
4,Junior Engineer,reckitt-benckiser,12.0,316633
...,...,...,...,...
4995,Operation Staff,kochartech,5.0,321633
4996,Quality Inspector,s&p-global,8.5,1264704
4997,Service Supervisor,synechron,3.5,491565
4998,Technology Analyst,infosys,5.0,708817


In [8]:
# Print the column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Job Profile         5000 non-null   object 
 1   Company             5000 non-null   object 
 2   ExperienceRequired  5000 non-null   float64
 3   AvgSalary           5000 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 156.4+ KB


In [13]:
# Separate the regression target (AvgSalary) from the remaining predictor columns.
y = data["AvgSalary"]
X = data.drop("AvgSalary", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Mean of the target column, as a quick sense of the salary scale.
data["AvgSalary"].mean()

683769.436

In [20]:
_, mean, std, min,_,_,_, max = data['AvgSalary'].describe()

In [21]:
mean, std, min, max

(683769.436, 631345.3110603876, 116227.0, 7591845.0)

In [29]:
# Exclusive upper bounds of the salary bands, in ascending order. Level i
# holds salaries below salary_thresholds[i] and not below salary_thresholds[i - 1].
salary_thresholds = [
    50000, 75000, 100000, 150000, 200000,
    300000, 400000, 500000, 600000, 700000, 800000, 900000,
    1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000,
    10000000,
]


def assign_hierarchy_level(avg_salary, thresholds=None):
    """Return the hierarchy level (salary band index) for ``avg_salary``.

    The level is the index of the first threshold that is strictly greater
    than ``avg_salary``; a salary at or above the last threshold gets
    ``len(thresholds)``.

    Parameters
    ----------
    avg_salary : int or float
        Salary to classify.
    thresholds : sequence of numbers, optional
        Band upper bounds sorted in ascending order. Defaults to the
        module-level ``salary_thresholds``.

    Returns
    -------
    int or float
        The band index, or NaN when ``avg_salary`` is NaN so that a missing
        salary is not silently placed in the highest band.
    """
    if thresholds is None:
        thresholds = salary_thresholds
    # NaN is the only value that is not equal to itself; every `<` comparison
    # with it is False, which previously fell through to the top level.
    if avg_salary != avg_salary:
        return float("nan")
    # On an ascending list, bisect_right yields the first index whose
    # threshold is strictly greater than avg_salary.
    return bisect_right(thresholds, avg_salary)


In [30]:
# Derive the HierarchyLevel column by mapping each AvgSalary value to its salary band.
salary_column = data['AvgSalary']
data['HierarchyLevel'] = salary_column.map(assign_hierarchy_level)


In [33]:
# Number of rows per hierarchy level, most frequent level first.
# (The previous sort_values(...) line was removed: its result was neither
# assigned nor displayed, so it had no effect.)
data['HierarchyLevel'].value_counts()

5     1044
6      924
13     705
7      625
8      429
9      312
10     239
11     184
14     171
4      150
12     128
15      50
3       23
16      10
17       4
19       1
18       1
Name: HierarchyLevel, dtype: int64

In [35]:
# Keep the derived level, company and experience alongside the salary column;
# "Job Profile" is left out of this frame.
# NOTE(review): HierarchyLevel is computed from AvgSalary, so using it as a
# predictor of AvgSalary would leak the target — confirm the intended use.
cols = [
    'HierarchyLevel',
    'Company',
    'ExperienceRequired',
    'AvgSalary',
]
new_data = data.loc[:, cols]

In [37]:
# Show the last five rows to spot-check HierarchyLevel against AvgSalary.
new_data.tail()

Unnamed: 0,HierarchyLevel,Company,ExperienceRequired,AvgSalary
4995,6,kochartech,5.0,321633
4996,13,s&p-global,8.5,1264704
4997,7,synechron,3.5,491565
4998,10,infosys,5.0,708817
4999,7,johnson-controls,4.5,429423


In [38]:
# Disabled export of new_data to CSV; uncomment the next line to write the file.
# new_data.to_csv("hierachy_data.csv")

In [40]:
# Count the distinct company values (missing values, if any, count as one value).
new_data['Company'].nunique(dropna=False)

500

In [41]:
# Hand-entered lookup from company slug (the format seen in the `Company`
# column) to an integer score; the literal values below are multiples of 5
# between 25 and 100.
# NOTE(review): the meaning and source of these scores are not shown in this
# notebook, and the 93 keys cover only part of the 500 distinct companies
# counted above — confirm how unmapped companies should be handled before
# using this as a feature.
company_hierarchy = {
    'reliance-nippon-life-insurance': 95,
    'dcb-bank': 50,
    'itc': 80,
    'lifestyle': 45,
    'reckitt-benckiser': 90,
    'siemens': 85,
    'bt-global-services': 70,
    'nokia-networks': 65,
    'indian-oil-corporation': 100,
    'exide-life-insurance': 55,
    'au-small-finance-bank': 40,
    'wns': 35,
    'arcelormittal-nippon-steel': 90,
    'exotic-learning': 25,
    'capgemini-engineering': 65,
    'fullerton-india': 45,
    'indusind-bank': 80,
    'tvs-credit': 55,
    'samvardhana-motherson-group': 75,
    'dalmia-bharat-cement': 95,
    'sobha': 90,
    'jubilant-foods-works': 55,
    'cisco': 100,
    'tractors-and-farm-equipment': 95,
    'tata-motors-finance': 75,
    '3i-infotech': 35,
    'cams': 65,
    'intelenet-global-services': 50,
    'icici-prudential-life-insurance': 90,
    'icici-bank': 95,
    'infosys': 100,
    'poonawalla-fincorp': 85,
    'ford-motor': 95,
    'dilip-buildcon': 75,
    'metro': 90,
    'bank-of-america': 100,
    'act-fibernet': 55,
    'wipro-consumer-care-&-lighting': 65,
    'iks-health': 85,
    'jsw-steel': 95,
    'asian-paints': 100,
    'synechron': 75,
    'lenskart': 55,
    'sify-technologies': 50,
    'ust': 85,
    'ntt-data': 80,
    'xl-dynamics': 75,
    'cg-power-and-industrial-solutions': 80,
    'the-himalaya-drug-company': 75,
    'kec-international': 75,
    'birlasoft': 65,
    "domino's-pizza": 50,
    'igt-solutions': 55,
    'vedantu': 65,
    'bharat-electronics': 95,
    'eureka-forbes': 75,
    'state-bank-of-india': 95,
    'tata-elxsi': 85,
    'birla-sun-life-insurance': 90,
    'parle-products': 75,
    'reliance-industries': 100,
    'utkarsh-small-finance-bank': 50,
    'intas-pharmaceuticals': 95,
    'bureau-veritas': 80,
    'jmc-projects': 75,
    'paras-hospital': 85,
    'blue-dart-express': 80,
    'rivigo': 65,
    'cogent-e-services': 75,
    'reliance-general-insurance': 85,
    'jll': 80,
    'sbi-life-insurance-company': 95,
    'grasim-industries': 95,
    'home-credit-finance': 80,
    'optum': 100,
    'i-process-services': 55,
    'jcb': 95,
    'biocon': 90,
    'berger-paints': 80,
    'blue-star': 90,
    'dabur': 95,
    'vishal-mega-mart': 75,
    'force-motors': 80,
    'alembic-pharmaceuticals': 90,
    'microland': 65,
    'fortis-healthcare': 95,
    'conneqt-business-solutions': 75,
    'hcl-infosystems': 75,
    'natwest-group': 100,
    'dtdc-express': 75,
    'itc-hotels': 80,
    'mahindra-&-mahindra': 100,
    'reliance-trends': 80,
}


In [42]:
# Revised company-slug -> integer-score lookup with the same shape as
# `company_hierarchy`; a number of scores differ from that dictionary
# (for example 'itc' is 85 here versus 80 there, and 'siemens' 80 versus 85).
# NOTE(review): it appears to list the same companies as `company_hierarchy`,
# and neither dictionary is referenced by the code visible in this part of
# the notebook — confirm which one is authoritative.
company_hierarchy_updated = {
    'reliance-nippon-life-insurance': 90,
    'dcb-bank': 50,
    'itc': 85,
    'lifestyle': 45,
    'reckitt-benckiser': 90,
    'siemens': 80,
    'bt-global-services': 70,
    'nokia-networks': 65,
    'indian-oil-corporation': 100,
    'exide-life-insurance': 55,
    'au-small-finance-bank': 40,
    'wns': 35,
    'arcelormittal-nippon-steel': 90,
    'exotic-learning': 25,
    'capgemini-engineering': 65,
    'fullerton-india': 45,
    'indusind-bank': 80,
    'tvs-credit': 55,
    'samvardhana-motherson-group': 75,
    'dalmia-bharat-cement': 95,
    'sobha': 90,
    'jubilant-foods-works': 45,
    'cisco': 100,
    'tractors-and-farm-equipment': 90,
    'tata-motors-finance': 75,
    '3i-infotech': 35,
    'cams': 50,
    'intelenet-global-services': 55,
    'icici-prudential-life-insurance': 85,
    'icici-bank': 95,
    'infosys': 95,
    'poonawalla-fincorp': 65,
    'ford-motor': 95,
    'dilip-buildcon': 80,
    'metro': 85,
    'bank-of-america': 100,
    'act-fibernet': 55,
    'wipro-consumer-care-&-lighting': 60,
    'iks-health': 85,
    'jsw-steel': 95,
    'asian-paints': 100,
    'synechron': 75,
    'lenskart': 55,
    'sify-technologies': 55,
    'ust': 80,
    'ntt-data': 75,
    'xl-dynamics': 70,
    'cg-power-and-industrial-solutions': 75,
    'the-himalaya-drug-company': 60,
    'kec-international': 70,
    'birlasoft': 60,
    "domino's-pizza": 45,
    'igt-solutions': 50,
    'vedantu': 60,
    'bharat-electronics': 95,
    'eureka-forbes': 70,
    'state-bank-of-india': 95,
    'tata-elxsi': 80,
    'birla-sun-life-insurance': 85,
    'parle-products': 60,
    'reliance-industries': 100,
    'utkarsh-small-finance-bank': 40,
    'intas-pharmaceuticals': 95,
    'bureau-veritas': 80,
    'jmc-projects': 70,
    'paras-hospital': 85,
    'blue-dart-express': 75,
    'rivigo': 50,
    'cogent-e-services': 60,
    'reliance-general-insurance': 80,
    'jll': 75,
    'sbi-life-insurance-company': 95,
    'grasim-industries': 95,
    'home-credit-finance': 70,
    'optum': 100,
    'i-process-services': 50,
    'jcb': 95,
    'biocon': 85,
    'berger-paints': 80,
    'blue-star': 85,
    'dabur': 90,
    'vishal-mega-mart': 65,
    'force-motors': 80,
    'alembic-pharmaceuticals': 90,
    'microland': 55,
    'fortis-healthcare': 95,
    'conneqt-business-solutions': 60,
    'hcl-infosystems': 65,
    'natwest-group': 100,
    'dtdc-express': 60,
    'itc-hotels': 75,
    'mahindra-&-mahindra': 100,
    'reliance-trends': 75,
}


In [15]:
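# NOTE: this entire cell is commented out and would not run as written:
# PCA, GaussianNB, XGBRegressor and KNeighborsRegressor are not imported in
# the imports cell; PCA (a transformer) and GaussianNB (a classifier) are not
# regressors that can be scored with neg_mean_squared_error; and
# OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2.
# # Preprocess categorical features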
# categorical_features = ["Job Profile", "Company"]
# X_train_cat = X_train[categorical_features]
# X_test_cat = X_test[categorical_features]

# imputer = SimpleImputer(strategy='constant', fill_value='missing')
# encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# X_train_cat_encoded = encoder.fit_transform(imputer.fit_transform(X_train_cat))
# X_test_cat_encoded = encoder.transform(imputer.transform(X_test_cat))

# # Preprocess numerical features
# numeric_features = ["ExperienceRequired"]
# X_train_num = X_train[numeric_features]
# X_test_num = X_test[numeric_features]

# scaler = StandardScaler()
# X_train_num_scaled = scaler.fit_transform(X_train_num)
# X_test_num_scaled = scaler.transform(X_test_num)

# # Concatenate encoded categorical features and scaled numerical features
# X_train_processed = pd.concat([pd.DataFrame(X_train_cat_encoded), pd.DataFrame(X_train_num_scaled)], axis=1)
# X_test_processed = pd.concat([pd.DataFrame(X_test_cat_encoded), pd.DataFrame(X_test_num_scaled)], axis=1)

# # Define models to be tested
# models = [
#     ('Linear Regression', LinearRegression()),
#     ('Random Forest', RandomForestRegressor()),
#     ('SVR', SVR()),
#     ('PCA', PCA()),
#     ('Naive Bayes', GaussianNB()),
#     ('XGBoost', XGBRegressor()),
#     ('KNN', KNeighborsRegressor())
# ]

# # Fine-tune models and check accuracy using GridSearchCV
# for name, model in models:
#     param_grid = {}
#     if name == 'Random Forest':
#         param_grid = {
#             'n_estimators': [100],
#             'max_depth': [None, 10]
#         }
#     elif name == 'SVR':
#         param_grid = {
#             'kernel': ['linear', 'rbf'],
#             'C': [0.1, 1]
#         }
#     elif name == 'PCA':
#         param_grid = {
#             'n_components': [None, 5]
#         }
#     elif name == 'XGBoost':
#         param_grid = {
#             'n_estimators': [100],
#             'max_depth': [3, 5]
#         }
#     elif name == 'KNN':
#         param_grid = {
#             'n_neighbors': [3, 5],
#             'weights': ['uniform', 'distance']
#         }

#     grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
#     grid_search.fit(X_train_processed, y_train)

#     print(f"Best parameters for {name}: {grid_search.best_params_}")
#     print(f"Best score for {name}: {-grid_search.best_score_}")

#     # Evaluate model on test data
#     y_pred = grid_search.predict(X_test_processed)
#     mse = mean_squared_error(y_test, y_pred)
#     rmse = mse ** 0.5
#     print(f"RMSE for {name}: {rmse}")