<a href="https://colab.research.google.com/github/eaedk/salary_prediction/blob/master/salary_prediction_standalone_001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation and Imports

In [1]:
import pandas as pd 
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Data Loading and EDA

In [2]:
# Load the raw salary dataset; the relative path is resolved against the
# current working directory.
train = pd.read_csv("salary.csv")

In [3]:
# Column dtypes and non-null counts of the raw data (before imputation).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       15 non-null     object 
 1   test_score       19 non-null     float64
 2   interview_score  19 non-null     float64
 3   Salary           20 non-null     int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 768.0+ bytes


In [4]:
# Number of missing values per column.
train.isna().sum()

experience         5
test_score         1
interview_score    1
Salary             0
dtype: int64

In [5]:
# Target column, plus the columns that must be kept out of the feature set
# (currently only the target itself).
y_col = 'Salary'
useless_cols = [y_col]

In [6]:
# Split the candidate feature columns (everything not listed in
# `useless_cols`) by dtype: non-numeric ones are treated as categorical.
cat_cols = (
    train.loc[:, ~train.columns.isin(useless_cols)]
    .select_dtypes(exclude='number')
    .columns.tolist()
)
num_cols = (
    train.loc[:, ~train.columns.isin(useless_cols)]
    .select_dtypes(include='number')
    .columns.tolist()
)
print(f"n categorical columns: {len(cat_cols)} \nn numerical columns: {len(num_cols)} ")

n categorical columns: 1 
n numerical columns: 2 


# Preprocessing and Feature Engineering

In [7]:
# Categorical gaps are filled with the most frequent value of each column,
# numerical gaps with the column mean.
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="mean")

In [8]:
# Fit each imputer only when it has columns to learn from: fitting on a
# zero-column frame raises, and the transform step is guarded the same way.
if len(cat_cols) > 0:
    cat_imputer.fit(train[cat_cols])
if len(num_cols) > 0:
    num_imputer.fit(train[num_cols])

SimpleImputer()

In [9]:
# Fill the missing values in place with the fitted imputers; a column group
# with no columns is skipped.
if cat_cols:
    train[cat_cols] = cat_imputer.transform(train[cat_cols])
if num_cols:
    train[num_cols] = num_imputer.transform(train[num_cols])

In [10]:
# Re-check the non-null counts after imputation.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       20 non-null     object 
 1   test_score       20 non-null     float64
 2   interview_score  20 non-null     float64
 3   Salary           20 non-null     int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 768.0+ bytes


In [11]:
# One-hot encoder for the categorical columns.
# - Dense output: the keyword was renamed from `sparse` to `sparse_output` in
#   scikit-learn 1.2 and the old name was later removed, so try the new
#   spelling first and fall back to the old one on older releases.
# - handle_unknown="ignore": a category that was not seen during fit is encoded
#   as all zeros at transform time instead of raising, which matters once the
#   fitted encoder is reused on new data.
try:
    encoder = OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(drop="if_binary", sparse=False, handle_unknown="ignore")
if len(cat_cols) > 0:
    encoder.fit(train[cat_cols])

In [12]:
# Start from an empty list so that `encoded_cols` always exists, even when the
# dataset has no categorical columns; later cells concatenate it to `num_cols`.
encoded_cols = []
if len(cat_cols) > 0:
    # Column names come from the fitted encoder, in the same order as the
    # columns of the transformed array.
    encoded_cols = list(encoder.get_feature_names_out())
    train[encoded_cols] = encoder.transform(train[cat_cols])

In [13]:
# Final feature list: numerical columns followed by the one-hot encoded columns.
useful_cols = num_cols+encoded_cols

In [14]:
# Preview the feature matrix (not yet scaled).
train[useful_cols].head()

Unnamed: 0,test_score,interview_score,experience_eleven,experience_fifteen,experience_five,experience_four,experience_one,experience_six,experience_ten,experience_thirteen,experience_three,experience_twelve
0,8.0,8.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,9.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
# Standardise every feature column (numerical and one-hot encoded alike).
# NOTE(review): the scaler is fitted on all rows of `train`, i.e. before the
# train/validation split, so the validation rows contribute to the fitted
# statistics — consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(train[useful_cols])

StandardScaler()

# ML Modeling

In [16]:
# Hold out 33% of the rows for validation with a fixed seed. A numeric target
# is split without stratification; any other target is stratified on itself.
X_train, X_valid, y_train, y_valid = train_test_split(
    scaler.transform(train[useful_cols]),
    train[y_col].values,
    test_size=0.33,
    random_state=42,
    stratify=None if pd.api.types.is_numeric_dtype(train[y_col]) else train[y_col],
)

In [17]:
# Gradient boosting regressor: 50 boosting stages, fixed seed.
model_gb = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    random_state=10,
)

In [18]:
# Train the model on the training split.
model_gb.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=50, random_state=10)

In [19]:
# Mean squared error on the validation split; the value is in squared units
# of the target, so it is not directly comparable to a salary amount.
mean_squared_error(y_valid,model_gb.predict(X_valid))

34543190.31452102

# Export

In [20]:
import pickle

In [21]:
# function to use later in an app 
def apply_processing(dataframe,cateogrical_imputer,
                     numerical_imputer,encoder,scaler,
            numerical_cols=["name_numerical_col_001"],
            categorical_cols=["name_categorical_col_001"]):
    """Apply the fitted preprocessing and feature-engineering steps to a dataframe.

    The steps run in this order: impute the categorical columns, impute the
    numerical columns, one-hot encode the categorical columns, then scale the
    numerical columns together with the encoded ones. The input dataframe is
    never modified; all work happens on a copy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing at least ``numerical_cols`` and ``categorical_cols``.
    cateogrical_imputer : object with ``transform``
        Fitted imputer for the categorical columns (the misspelled name is kept
        for backward compatibility). Not used when ``categorical_cols`` is empty.
    numerical_imputer : object with ``transform``
        Fitted imputer for the numerical columns. Not used when
        ``numerical_cols`` is empty.
    encoder : object with ``get_feature_names_out`` and ``transform``
        Fitted encoder for the categorical columns. Not used when
        ``categorical_cols`` is empty.
    scaler : object with ``transform``
        Fitted scaler applied to the numerical columns followed by the encoded
        columns, in that order.
    numerical_cols, categorical_cols : sequence of column names
        Any iterable of column names (list, tuple, ``pandas.Index``). Either
        one may be empty, but not both.

    Returns
    -------
    pandas.DataFrame
        The scaled feature columns: ``numerical_cols`` followed by the encoded
        column names reported by ``encoder``.

    Raises
    ------
    ValueError
        If both column sequences are empty.
    KeyError
        If a requested column is missing from ``dataframe``.
    """
    # Normalise to fresh lists: this never aliases the mutable defaults and
    # makes `+` below a concatenation whatever sequence type the caller passed.
    numerical_cols = list(numerical_cols)
    categorical_cols = list(categorical_cols)

    if not numerical_cols and not categorical_cols:
        raise ValueError("At least one numerical or categorical column is required")

    missing_cols = [col for col in numerical_cols + categorical_cols
                    if col not in dataframe.columns]
    if missing_cols:
        raise KeyError(f"Columns missing from the input dataframe: {missing_cols}")

    df_ = dataframe.copy()

    # Each step is skipped for an empty column group, mirroring the guards used
    # when the objects were fitted (an unused object may then be unfitted).
    if categorical_cols:
        df_[categorical_cols] = cateogrical_imputer.transform(df_[categorical_cols])
    if numerical_cols:
        df_[numerical_cols] = numerical_imputer.transform(df_[numerical_cols])

    if categorical_cols:
        encoded_cols = list(encoder.get_feature_names_out())
        df_[encoded_cols] = encoder.transform(df_[categorical_cols])
    else:
        encoded_cols = []

    useful_cols = numerical_cols + encoded_cols
    df_[useful_cols] = scaler.transform(df_[useful_cols])

    return df_[useful_cols]

In [22]:
# Bundle everything `apply_processing` needs besides the dataframe. The keys
# match that function's parameter names exactly (including the misspelled
# `cateogrical_imputer`), so they must not be "corrected" here alone.
objs_for_processing = {
    "cateogrical_imputer": cat_imputer,
    "numerical_imputer": num_imputer,
    "encoder": encoder,
    "scaler": scaler,
    "numerical_cols": num_cols,
    "categorical_cols": cat_cols,
}

In [23]:
# Persist the preprocessing objects (one dict) to disk with the highest
# pickle protocol available. Only load this file back from a trusted source.
with open('processing.pkl', mode='wb') as f:
    pickle.dump(objs_for_processing, f, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Persist the trained model itself (the estimator object, not a dict) to disk
# with the highest pickle protocol available.
with open('model.pkl', mode='wb') as f:
    pickle.dump(model_gb, f, protocol=pickle.HIGHEST_PROTOCOL)