In [30]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

In [3]:
salary_ds = pd.read_csv('Salary Dataset.csv', index_col=0)

In [4]:
salary_ds.head()

Unnamed: 0_level_0,Job Title,Salaries Reported,Location,Salary
Company Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"


In [5]:
# Since salary has text in it, Im going to get rid of that
salary_ds['Salary'] = salary_ds['Salary'].apply(lambda x:x.split(' ')[0])

In [6]:
for ls in ['₹', '/yr', ',']:
    for i in salary_ds['Salary']:
        if ls in i:
            salary_ds['Salary'] = i.replace(ls, '')

In [7]:
for i in salary_ds['Salary']:
    salary_ds['Salary'] = int(i)

In [8]:
salary_ds['Salary'].dtype

dtype('int64')

In [9]:
salary_ds.head()

Unnamed: 0_level_0,Job Title,Salaries Reported,Location,Salary
Company Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mu Sigma,Data Scientist,105.0,Bangalore,939843
IBM,Data Scientist,95.0,Bangalore,939843
Tata Consultancy Services,Data Scientist,66.0,Bangalore,939843
Impact Analytics,Data Scientist,40.0,Bangalore,939843
Accenture,Data Scientist,32.0,Bangalore,939843


In [10]:
cat_df = salary_ds.select_dtypes(include=object)
num_ds = salary_ds.select_dtypes(include=np.number)


In [11]:
ordinal = OrdinalEncoder()

In [12]:
cat_cols = cat_df.columns.tolist()
ordinal.fit(cat_df[cat_cols])

OrdinalEncoder()

In [13]:
cat_o = ordinal.transform(cat_df[cat_cols])
cat_o = pd.DataFrame(cat_o, columns=cat_cols)
cat_df.reset_index(inplace=True, drop=True)


In [14]:
num_ds.reset_index(inplace=True, drop=True)
cat_o.reset_index(inplace=True, drop=True)

salary_df = pd.concat([num_ds, cat_o], axis=1)
salary_df

Unnamed: 0,Salaries Reported,Salary,Job Title,Location
0,105.0,939843,8.0,0.0
1,95.0,939843,8.0,0.0
2,66.0,939843,8.0,0.0
3,40.0,939843,8.0,0.0
4,32.0,939843,8.0,0.0
...,...,...,...,...
4339,1.0,939843,21.0,2.0
4340,1.0,939843,18.0,2.0
4341,1.0,939843,18.0,2.0
4342,1.0,939843,25.0,2.0


In [15]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4344 entries, 0 to 4343
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Salaries Reported  4342 non-null   float64
 1   Salary             4344 non-null   int64  
 2   Job Title          4344 non-null   float64
 3   Location           4344 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 135.9 KB


In [16]:
salary_df = salary_df.fillna(method='bfill', axis=0).fillna(0)

In [17]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4344 entries, 0 to 4343
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Salaries Reported  4344 non-null   float64
 1   Salary             4344 non-null   int64  
 2   Job Title          4344 non-null   float64
 3   Location           4344 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 135.9 KB


In [18]:
X = salary_df.drop(['Salary'], axis=1)
y = salary_df['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
pipe = Pipeline([('StandardScaler', StandardScaler()),
                ('LinearRegression', LinearRegression())])

In [22]:
pipe.fit(X_train, y_train)
y_train

2992    939843
411     939843
4212    939843
1588    939843
3460    939843
         ...  
3444    939843
466     939843
3092    939843
3772    939843
860     939843
Name: Salary, Length: 3258, dtype: int64

In [34]:
test_preds = pipe.predict(X_test)
train_preds = pipe.predict(X_train)

In [36]:
print("Coefficient of determination for test: %.2f" % r2_score(y_test, test_preds))
print("Coefficient of determination for train: %.2f" % r2_score(y_train, train_preds))

Coefficient of determination for test: 1.00
Coefficient of determination for train: 1.00


In [37]:
print("Accuracy score for test: %.2f" % accuracy_score(y_test, test_preds))
print("Accuracy score for train: %.2f" % accuracy_score(y_train, train_preds))

Accuracy score for test: 1.00
Accuracy score for train: 1.00
