#5. Salary Prediction Project - Web app with Streamlit
by: Cindy Suyitno

##5.1. Web app for Salary Prediction in Indonesia


Files that need to be uploaded in this google collab are:
- saved_province.pkl
- indonesia-province-jml-penduduk.json (from https://www.kaggle.com/code/farizdarari/simple-map-visualization-using-geopandas/data)
- saved_model.pkl

Files that we will get from this google collab are:
- saved_map.pkl (will be used in the web app; check Geopandas part first)
- app.py

In [3]:
#libraries that are not in google colab, need to be installed first
!pip install -q streamlit
!pip install pyngrok
!pip install geopandas

[K     |████████████████████████████████| 9.1 MB 4.8 MB/s 
[K     |████████████████████████████████| 181 kB 51.6 MB/s 
[K     |████████████████████████████████| 111 kB 54.7 MB/s 
[K     |████████████████████████████████| 164 kB 57.9 MB/s 
[K     |████████████████████████████████| 78 kB 6.9 MB/s 
[K     |████████████████████████████████| 232 kB 60.4 MB/s 
[K     |████████████████████████████████| 4.3 MB 45.3 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[K     |████████████████████████████████| 133 kB 58.2 MB/s 
[K     |████████████████████████████████| 428 kB 47.2 MB/s 
[K     |████████████████████████████████| 132 kB 62.8 MB/s 
[K     |████████████████████████████████| 793 kB 48.8 MB/s 
[K     |████████████████████████████████| 381 kB 61.6 MB/s 
[K     |████████████████████████████████| 51 kB 7.2 MB/s 
[?25h  Building wheel for blinker (setup.py) ... [?25l[?25hdone
  Building wheel for validators (setup.py) ... [?25l[?25hdone
[31mERROR: pip's depe

In [18]:
%%writefile app.py
#^ to write/rewrite app.py everytime this cell runs

#importing libraries
import streamlit as st
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd

#opening saved model
with open('saved_model.pkl','rb') as file:
  data = pickle.load(file)

model = data['model']
job_label = data['job_label']
province_label = data['province_label']
years = data['years']
pol_reg = data['pol_reg']
sc = data['sc']
df = data['df']

st.title('Salary Prediction in Indonesia')
st.write('''### Please input some information as follow:''')

job_list = ['Factory and Manufacturer',
            'Sales',
            'Internet & New Media',
            'Accounting, Finance, Banking',
            'Art & Design',
            'Business',
            'Hospitality & Travel',
            'Administration',
            'Marketing',
            'Human Resources & General Affair',
            'Data Analyst & Data Science',
            'Research & Development'
            'Public Relation']
job = st.selectbox('Job Type',job_list)

province_list = ['Jakarta','Jawa','Sumatera','Kalimantan','Sulawesi','Bali dan Nusa Tenggara','Other']
province = st.selectbox('Province',province_list)

experience = st.slider('Years of experience', 0, 20, 2)

start = st.button('Calculate Salary')

if start:
  X = pd.DataFrame([[job,experience,province]])
  X[0][0] = job_label.transform(np.array(X[0][0]).reshape(-1,1))
  X[2][0] = province_label.transform(np.array(X[2][0]).reshape(-1,1))
  X.astype(float)
  salary = sc.inverse_transform(model.predict(X).reshape(-1,1))
  st.subheader(f'The estimated salary is IDR{salary[0][0]:.0f}')

  df_new = df.loc[(df['Job'] == job)]

  st.write('\n')
  st.write('''\n See the scatter of your job per province:''') #i use DataCleaned for province records
  
  with open('saved_map.pkl','rb') as file:
    data = pickle.load(file)
  
  df_prov = data['prov']
  df_geo = data['geo']

  df = df_prov[df_prov['Job']==job]
  df1 = df['Propinsi'].value_counts().rename_axis('Propinsi').reset_index(name='counts')
  df2 = pd.merge(df_geo[['Propinsi','geometry']], df1, on ='Propinsi', how ='left')
  df2 = df2.replace(np.nan,0)

  fig = plt.figure(figsize=(10, 6))
  ax = fig.add_subplot()
  df2.plot(column='counts', ax=ax, legend=True, legend_kwds={'label': "Number of Job Vacancy in the Province", 'orientation': "horizontal"})
  st.pyplot(fig) 
  st.write('\n')

  st.write('''\n Mean salary per province and the comparison with regional minimum wage (UMR):''')
  
  fgr = plt.figure(figsize=(10, 6))
  ax = fgr.add_subplot()   
  
  colors = ['#ff9f1c','#ffbf69','#f2cc8f','#98c1d9','#2ec4b6','#43aa8b','#a3b18a']
  
  dict_UMR = {'Bali dan Nusa Tenggara': [2209294,'2,209,294'],'Jakarta':[4416186,'4.416.186'],'Jawa': [1940821, '1.940.821'] ,'Kalimantan': [2832495,'2,832,495'],
            'Other': [2994448,'2,994,448'], 'Sulawesi': [2748425,'2,748,425'],'Sumatera': [2747452,'2.747.452']}
  x_label = list(dict_UMR.keys())
  
  for x in x_label:
    if x not in df_new['Province'].unique():
      x_label.remove(x)
  
  ax.bar(x_label, df_new.groupby(['Province'])['Salary'].mean(), color = colors[0:len(x_label)])
  
  UMR_list = []
  UMR_label = []
  for x in x_label:
    UMR_list.append(dict_UMR[x][0])
    UMR_label.append(dict_UMR[x][1])
  
  ax.plot(x_label, UMR_list, label = 'Rata-rata UMR 2021', color = 'k', linewidth = 3)
  for i in range(len(x_label)):
    plt.text(x=i,y=UMR_list[i]+1100000, s=str(UMR_label[i]), size=12, ha='center', va='center')
  
  plt.ylabel('Mean Salary in 10 million IDR', fontsize = 14)
  ax.tick_params(axis='x', labelrotation=45, labelsize=14)
  ax.tick_params(axis='y', labelsize=14)
  ax.set_title('Mean Salary per Province in Indonesia', fontsize=15, fontweight='bold')
  plt.legend()
  
  st.pyplot(fgr) 
  st.write('\n')

  st.write('''Mean salary based on years of experience:''')
  
  figure = plt.figure(figsize=(10, 6))
  ax = figure.add_subplot()
  
  years_list = [0, 0.5, 2, 4, 7.5, 15]
  
  ax.plot(years_list,df_new.groupby(['Years'])['Salary'].mean())
  plt.ylabel('Mean Salary in 10 million IDR', fontsize = 14)
  plt.xlabel('Years of experience', fontsize = 14)
  ax.tick_params(axis='x', labelsize=14)
  ax.tick_params(axis='y', labelsize=14)
  ax.set_title('Mean Salary per Years of Experience', fontsize=15, fontweight='bold')
  
  st.pyplot(figure)
  st.write('\n')

Overwriting app.py


In [19]:
#running to local tunnel
!streamlit run app.py & npx localtunnel --port 8501

2022-06-20 03:00:33.747 INFO    numexpr.utils: NumExpr defaulting to 2 threads.
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.237.249.249:8501[0m
[0m
[K[?25hnpx: installed 22 in 2.8s
your url is: https://deep-bananas-train-35-237-249-249.loca.lt
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[0][0] = job_label.transform(np.array(X[0][0]).reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[2][0] = province_label.transform(np.array(X[2][0]).reshape(-1,1))
  y = column_or_1d(y, warn=True)
A value is 

##5.2 Importing Geomaps for Indonesia Provinces

In [None]:
#libraries used
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd

df_geo = gpd.read_file('/content/indonesia-province-jml-penduduk.json.zip')

#changing some province names so that they are aligned with our data
df_geo['Propinsi'] = df_geo['Propinsi'].str.title()
df_geo['Propinsi'] = df_geo['Propinsi'].replace('Dki Jakarta','Jakarta')
df_geo['Propinsi'] = df_geo['Propinsi'].replace('Irian Jaya Timur','Papua')
df_geo['Propinsi'] = df_geo['Propinsi'].replace('Nusatenggara Barat','NTB')
df_geo['Propinsi'] = df_geo['Propinsi'].replace('Daerah Istimewa Yogyakarta','Yogyakarta')
df_geo['Propinsi'] = df_geo['Propinsi'].replace('Nusa Tenggara Timur','NTT')
df_geo['Propinsi'] = df_geo['Propinsi'].replace('Di. Aceh','Aceh')

In [None]:
df_geo

Unnamed: 0,ID,kode,Propinsi,SUMBER,Jumlah Penduduk,geometry
0,1,85,Papua,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,1416690,"MULTIPOLYGON (((137.91666 -1.49852, 137.89260 ..."
1,2,52,NTB,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,4500212,"MULTIPOLYGON (((117.62720 -8.50640, 117.62630 ..."
2,3,75,Gorontalo,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,1040164,"MULTIPOLYGON (((122.18814 1.04530, 122.18883 1..."
3,4,74,Sulawesi Tenggara,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,2232586,"MULTIPOLYGON (((120.98423 -2.83534, 120.98502 ..."
4,5,34,Yogyakarta,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,3457491,"POLYGON ((110.01183 -7.88690, 110.01330 -7.885..."
5,6,3329,Jawa Tengah,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,32382657,"MULTIPOLYGON (((108.82934 -6.74608, 108.83061 ..."
6,7,36,Probanten,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,10632166,"MULTIPOLYGON (((106.72134 -6.09000, 106.71294 ..."
7,8,35,Jawa Timur,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,37476757,"MULTIPOLYGON (((111.69460 -6.75286, 111.69649 ..."
8,9,81,Maluku Utara,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,1038087,"MULTIPOLYGON (((127.74207 0.81629, 127.74280 0..."
9,10,82,Maluku,Peta Dasar BAKOSURTANAL Skala 1 : 250.000,1533506,"MULTIPOLYGON (((130.83130 -3.47141, 130.83050 ..."


In [None]:
#now our own data, will use the job and province
with open('saved_province.pkl','rb') as file:
  data = pickle.load(file)

df_prov = pd.concat([data['job'],data['province']],axis=1) 
df_prov.columns = ['Job','Propinsi'] #aligning the column name as df_geo
df_prov

Unnamed: 0,Job,Propinsi
0,Art & Design,Jakarta
1,"Accounting, Finance, Banking",Jakarta
2,Sales,Jakarta
3,Sales,Jawa Barat
4,Factory and Manufacturer,Jakarta
...,...,...
7077,Factory and Manufacturer,Yogyakarta
7078,Art & Design,Jakarta
7079,"Accounting, Finance, Banking",Jakarta
7080,Internet & New Media,Jawa Barat


In [None]:
#save the dataframes
import pickle
data = {'prov': df_prov,'geo':df_geo}
with open('saved_map.pkl','wb') as file:
  pickle.dump(data, file)