In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=d152b0496c997fe5c240ca04a2415d3da18aa3ffe717ab4b7b7940cc527de8b4
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [22]:
import os
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType
from pyspark.sql.functions import split, count, when, isnan, col, regexp_replace
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [23]:
spark = SparkSession.builder.appName('First Session').getOrCreate()

print('Spark Version: {}'.format(spark.version))

Spark Version: 3.5.0


In [29]:
schema = StructType([StructField('Jabatan', StringType(), nullable = True),
                     StructField('Perusahaan', StringType(), nullable = True),
                     StructField('Alamat', StringType(), nullable = True),
                     StructField('Gaji', StringType(), nullable = True),
                     StructField('Kategori', StringType(), nullable = True)])
file_path = 'jobstreet.csv'

df = spark.read.csv(file_path,
                    header = True,
                    inferSchema = True,
                    nanValue = '?')

df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             Jabatan|          Perusahaan|              Alamat|                Gaji|            Kategori|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Fullstack Develop...|PT Karisma Zona K...|Jakarta Selatan, ...|Rp 10,000,000 – R...|subClassification...|
|Admin Sales & Mar...|    PT Bimaruna Jaya|Jakarta Selatan, ...|                NULL|subClassification...|
|Microsoft Dynamic...|PT SAGLOBAL INDON...|Jakarta Barat, Ja...|                NULL|subClassification...|
|Personal Assistan...|PT Crypto Canary ...| Tebet, Jakarta Raya|Rp 6,500,000 – Rp...|subClassification...|
|  Back End Developer|      PT Star Cosmos|Jakarta Barat, Ja...|                NULL|subClassification...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [30]:
def check_missing(dataframe):

    return dataframe.select([count(when(isnan(c) | col(c).isNull(), c)). \
                             alias(c) for c in dataframe.columns]).show()

check_missing(df)

+-------+----------+------+----+--------+
|Jabatan|Perusahaan|Alamat|Gaji|Kategori|
+-------+----------+------+----+--------+
|      0|        38|     0| 749|       0|
+-------+----------+------+----+--------+



In [31]:
df = df.na.drop()

df = df.withColumn("Perusahaan", df["Perusahaan"].cast(StringType()))

df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             Jabatan|          Perusahaan|              Alamat|                Gaji|            Kategori|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Fullstack Develop...|PT Karisma Zona K...|Jakarta Selatan, ...|Rp 10,000,000 – R...|subClassification...|
|Personal Assistan...|PT Crypto Canary ...| Tebet, Jakarta Raya|Rp 6,500,000 – Rp...|subClassification...|
|Project Secretary...|PT Harrisma Infor...|Jakarta Barat, Ja...|Rp 14,000,000 – R...|subClassification...|
|   Staf Admin Gudang|PT.SUNGAI PANJANG...|Cikarang Utara, J...|Rp 3.500.000 – Rp...|subClassification...|
|          Programmer|PT Microvac Indon...|Jakarta Timur, Ja...|Rp 5,500,000 – Rp...|subClassification...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [32]:
df = df.na.drop()

df = df.withColumn("Gaji", df["Gaji"].cast(StringType()))

df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             Jabatan|          Perusahaan|              Alamat|                Gaji|            Kategori|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Fullstack Develop...|PT Karisma Zona K...|Jakarta Selatan, ...|Rp 10,000,000 – R...|subClassification...|
|Personal Assistan...|PT Crypto Canary ...| Tebet, Jakarta Raya|Rp 6,500,000 – Rp...|subClassification...|
|Project Secretary...|PT Harrisma Infor...|Jakarta Barat, Ja...|Rp 14,000,000 – R...|subClassification...|
|   Staf Admin Gudang|PT.SUNGAI PANJANG...|Cikarang Utara, J...|Rp 3.500.000 – Rp...|subClassification...|
|          Programmer|PT Microvac Indon...|Jakarta Timur, Ja...|Rp 5,500,000 – Rp...|subClassification...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [33]:
df.columns

['Jabatan', 'Perusahaan', 'Alamat', 'Gaji', 'Kategori']

In [34]:
df.toPandas().head()

Unnamed: 0,Jabatan,Perusahaan,Alamat,Gaji,Kategori
0,Fullstack Developer (.Net + react.js),PT Karisma Zona Kreatifku,"Jakarta Selatan, Jakarta Raya","Rp 10,000,000 – Rp 12,000,000 per month",subClassification: Developer/ProgrammerDevelop...
1,Personal Assistant to the CEO,PT Crypto Canary Network,"Tebet, Jakarta Raya","Rp 6,500,000 – Rp 8,000,000 per month","subClassification: Asisten Pribadi, Asisten Ek..."
2,Project Secretary - Mandarin Speaker,PT Harrisma Informatika Jaya,"Jakarta Barat, Jakarta Raya","Rp 14,000,000 – Rp 20,000,000 per month","subClassification: Asisten Pribadi, Asisten Ek..."
3,Staf Admin Gudang,PT.SUNGAI PANJANG ADAMAS,"Cikarang Utara, Jawa Barat",Rp 3.500.000 – Rp 4.500.000 per month,subClassification: Entri Data & Pengolahan Kat...
4,Programmer,PT Microvac Indonesia,"Jakarta Timur, Jakarta Raya","Rp 5,500,000 – Rp 8,000,000 per month",subClassification: Developer/ProgrammerDevelop...


In [35]:
df.printSchema()

root
 |-- Jabatan: string (nullable = true)
 |-- Perusahaan: string (nullable = true)
 |-- Alamat: string (nullable = true)
 |-- Gaji: string (nullable = true)
 |-- Kategori: string (nullable = true)



In [37]:
for jobs in df.head(5):
    print(jobs, '\n')

Row(Jabatan='Fullstack Developer (.Net + react.js)', Perusahaan='PT Karisma Zona Kreatifku', Alamat='Jakarta Selatan, Jakarta Raya', Gaji='Rp\xa010,000,000 – Rp\xa012,000,000 per month', Kategori='subClassification: Developer/ProgrammerDeveloper/Programmerclassification: Teknologi Informasi & Komunikasi(Teknologi Informasi & Komunikasi)') 

Row(Jabatan='Personal Assistant to the CEO', Perusahaan='PT Crypto Canary Network', Alamat='Tebet, Jakarta Raya', Gaji='Rp\xa06,500,000 – Rp\xa08,000,000 per month', Kategori='subClassification: Asisten Pribadi, Asisten Eksekutif & SekretarialAsisten Pribadi, Asisten Eksekutif & Sekretarialclassification: Administrasi & Dukungan Perkantoran(Administrasi & Dukungan Perkantoran)') 

Row(Jabatan='Project Secretary - Mandarin Speaker', Perusahaan='PT Harrisma Informatika Jaya', Alamat='Jakarta Barat, Jakarta Raya', Gaji='Rp\xa014,000,000 – Rp\xa020,000,000 per month', Kategori='subClassification: Asisten Pribadi, Asisten Eksekutif & SekretarialAsisten P

In [38]:
df.describe().show()

+-------+--------+--------------------+--------------------+--------------------+--------------------+
|summary| Jabatan|          Perusahaan|              Alamat|                Gaji|            Kategori|
+-------+--------+--------------------+--------------------+--------------------+--------------------+
|  count|     357|                 357|                 357|                 357|                 357|
|   mean|    NULL|                NULL|                NULL|                NULL|                NULL|
| stddev|    NULL|                NULL|                NULL|                NULL|                NULL|
|    min|   ADMIN|Anugerah Dutanusa...|Bekasi Selatan, J...|IDR 4,000,000 - 4...|subClassification...|
|    max|中文助理|           rootcloud|   Teluknaga, Banten|Rp 9.000.000 – Rp...|subClassification...|
+-------+--------+--------------------+--------------------+--------------------+--------------------+



In [39]:
df.describe(['Perusahaan', 'Gaji']).show()

+-------+--------------------+--------------------+
|summary|          Perusahaan|                Gaji|
+-------+--------------------+--------------------+
|  count|                 357|                 357|
|   mean|                NULL|                NULL|
| stddev|                NULL|                NULL|
|    min|Anugerah Dutanusa...|IDR 4,000,000 - 4...|
|    max|           rootcloud|Rp 9.000.000 – Rp...|
+-------+--------------------+--------------------+



In [40]:
def get_num_cols(dataframe):

    num_cols = [col for col in dataframe.columns if dataframe.select(col). \
                dtypes[0][1] in ['double', 'int']]

    return num_cols

num_cols = get_num_cols(df)

df.describe(num_cols).show()

+-------+--------+--------------------+--------------------+--------------------+--------------------+
|summary| Jabatan|          Perusahaan|              Alamat|                Gaji|            Kategori|
+-------+--------+--------------------+--------------------+--------------------+--------------------+
|  count|     357|                 357|                 357|                 357|                 357|
|   mean|    NULL|                NULL|                NULL|                NULL|                NULL|
| stddev|    NULL|                NULL|                NULL|                NULL|                NULL|
|    min|   ADMIN|Anugerah Dutanusa...|Bekasi Selatan, J...|IDR 4,000,000 - 4...|subClassification...|
|    max|中文助理|           rootcloud|   Teluknaga, Banten|Rp 9.000.000 – Rp...|subClassification...|
+-------+--------+--------------------+--------------------+--------------------+--------------------+

