In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType
import datetime
import os

# Spark standalone master the notebook attaches to.
master = "spark://zy-ubuntu:7077"

# PySpark reads PYSPARK_SUBMIT_ARGS when the JVM gateway is launched, so this
# must be set before the SparkSession is created. The argument string is
# assembled piece by piece purely for readability.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f'--master {master} '
    '--driver-memory 4g '
    '--total-executor-cores 6 '
    '--executor-memory 8g '
    '--packages org.postgresql:postgresql:42.1.1 '
    'pyspark-shell'
)

In [2]:
# Create the session for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("dim casts")
    .getOrCreate()
)

In [3]:
# Load the principals file (tab separated, first line is the header).
# No schema is supplied, so every column is read as a string.
title_principals_df = (
    spark.read
    .options(sep=r'\t', header=True)
    .csv("title.principals.tsv")
)
title_principals_df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- nconst: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job: string (nullable = true)
 |-- characters: string (nullable = true)



In [4]:
# Build one row per (title, person, character) for on-screen cast members.
#
# The category filter runs FIRST, while the `category` column still exists,
# and before the explode so that rows of other categories are never expanded.
# `title_principals_df` is left untouched so this cell can be re-run safely.
CAST_CATEGORIES = ['self', 'actor', 'actress']

title_casts_df = (
    title_principals_df
    .filter(F.col('category').isin(CAST_CATEGORIES))
    .drop('ordering', 'job', 'category')
    # `characters` looks like ["Role A","Role B"]; strip the brackets and quotes.
    # Raw string avoids Python's invalid-escape warnings for \[ and \].
    .withColumn('characters', F.regexp_replace(F.col('characters'), r'[\[\]"]', ''))
    # One output row per character.
    # NOTE(review): a character name that itself contains a comma is split
    # into several rows here - confirm whether that matters for the data.
    .withColumn('character', F.explode(F.split(F.col('characters'), ',')))
    .drop('characters')
)

+---------+---------+--------------------------------------------------------------+
|tconst   |nconst   |character                                                     |
+---------+---------+--------------------------------------------------------------+
|tt0000001|nm1588970|Self                                                          |
|tt0000005|nm0443482|Blacksmith                                                    |
|tt0000005|nm0653042|Assistant                                                     |
|tt0000007|nm0179163|\N                                                            |
|tt0000007|nm0183947|\N                                                            |
|tt0000008|nm0653028|Sneezing Man                                                  |
|tt0000009|nm0063086|Miss Geraldine Holbrook (Miss Jerry)                          |
|tt0000009|nm0183823|Mr. Hamilton                                                  |
|tt0000009|nm1309758|Chauncey Depew - the Director of the New Yor

In [5]:
# Load the people file (tab separated, with header); all columns are strings.
name_basics_raw_df = (
    spark.read
    .options(sep=r'\t', header=True)
    .csv("name.basics.tsv")
)

# '\N' is the file's marker for a missing value.
birth_year_known = F.col("birthYear") != '\\N'
death_year_known = F.col("deathYear") != '\\N'
death_year_missing = F.col("deathYear") == '\\N'

# is_alive: False when both years are present, True when only the birth year
# is present, NULL in every other case.
is_alive_col = (
    F.when(birth_year_known & death_year_known, False)
    .when(birth_year_known & death_year_missing, True)
    .otherwise(None)
)

# age = deathYear - birthYear; casting '\N' to int yields NULL, so the result
# is NULL whenever either year is missing.
age_col = F.col('deathYear').cast(IntegerType()) - F.col('birthYear').cast(IntegerType())

name_basics_df = (
    name_basics_raw_df
    .withColumnRenamed('primaryName', 'name')
    .withColumn('age', age_col)
    .withColumn('is_alive', is_alive_col)
)

# Keep only the columns used downstream.
name_basics_df_dropped = name_basics_df.drop(
    'primaryProfession', 'knownForTitles', 'birthYear', 'deathYear'
)

In [6]:
# Attach person attributes to every cast row (inner join on nconst).
casts_with_names_df = title_casts_df.join(name_basics_df_dropped, on=['nconst'])

# Turn the '\N' placeholder into a real NULL; any other value is kept as is
# (a NULL character stays NULL because the comparison is not true for it).
character_or_null = F.when(F.col('character') != '\\N', F.col('character')).otherwise(F.lit(None))
df_casts = casts_with_names_df.withColumn('character', character_or_null)

In [7]:
# Rows destined for the dim_casts table.
# df_casts has one row per (title, person, character); once `tconst` is
# removed, the same person/character pair repeats once per title. Collapse
# those repeats so each distinct combination is uploaded only once.
df_upload = df_casts.drop('tconst').dropDuplicates()

In [8]:
# Insert df_upload into the dim_casts table.
# Connection settings come from the environment so credentials do not have to
# live in the notebook; the fallbacks are the previous hardcoded values so an
# existing local setup keeps working unchanged.
# Note: mode('append') adds rows on every run of this cell.
dim_casts_write_options = {
    'url': os.environ.get('IMDB_JDBC_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    'driver': 'org.postgresql.Driver',
    'dbtable': 'dim_casts',
    'user': os.environ.get('IMDB_DB_USER', 'admin'),
    'password': os.environ.get('IMDB_DB_PASSWORD', 'password'),
}
df_upload.write.format('jdbc').options(**dim_casts_write_options).mode('append').save()

In [9]:
# Read the dim_title_desc table back from PostgreSQL.
# Connection settings come from the environment so credentials do not have to
# live in the notebook; the fallbacks are the previous hardcoded values so an
# existing local setup keeps working unchanged.
title_desc_read_options = {
    'url': os.environ.get('IMDB_JDBC_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    'driver': 'org.postgresql.Driver',
    'dbtable': 'dim_title_desc',
    'user': os.environ.get('IMDB_DB_USER', 'admin'),
    'password': os.environ.get('IMDB_DB_PASSWORD', 'password'),
}
title_desc_df = spark.read.format('jdbc').options(**title_desc_read_options).load()
title_desc_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- tconst: string (nullable = true)
 |-- type: string (nullable = true)
 |-- primary_title: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- is_adult: boolean (nullable = true)
 |-- start_year: short (nullable = true)
 |-- end_year: short (nullable = true)
 |-- runtime_minutes: short (nullable = true)
 |-- av_rating: float (nullable = true)
 |-- num_votes: integer (nullable = true)
 |-- genre_1: string (nullable = true)
 |-- genre_2: string (nullable = true)
 |-- genre_3: string (nullable = true)



In [19]:
# Lookup frame from tconst to the table's surrogate key, exposed as title_id.
title_desc_id_df = title_desc_df.select(
    F.col('id').alias('title_id'),
    F.col('tconst'),
)
title_desc_id_df.show()

+--------+----------+
|title_id|    tconst|
+--------+----------+
|  503747|tt10918034|
|  503753| tt1091911|
|  503760| tt1092006|
|  503768|tt10921098|
|  503775|tt10922526|
|  503783|tt10923786|
|  503790| tt1092631|
|  503797|tt10927160|
|  503804|tt10930466|
|  503810|tt10930772|
|  503819| tt1093524|
|  503825| tt1093577|
|  503832|tt10937558|
|  503839|tt10937888|
|  503844| tt1093927|
|  503852|tt10940834|
|  503859|tt10945092|
|  503867|tt10946648|
|  503873| tt1095402|
|  503879| tt1097256|
+--------+----------+
only showing top 20 rows



In [20]:
# Print the schemas of both frames, cast rows first, then the title lookup.
for frame_to_inspect in (df_casts, title_desc_id_df):
    frame_to_inspect.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- tconst: string (nullable = true)
 |-- character: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_alive: boolean (nullable = true)

root
 |-- title_id: integer (nullable = true)
 |-- tconst: string (nullable = true)



In [12]:
# Read the dim_casts table back from PostgreSQL (including its `id` column).
# Connection settings come from the environment so credentials do not have to
# live in the notebook; the fallbacks are the previous hardcoded values so an
# existing local setup keeps working unchanged.
dim_casts_read_options = {
    'url': os.environ.get('IMDB_JDBC_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    'driver': 'org.postgresql.Driver',
    'dbtable': 'dim_casts',
    'user': os.environ.get('IMDB_DB_USER', 'admin'),
    'password': os.environ.get('IMDB_DB_PASSWORD', 'password'),
}
casts_from_pg_df = spark.read.format('jdbc').options(**dim_casts_read_options).load()
casts_from_pg_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- character: string (nullable = true)
 |-- age: short (nullable = true)
 |-- is_alive: boolean (nullable = true)
 |-- nconst: string (nullable = true)



In [21]:
# Lookup frame from nconst to the dim_casts surrogate key, exposed as cast_id.
casts_id_df = casts_from_pg_df.select(
    F.col('id').alias('cast_id'),
    F.col('nconst'),
)

In [22]:
# Person-to-title pairs taken from the cast rows.
person_title_columns = [F.col('nconst'), F.col('tconst')]
df_casts_with_tconst = df_casts.select(*person_title_columns)

In [24]:
# Build the title <-> cast bridge rows.
#
# dim_casts can hold several rows per nconst (one per character) and df_casts
# holds one row per credit, so joining the two on nconst alone is many-to-many:
# every credit of a person gets paired with every dim_casts row of that person.
# That links titles to the wrong character rows and multiplies the shuffle
# size, which is the likely cause of a "No space left on device" failure.
# Matching on (nconst, character) keeps each credit tied to its own cast row.

# One deterministic cast_id per (nconst, character); min(id) is used in case
# the table contains repeated rows for the same pair.
cast_keys_df = (
    casts_from_pg_df
    .groupBy('nconst', 'character')
    .agg(F.min('id').alias('cast_id'))
)

# Distinct credits: which person played which character in which title.
title_cast_pairs_df = df_casts.select('nconst', 'tconst', 'character').dropDuplicates()

# `character` may be NULL, so it is compared with a null-safe equality.
# NOTE(review): assumes dim_casts stores `character` exactly as it appears in
# df_casts (no truncation or trimming on insert) - confirm against the table.
df_tmp = (
    title_cast_pairs_df.alias('credit')
    .join(
        cast_keys_df.alias('cast'),
        (F.col('credit.nconst') == F.col('cast.nconst'))
        & F.col('credit.character').eqNullSafe(F.col('cast.character')),
    )
    .select('credit.nconst', 'cast.cast_id', 'credit.tconst')
)

# Resolve tconst to title_id; keep the same four output columns as before.
df_titles_casts = (
    df_tmp
    .join(title_desc_id_df, ['tconst'])
    .select('tconst', 'nconst', 'cast_id', 'title_id')
)
df_titles_casts.show()

Py4JJavaError: An error occurred while calling o615.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 103 in stage 16.0 failed 4 times, most recent failure: Lost task 103.3 in stage 16.0 (TID 395) (192.168.0.188 executor 0): java.io.FileNotFoundException: /tmp/spark-f33290fa-c67d-4d06-afec-d61527a2c87c/executor-8914d0ce-c881-4dc2-bf6e-52fb6ccb1572/blockmgr-37468861-df71-4d0f-8509-29f82b2fb55b/1e/temp_shuffle_2e4ea0ab-307a-44ef-8f2f-9423ed54b2a8 (No space left on device)
	at java.io.FileOutputStream.open0(Native Method)
	at java.io.FileOutputStream.open(FileOutputStream.java:270)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
	at org.apache.spark.storage.DiskBlockObjectWriter.initialize(DiskBlockObjectWriter.scala:105)
	at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:118)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:245)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:158)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: /tmp/spark-f33290fa-c67d-4d06-afec-d61527a2c87c/executor-8914d0ce-c881-4dc2-bf6e-52fb6ccb1572/blockmgr-37468861-df71-4d0f-8509-29f82b2fb55b/1e/temp_shuffle_2e4ea0ab-307a-44ef-8f2f-9423ed54b2a8 (No space left on device)
	at java.io.FileOutputStream.open0(Native Method)
	at java.io.FileOutputStream.open(FileOutputStream.java:270)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
	at org.apache.spark.storage.DiskBlockObjectWriter.initialize(DiskBlockObjectWriter.scala:105)
	at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:118)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:245)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:158)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
# Insert df_titles_casts into the titles_casts table.
# Connection settings come from the environment so credentials do not have to
# live in the notebook; the fallbacks are the previous hardcoded values so an
# existing local setup keeps working unchanged.
# Note: mode('append') adds rows on every run of this cell.
titles_casts_write_options = {
    'url': os.environ.get('IMDB_JDBC_URL', 'jdbc:postgresql://localhost:5433/imdb'),
    'driver': 'org.postgresql.Driver',
    'dbtable': 'titles_casts',
    'user': os.environ.get('IMDB_DB_USER', 'admin'),
    'password': os.environ.get('IMDB_DB_PASSWORD', 'password'),
}
df_titles_casts.write.format('jdbc').options(**titles_casts_write_options).mode('append').save()