In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
import os


os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[*] --driver-memory 6g --executor-memory 6g --packages org.postgresql:postgresql:42.1.1 pyspark-shell'

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("title crew") \
    .getOrCreate()

# spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.app.id', 'local-1623000022957'),
 ('spark.sql.warehouse.dir', 'file:/home/zy/imdb-etl/spark-warehouse'),
 ('spark.app.startTime', '1623000022129'),
 ('spark.app.initial.file.urls',
  'file:///home/zy/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar'),
 ('spark.driver.port', '36969'),
 ('spark.executor.id', 'driver'),
 ('spark.app.initial.jar.urls',
  'spark://192.168.0.188:36969/jars/org.postgresql_postgresql-42.1.1.jar'),
 ('spark.submit.pyFiles',
  '/home/zy/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar'),
 ('spark.driver.host', '192.168.0.188'),
 ('spark.jars',
  'file:///home/zy/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar'),
 ('spark.repl.local.jars',
  'file:///home/zy/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar'),
 ('spark.app.name', 'title crew'),
 ('spark.driver.memory', '6g'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.deployMode', 'client'),
 ('spark.files',
  'file:///home/zy/

In [3]:
tsv_file = "title.crew.tsv"
df1 = spark.read.csv(tsv_file, sep=r'\t', header=True)
df1.head(5)

[Row(tconst='tt0000001', directors='nm0005690', writers='\\N'),
 Row(tconst='tt0000002', directors='nm0721526', writers='\\N'),
 Row(tconst='tt0000003', directors='nm0721526', writers='\\N'),
 Row(tconst='tt0000004', directors='nm0721526', writers='\\N'),
 Row(tconst='tt0000005', directors='nm0005690', writers='\\N')]

In [4]:
df1.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- writers: string (nullable = true)



In [10]:
df1.count()

23350548

In [5]:
tsv_file2 = "name.basics.tsv"
df2 = spark.read.csv(tsv_file2, sep=r'\t', header=True)
df2.head(5)

[Row(nconst='nm0000001', primaryName='Fred Astaire', birthYear='1899', deathYear='1987', primaryProfession='soundtrack,actor,miscellaneous', knownForTitles='tt0031983,tt0053137,tt0072308,tt0050419'),
 Row(nconst='nm0000002', primaryName='Lauren Bacall', birthYear='1924', deathYear='2014', primaryProfession='actress,soundtrack', knownForTitles='tt0037382,tt0038355,tt0117057,tt0071877'),
 Row(nconst='nm0000003', primaryName='Brigitte Bardot', birthYear='1934', deathYear='\\N', primaryProfession='actress,soundtrack,music_department', knownForTitles='tt0057345,tt0054452,tt0049189,tt0056404'),
 Row(nconst='nm0000004', primaryName='John Belushi', birthYear='1949', deathYear='1982', primaryProfession='actor,soundtrack,writer', knownForTitles='tt0080455,tt0072562,tt0077975,tt0078723'),
 Row(nconst='nm0000005', primaryName='Ingmar Bergman', birthYear='1918', deathYear='2007', primaryProfession='writer,director,actor', knownForTitles='tt0050976,tt0060827,tt0050986,tt0083922')]

In [6]:
df2.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- primaryName: string (nullable = true)
 |-- birthYear: string (nullable = true)
 |-- deathYear: string (nullable = true)
 |-- primaryProfession: string (nullable = true)
 |-- knownForTitles: string (nullable = true)



In [14]:
# show how many missing values for each column
df.select(*(F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

+-------+--------+-----+------+--------+-----+----------+---------------+
|titleId|ordering|title|region|language|types|attributes|isOriginalTitle|
+-------+--------+-----+------+--------+-----+----------+---------------+
|      0|       0|    0|     1|       1|    1|         1|              1|
+-------+--------+-----+------+--------+-----+----------+---------------+



In [16]:
df.filter(df.region.isNull()).show()

+---------+--------+------------------------------+------+--------+-----+----------+---------------+
|  titleId|ordering|                         title|region|language|types|attributes|isOriginalTitle|
+---------+--------+------------------------------+------+--------+-----+----------+---------------+
|tt3880980|       1|マックのハッスル刑事	JP	ja	...|  null|    null| null|      null|           null|
+---------+--------+------------------------------+------+--------+-----+----------+---------------+



In [13]:
df1 = df1.withColumn('directors', F.explode(F.split('directors', ',')))
df1 = df1.withColumn('writers', F.explode(F.split('writers', ',')))

df1.count()

23350548

In [None]:
df1_writers = df1.withColumn('nconst', F.when(F.col('writers') != '\\N', F.col("writers"))) \
    .withColumn('is_writer', F.when(F.col('writers') != '\\N', F.lit(True)).otherwise(F.lit(False))).drop('writers') \
    .filter(F.col('nconst').isNotNull()).drop('directors')

df1_directors = df1.withColumn('nconst', F.when(F.col('directors') != '\\N', F.col("directors"))) \
    .withColumn('is_director', F.when(F.col('directors') != '\\N', F.lit(True)).otherwise(F.lit(False))).drop('directors') \
    .filter(F.col('nconst').isNotNull()).drop('writers')

df1_writers.show()
df1_directors.show()    

In [8]:
df1_writers.count()

19251658

In [9]:
df1_directors.count()

19497370

In [11]:
# df1_writers.join(df1_directors, (df1_writers.nconst == df1_directors.nconst) & (df1_writers.tconst == df1_directors.tconst)).select('df1_writers.tconst','df1_directors.is_director').show(5)
df_joined = df1_writers.join(df1_directors, ['nconst','tconst'], how='full')
df_joined.show()

# then replace nulls with false

+---------+---------+---------+-----------+
|   nconst|   tconst|is_writer|is_director|
+---------+---------+---------+-----------+
|nm0000024|tt0926561|     true|       null|
|nm0000033|tt7700220|     true|       null|
|nm0000054|tt5149714|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true|       null|
|nm0000092|tt0233023|     true| 

In [12]:
df_joined.count()

334327527

In [14]:
df_joined_deduped = df_joined.drop_duplicates(subset=['nconst', 'tconst'])
df_joined_deduped.count()

13799810

In [15]:
df_joined_deduped.count()

Py4JJavaError: An error occurred while calling o254.showString.
: org.apache.spark.SparkException: Job 13 cancelled 
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2154)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2404)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [101]:
df2 = df2.withColumnRenamed('primaryName', 'name')
df2 = df2.withColumn('age', ( df2['deathYear'].cast(IntegerType()) - df2['birthYear'].cast(IntegerType()) ))

is_alive_col = F.when(
    (F.col("birthYear") != '\\N') & (F.col("deathYear") != '\\N'), False
).when((F.col("birthYear") != '\\N') & (F.col("deathYear") == '\\N'), True).otherwise(None)

df2 = df2.withColumn('is_alive', is_alive_col)

df2 = df2.drop('primaryProfession', 'knownForTitles', 'birthYear', 'deathYear')
df2.show()

+---------+-------------------+----+--------+
|   nconst|               name| age|is_alive|
+---------+-------------------+----+--------+
|nm0000001|       Fred Astaire|  88|   false|
|nm0000002|      Lauren Bacall|  90|   false|
|nm0000003|    Brigitte Bardot|null|    true|
|nm0000004|       John Belushi|  33|   false|
|nm0000005|     Ingmar Bergman|  89|   false|
|nm0000006|     Ingrid Bergman|  67|   false|
|nm0000007|    Humphrey Bogart|  58|   false|
|nm0000008|      Marlon Brando|  80|   false|
|nm0000009|     Richard Burton|  59|   false|
|nm0000010|       James Cagney|  87|   false|
|nm0000011|        Gary Cooper|  60|   false|
|nm0000012|        Bette Davis|  81|   false|
|nm0000013|          Doris Day|  97|   false|
|nm0000014|Olivia de Havilland| 104|   false|
|nm0000015|         James Dean|  24|   false|
|nm0000016|    Georges Delerue|  67|   false|
|nm0000017|   Marlene Dietrich|  91|   false|
|nm0000018|       Kirk Douglas| 104|   false|
|nm0000019|   Federico Fellini|  7

In [102]:
df_final = df_joined_deduped.join(df2, ['nconst'])
df_final.show()

+----------+---------+---------+--------------+----+--------+
|    tconst|  writers|   nconst|          name| age|is_alive|
+----------+---------+---------+--------------+----+--------+
| tt0078813|nm0596942|nm0000086|Louis de Funès|  69|   false|
| tt0078813|nm0000086|nm0000086|Louis de Funès|  69|   false|
| tt0078813|nm0320833|nm0000086|Louis de Funès|  69|   false|
| tt0119792|nm0000198|nm0000198|   Gary Oldman|null|    true|
|tt14056430|       \N|nm0000198|   Gary Oldman|null|    true|
+----------+---------+---------+--------------+----+--------+
only showing top 5 rows

