In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode
from pyspark.sql.types import IntegerType

In [5]:
# Create (or reuse) the SparkSession that every later cell reads from.
# The builder chain is wrapped in parentheses rather than backslash
# continuations. The former "spark.some.config.option" entry was the
# placeholder from the Spark documentation example, not a real setting,
# so it has been dropped.
spark = (
    SparkSession.builder
    .master("local")              # run Spark in local mode
    .appName("name basics tsv")   # application name for this session
    .getOrCreate()
)

In [7]:
# Path of the tab-separated input file, relative to the working directory.
name_basics_tsv = "name.basics.tsv"

# Configure the reader first, then load the file. The first line of the file
# supplies the column names. No schema is requested, so every column is read
# as a string.
tsv_reader = spark.read.options(sep=r'\t', header=True)
df = tsv_reader.csv(name_basics_tsv)

# Preview the first five records as a list of Row objects.
df.head(5)

[Row(nconst='nm0000001', primaryName='Fred Astaire', birthYear='1899', deathYear='1987', primaryProfession='soundtrack,actor,miscellaneous', knownForTitles='tt0072308,tt0053137,tt0050419,tt0031983'),
 Row(nconst='nm0000002', primaryName='Lauren Bacall', birthYear='1924', deathYear='2014', primaryProfession='actress,soundtrack', knownForTitles='tt0038355,tt0037382,tt0117057,tt0071877'),
 Row(nconst='nm0000003', primaryName='Brigitte Bardot', birthYear='1934', deathYear='\\N', primaryProfession='actress,soundtrack,music_department', knownForTitles='tt0049189,tt0054452,tt0056404,tt0057345'),
 Row(nconst='nm0000004', primaryName='John Belushi', birthYear='1949', deathYear='1982', primaryProfession='actor,soundtrack,writer', knownForTitles='tt0072562,tt0077975,tt0080455,tt0078723'),
 Row(nconst='nm0000005', primaryName='Ingmar Bergman', birthYear='1918', deathYear='2007', primaryProfession='writer,director,actor', knownForTitles='tt0060827,tt0050986,tt0083922,tt0050976')]

In [15]:
# Turn the comma-separated primaryProfession string into one row per
# profession; all other columns are repeated on each resulting row.
# NOTE(review): explode emits no row when the list is null -- confirm that is
# intended here (explode_outer would keep such rows).
profession_list = split('primaryProfession', ',')
df_by_profession = df.withColumn('primaryProfession', explode(profession_list))
df_by_profession.show(5)

+---------+-------------+---------+---------+-----------------+--------------------+
|   nconst|  primaryName|birthYear|deathYear|primaryProfession|      knownForTitles|
+---------+-------------+---------+---------+-----------------+--------------------+
|nm0000001| Fred Astaire|     1899|     1987|       soundtrack|tt0072308,tt00531...|
|nm0000001| Fred Astaire|     1899|     1987|            actor|tt0072308,tt00531...|
|nm0000001| Fred Astaire|     1899|     1987|    miscellaneous|tt0072308,tt00531...|
|nm0000002|Lauren Bacall|     1924|     2014|          actress|tt0038355,tt00373...|
|nm0000002|Lauren Bacall|     1924|     2014|       soundtrack|tt0038355,tt00373...|
+---------+-------------+---------+---------+-----------------+--------------------+
only showing top 5 rows



In [16]:
# Turn the comma-separated knownForTitles string into one row per title id;
# all other columns are repeated on each resulting row.
# NOTE(review): explode emits no row when the list is null -- confirm that is
# intended here (explode_outer would keep such rows).
title_id_list = split('knownForTitles', ',')
df_by_title = df.withColumn('knownForTitles', explode(title_id_list))
df_by_title.show(5)

+---------+-------------+---------+---------+--------------------+--------------+
|   nconst|  primaryName|birthYear|deathYear|   primaryProfession|knownForTitles|
+---------+-------------+---------+---------+--------------------+--------------+
|nm0000001| Fred Astaire|     1899|     1987|soundtrack,actor,...|     tt0072308|
|nm0000001| Fred Astaire|     1899|     1987|soundtrack,actor,...|     tt0053137|
|nm0000001| Fred Astaire|     1899|     1987|soundtrack,actor,...|     tt0050419|
|nm0000001| Fred Astaire|     1899|     1987|soundtrack,actor,...|     tt0031983|
|nm0000002|Lauren Bacall|     1924|     2014|  actress,soundtrack|     tt0038355|
+---------+-------------+---------+---------+--------------------+--------------+
only showing top 5 rows



In [22]:
# Number of years between birth and death. Both year columns hold strings, so
# each is cast to an integer first. A value that is not a number (such as the
# '\N' marker) casts to null, which makes the difference null as well.
death_year = df['deathYear'].cast(IntegerType())
birth_year = df['birthYear'].cast(IntegerType())
df.withColumn('age', death_year - birth_year).show(5)

+---------+---------------+---------+---------+--------------------+--------------------+----+
|   nconst|    primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles| age|
+---------+---------------+---------+---------+--------------------+--------------------+----+
|nm0000001|   Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0072308,tt00531...|  88|
|nm0000002|  Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0038355,tt00373...|  90|
|nm0000003|Brigitte Bardot|     1934|       \N|actress,soundtrac...|tt0049189,tt00544...|null|
|nm0000004|   John Belushi|     1949|     1982|actor,soundtrack,...|tt0072562,tt00779...|  33|
|nm0000005| Ingmar Bergman|     1918|     2007|writer,director,a...|tt0060827,tt00509...|  89|
+---------+---------------+---------+---------+--------------------+--------------------+----+
only showing top 5 rows



In [36]:
# Presumably alive: a birth year is recorded but the death year is '\N'
# ('\N' appears to be the dataset's marker for a missing value).
alive_has_birth = df['birthYear'] != '\\N'
alive_no_death = df['deathYear'] == '\\N'
df.filter(alive_has_birth & alive_no_death).show(5)

+---------+---------------+---------+---------+--------------------+--------------------+
|   nconst|    primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+---------------+---------+---------+--------------------+--------------------+
|nm0000003|Brigitte Bardot|     1934|       \N|actress,soundtrac...|tt0049189,tt00544...|
|nm0000047|   Sophia Loren|     1934|       \N|  actress,soundtrack|tt0076085,tt00601...|
|nm0000079|   Raquel Welch|     1940|       \N|actress,soundtrac...|tt0066115,tt00627...|
|nm0000084|        Gong Li|     1965|       \N|             actress|tt0430357,tt03975...|
|nm0000085| Henner Hofmann|     1950|       \N|cinematographer,p...|tt0113482,tt18257...|
+---------+---------------+---------+---------+--------------------+--------------------+
only showing top 5 rows



In [37]:
# Status unknown: both the birth year and the death year are '\N'
# ('\N' appears to be the dataset's marker for a missing value).
unknown_no_birth = df['birthYear'] == '\\N'
unknown_no_death = df['deathYear'] == '\\N'
df.filter(unknown_no_birth & unknown_no_death).show(5)

+---------+------------+---------+---------+--------------------+--------------------+
|   nconst| primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+------------+---------+---------+--------------------+--------------------+
|nm0000083| Alan Miller|       \N|       \N|editor,writer,dir...|tt0969216,tt04247...|
|nm0000094|   J. Reifel|       \N|       \N|              writer|tt0118631,tt01179...|
|nm0000647|Alan Smithee|       \N|       \N|director,actor,wr...|tt0116514,tt00864...|
|nm0000712| Steve Cohen|       \N|       \N|assistant_directo...|tt0363685,tt00892...|
|nm0001269|Cynthia Gibb|       \N|       \N|  actress,soundtrack|tt0091886,tt00993...|
+---------+------------+---------+---------+--------------------+--------------------+
only showing top 5 rows



In [38]:
# Deceased: both a birth year and a death year are recorded, i.e. neither
# column holds '\N' (which appears to be the dataset's missing-value marker).
deceased_has_birth = df['birthYear'] != '\\N'
deceased_has_death = df['deathYear'] != '\\N'
df.filter(deceased_has_birth & deceased_has_death).show(5)

+---------+--------------+---------+---------+--------------------+--------------------+
|   nconst|   primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+--------------+---------+---------+--------------------+--------------------+
|nm0000001|  Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0072308,tt00531...|
|nm0000002| Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0038355,tt00373...|
|nm0000004|  John Belushi|     1949|     1982|actor,soundtrack,...|tt0072562,tt00779...|
|nm0000005|Ingmar Bergman|     1918|     2007|writer,director,a...|tt0060827,tt00509...|
|nm0000006|Ingrid Bergman|     1915|     1982|actress,soundtrac...|tt0034583,tt00381...|
+---------+--------------+---------+---------+--------------------+--------------------+
only showing top 5 rows

