In [1]:
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf
import os
import json
import sqlalchemy
from sqlalchemy import create_engine
import pandas as pd
import psycopg2

# Values

In [13]:
WIKIPEDIA_DUMP_XML_PATH="enwiki-latest-abstract.xml"
WIKIPEDIA_DUMP_SCHEMA_PATH="xml_schema.json"
MOVIES_METADATA_CSV_PATH="movies_metadata.csv"
TOP100_PARQUET_PATH="ouput/top100byratio_wiki.parquet"
TOP100_CSV_PATH="ouput/top100byratio_wiki.csv"

In [14]:
POSTGRES_HOST="postgres-db"
POSTGRES_PWD="postgres"
POSTGRES_USER="postgres"
POSTGRES_PORT="5432"
POSTGRES_DB="movies"
POSTGRES_TABLE="top100byRatio"
POSTGRES_CONNECTION_STRING="postgresql://{user}:{password}@{host}:{port}/{db}".format(user=POSTGRES_USER,
                                                                                      password=POSTGRES_PWD,
                                                                                      host=POSTGRES_HOST,
                                                                                      port=POSTGRES_PORT,
                                                                                      db=POSTGRES_DB)
JDBC_CONNECTION_STRING="jdbc:postgresql://{host}:{port}/{db}".format(host=POSTGRES_HOST,
                                                                     port=POSTGRES_PORT,
                                                                     db=POSTGRES_DB)

In [10]:
xml_schema_file=open(WIKIPEDIA_DUMP_SCHEMA_PATH)
WIKIPEDIA_DUMP_XML_SCHEMA=StructType.fromJson(json.load(xml_schema_file))
xml_schema_file.close()
WIKIPEDIA_DUMP_XML_SCHEMA

StructType(List(StructField(abstract,StringType,true),StructField(links,StringType,true),StructField(title,StringType,true),StructField(url,StringType,true)))

In [11]:
postgres_connection_properties = {
     "url" : JDBC_CONNECTION_STRING,
     "table" : POSTGRES_TABLE,
     "user" : POSTGRES_USER,
     "password" : POSTGRES_PWD,
     "driver" : 'org.postgresql.Driver',

}

# Download Files

In [6]:
!curl -o enwiki-latest-abstract.xml.gz https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract.xml.gz

In [7]:
!gzip -d enwiki-latest-abstract.xml.gz

In [8]:
os.path.isfile(WIKIPEDIA_DUMP_XML_PATH)

True

In [9]:
os.path.isfile(MOVIES_METADATA_CSV_PATH)

True

# Functions

# Process

### Read film metadata and select fields of our interest

In [None]:
spark = SparkSession \
    .builder \
    .getOrCreate()
spark

In [10]:
metadata_df=spark.read.csv(MOVIES_METADATA_CSV_PATH, header=True)
metadata_df.show()

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------+--------------------+-----------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|        release_date|             revenue|             runtime|    spoken_languages|  status|             tagline|               title|   video|        vote_average|       vote_count|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+-----

##### field 'year' will be obtained from 'release_date'
##### field 'vote_average' will be renamed 'rating' as requested

In [11]:
budget_revenue_view=metadata_df.select("id","imdb_id","original_title","budget","revenue","release_date", "vote_average", "production_companies")
budget_revenue_view.show()

+-----+---------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|   id|  imdb_id|      original_title|  budget|             revenue|        release_date|        vote_average|production_companies|
+-----+---------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|  862|tt0114709|           Toy Story|30000000|           373554033|          1995-10-30|                 7.7|[{'name': 'Pixar ...|
| 8844|tt0113497|             Jumanji|65000000|           262797249|          1995-12-15|                 6.9|[{'name': 'TriSta...|
|15602|tt0113228|    Grumpier Old Men|       0|                   0|          1995-12-22|                 6.5|[{'name': 'Warner...|
|31357|tt0114885|   Waiting to Exhale|16000000|[{'name': 'Twenti...|/16XOMpEaLWkrcPqS...|Friends are the p...| determined to fi...|
|11862|tt0113041|Father of the Bri...|       0|            76578911|        

In [12]:
budget_revenue_view.count()

45572

##### Here is a list of records that are being discarded due to their invalid fields 'id', 'imdb_id', 'revenue', 'budget'
##### Here a regex is used to intercept invalid imdb_id records

In [13]:
budget_revenue_view.where(F.regexp_extract(F.col("imdb_id"),r'tt[0-9]{7}',0)=="").show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|             imdb_id|      original_title|              budget|             revenue|        release_date|        vote_average|production_companies|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          1995-01-01|                   0|                  []|/uUi23HjvDFYGfuVl...|                null|                   0|                null|               False|
|                  []|[{'iso_3166_1': '...|                   0| 2- A Soft Touch ...|                 6.4|               False|                null|                null|
|[{'id': 18, 'name...|http://www.wkw-in...|           tt0118694| 'poster_path': '...|[{'iso_3166_1': '...|[{'name': 'Block ...|In the Mood for Love|  

##### Removing records with invalid 'id' and 'imdb_id'

In [14]:
clean_IDs=budget_revenue_view.where(F.regexp_extract(F.col("imdb_id"),r'tt[0-9]{7}',0)!="").withColumn("id", F.col("id").cast(IntegerType())).where(F.col("id").isNotNull())
clean_IDs.show()

+-----+---------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|   id|  imdb_id|      original_title|  budget|             revenue|        release_date|        vote_average|production_companies|
+-----+---------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+
|  862|tt0114709|           Toy Story|30000000|           373554033|          1995-10-30|                 7.7|[{'name': 'Pixar ...|
| 8844|tt0113497|             Jumanji|65000000|           262797249|          1995-12-15|                 6.9|[{'name': 'TriSta...|
|15602|tt0113228|    Grumpier Old Men|       0|                   0|          1995-12-22|                 6.5|[{'name': 'Warner...|
|31357|tt0114885|   Waiting to Exhale|16000000|[{'name': 'Twenti...|/16XOMpEaLWkrcPqS...|Friends are the p...| determined to fi...|
|11862|tt0113041|Father of the Bri...|       0|            76578911|        

##### Casting columns to our desired dataType: all invalid values are being replaced by 'null' and invalid values of revenue or budget are copied to the new field 'note'
##### the field 'note' is used to flag any potential data type mismatch that can lead to wrong ratio computation or errors
##### the field 'year' is generated from field 'release_date', that is previously casted to dateType
##### field 'vote_average' has been renamed 'rating'

In [15]:
budget_revenue_view_cast=clean_IDs.withColumn("note",F.when(budget_revenue_view.revenue.cast(IntegerType()).isNotNull()==False,budget_revenue_view.revenue).otherwise(F.lit(None))) \
                                    .withColumn("note",F.when(budget_revenue_view.budget.cast(IntegerType()).isNotNull()==False,budget_revenue_view.budget).otherwise(F.col("note"))) \
                                    .withColumn("budget",budget_revenue_view.budget.cast(IntegerType())) \
                                    .withColumn("revenue",budget_revenue_view.revenue.cast(IntegerType())) \
                                    .withColumn("note",F.when((F.col("revenue")==0),"Budget and/or revenue data missing").otherwise(F.col("note"))) \
                                    .withColumn("note",F.when((F.col("budget")==0),"Budget and/or revenue data missing").otherwise(F.col("note"))) \
                                    .withColumn("release_date",budget_revenue_view.release_date.cast(DateType())) \
                                    .withColumn("year",(F.year("release_date"))) \
                                    .withColumn("vote_average",budget_revenue_view.vote_average.cast(DoubleType())) \
                                    .withColumnRenamed("vote_average","rating") \
                                    .drop("release_date")
budget_revenue_view_cast.show()

+-----+---------+--------------------+--------+---------+------+--------------------+--------------------+----+
|   id|  imdb_id|      original_title|  budget|  revenue|rating|production_companies|                note|year|
+-----+---------+--------------------+--------+---------+------+--------------------+--------------------+----+
|  862|tt0114709|           Toy Story|30000000|373554033|   7.7|[{'name': 'Pixar ...|                null|1995|
| 8844|tt0113497|             Jumanji|65000000|262797249|   6.9|[{'name': 'TriSta...|                null|1995|
|15602|tt0113228|    Grumpier Old Men|       0|        0|   6.5|[{'name': 'Warner...|Budget and/or rev...|1995|
|31357|tt0114885|   Waiting to Exhale|16000000|     null|  null| determined to fi...|[{'name': 'Twenti...|null|
|11862|tt0113041|Father of the Bri...|       0| 76578911|   5.7|[{'name': 'Sandol...|Budget and/or rev...|1995|
|  949|tt0113277|                Heat|60000000|187436818|   7.7|[{'name': 'Regenc...|                nul

##### Computing revenue/budget ratio

In [16]:
budget_revenue_ratio=budget_revenue_view_cast.withColumn("ratio",F.when(budget_revenue_view_cast.note.isNull(),F.format_number(budget_revenue_view_cast.revenue/budget_revenue_view_cast.budget,2)).otherwise(F.lit(0)).cast(DoubleType()))
budget_revenue_ratio.show()

+-----+---------+--------------------+--------+---------+------+--------------------+--------------------+----+-----+
|   id|  imdb_id|      original_title|  budget|  revenue|rating|production_companies|                note|year|ratio|
+-----+---------+--------------------+--------+---------+------+--------------------+--------------------+----+-----+
|  862|tt0114709|           Toy Story|30000000|373554033|   7.7|[{'name': 'Pixar ...|                null|1995|12.45|
| 8844|tt0113497|             Jumanji|65000000|262797249|   6.9|[{'name': 'TriSta...|                null|1995| 4.04|
|15602|tt0113228|    Grumpier Old Men|       0|        0|   6.5|[{'name': 'Warner...|Budget and/or rev...|1995|  0.0|
|31357|tt0114885|   Waiting to Exhale|16000000|     null|  null| determined to fi...|[{'name': 'Twenti...|null|  0.0|
|11862|tt0113041|Father of the Bri...|       0| 76578911|   5.7|[{'name': 'Sandol...|Budget and/or rev...|1995|  0.0|
|  949|tt0113277|                Heat|60000000|187436818

In [17]:
budget_revenue_ratio.count()

45339

##### Ordering records by ratio, descending order. Top100 films

In [18]:
# Select top100 movies by ratio
top100_by_ratio=budget_revenue_ratio.orderBy("ratio", ascending=False) \
.withColumn("clean_title", F.lower(F.col("original_title"))) \
.limit(100)

### Read data from Wikipedia dump

In [39]:
from_wikipedia_df = spark.read.format("com.databricks.spark.xml").option("rootTag", "feed").option("rowTag", "doc").load(WIKIPEDIA_DUMP_XML_PATH, schema=WIKIPEDIA_DUMP_XML_SCHEMA)
from_wikipedia_df.count()

0

##### Cleaning film titles to prepare for join step
##### Since 'imdb_id' and 'id' fields are not available from the Wikipedia dump, 'title' will be used as primary key for join

In [20]:
clean_from_wikipedia_df=from_wikipedia_df.select("title", "url", "abstract") \
                                         .withColumn("extracted_title", F.regexp_extract(F.col("title"),r'Wikipedia: (.*)',1)) \
                                         .withColumn("clean_title", F.lower(F.col("extracted_title"))) \
                                         .select("clean_title", "url", "abstract")
clean_from_wikipedia_df.show()

+-----------+---+--------+
|clean_title|url|abstract|
+-----------+---+--------+
+-----------+---+--------+



### Enrich IMDB data

In [21]:
top100_enriched=top100_by_ratio.join(clean_from_wikipedia_df, 'clean_title',"left")
top100_enriched.show()

+--------------------+------+---------+--------------------+-------+---------+------+--------------------+----+----+------+----+--------+
|         clean_title|    id|  imdb_id|      original_title| budget|  revenue|rating|production_companies|note|year| ratio| url|abstract|
+--------------------+------+---------+--------------------+-------+---------+------+--------------------+----+----+------+----+--------+
|        house of wax| 18573|tt0045888|        House of Wax| 658000| 23800000|   7.0|[{'name': 'Bryan ...|null|1953| 36.17|null|    null|
|    enter the dragon|  9461|tt0070034|    Enter the Dragon| 850000| 90000000|   7.3|[{'name': 'Golden...|null|1973|105.88|null|    null|
|one flew over the...|   510|tt0073486|One Flew Over the...|3000000|108981275|   8.3|[{'name': 'United...|null|1975| 36.33|null|    null|
|                  pi|   473|tt0138704|                  Pi|  60000|  3221152|   7.1|[{'name': 'Truth ...|null|1998| 53.69|null|    null|
|the birth of a na...|   618|tt000

### Save enriched data [optional]

In [22]:
top100_enriched.drop("clean_title", "id", "imdb_id", "release_date", "note") \
.write \
.mode("overwrite") \
.parquet(TOP100_PARQUET_PATH)

In [23]:
top100_enriched.drop("clean_title", "id", "imdb_id", "release_date", "note") \
.write \
.options(header='True', delimiter='|') \
.mode("overwrite") \
.csv(TOP100_CSV_PATH)

##### Create database 'movies'

In [None]:
conn = psycopg2.connect(
    user=POSTGRES_USER,
    password=POSTGRES_PWD,
    host=POSTGRES_HOST,
    port= POSTGRES_PORT
)

In [None]:
conn.autocommit = True
cursor = conn.cursor()
cursor.execute("SELECT 'CREATE DATABASE {db}' WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = ' {db}')".format(db=POSTGRES_DB))
print("Database has been created");
conn.close()

## Write enriched data to Postgres

In [16]:
top100byratio_wiki_df=spark.read.parquet(TOP100_PARQUET_PATH, header=True)

##### Write parquet data to Postgres

In [22]:
top100byratio_wiki_df.write \
                     .mode('overwrite') \
                     .jdbc(
                        url=postgres_connection_properties["url"],
                        table=postgres_connection_properties["table"],
                        properties=postgres_connection_properties
                         )

Py4JJavaError: An error occurred while calling o58.jdbc.
: org.postgresql.util.PSQLException: The connection attempt failed.
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:331)
	at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:49)
	at org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:247)
	at org.postgresql.Driver.makeConnection(Driver.java:434)
	at org.postgresql.Driver.connect(Driver.java:291)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$createConnectionFactory$1(JdbcUtils.scala:64)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:48)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:963)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:963)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:399)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:791)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: postgres-db
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:196)
	at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:162)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:394)
	at java.net.Socket.connect(Socket.java:606)
	at org.postgresql.core.PGStream.createSocket(PGStream.java:241)
	at org.postgresql.core.PGStream.<init>(PGStream.java:98)
	at org.postgresql.core.v3.ConnectionFactoryImpl.tryConnect(ConnectionFactoryImpl.java:109)
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:235)
	... 38 more


##### Proof read data from Postgres

In [18]:
top100byratio_postgres = spark.read.jdbc(
    url=postgres_connection_properties["url"],
    table=postgres_connection_properties["table"],
    properties=postgres_connection_properties
)

Py4JJavaError: An error occurred while calling o47.jdbc.
: org.postgresql.util.PSQLException: The connection attempt failed.
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:331)
	at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:49)
	at org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:247)
	at org.postgresql.Driver.makeConnection(Driver.java:434)
	at org.postgresql.Driver.connect(Driver.java:291)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$createConnectionFactory$1(JdbcUtils.scala:64)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:56)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:226)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:35)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:344)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:221)
	at org.apache.spark.sql.DataFrameReader.jdbc(DataFrameReader.scala:312)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: postgres-db
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:196)
	at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:162)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:394)
	at java.net.Socket.connect(Socket.java:606)
	at org.postgresql.core.PGStream.createSocket(PGStream.java:241)
	at org.postgresql.core.PGStream.<init>(PGStream.java:98)
	at org.postgresql.core.v3.ConnectionFactoryImpl.tryConnect(ConnectionFactoryImpl.java:109)
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:235)
	... 26 more


In [28]:
top100byratio_postgres.show()

+--------------------+--------+---------+------+--------------------+----+------+----+--------+
|      original_title|  budget|  revenue|rating|production_companies|year| ratio| url|abstract|
+--------------------+--------+---------+------+--------------------+----+------+----+--------+
|  Gone with the Wind| 4000000|400176459|   7.7|[{'name': 'Selzni...|1939|100.04|null|    null|
|           Halloween|  300000| 70000000|   7.4|[{'name': 'Compas...|1978|233.33|null|    null|
|     Vanishing Point| 1300000| 72266306|   7.1|[{'name': 'Twenti...|1971| 55.59|null|    null|
|Saturday Night Fever| 3500000|237113184|   6.5|[{'name': 'Paramo...|1977| 67.75|null|    null|
|       Dirty Dancing| 6000000|213954274|   7.1|[{'name': 'Great ...|1987| 35.66|null|    null|
|          Goldfinger| 2500000|124881062|   7.2|[{'name': 'United...|1964| 49.95|null|    null|
|   American Graffiti|  777000|140000000|   6.9|[{'name': 'Lucasf...|1973|180.18|null|    null|
|      The Big Parade|  245000| 22000000

# Query Postgres with Pandas (psycopg2)

In [33]:
engine = create_engine(POSTGRES_CONNECTION_STRING)

In [34]:
pd_from_postgres = pd.read_sql("SELECT * FROM {table} WHERE original_title='Star Wars';".format(table=POSTGRES_TABLE), engine)
pd_from_postgres

Unnamed: 0,original_title,budget,revenue,rating,production_companies,year,ratio,url,abstract
0,The Graduate,3000000,104945305,7.6,"[{'name': 'Lawrence Turman', 'id': 8793}]",1967,34.98,,


In [35]:
from_pandas_df = spark.createDataFrame(pd_from_postgres, schema=top100byratio_postgres.schema)
from_pandas_df.show()

+--------------+-------+---------+------+--------------------+----+-----+----+--------+
|original_title| budget|  revenue|rating|production_companies|year|ratio| url|abstract|
+--------------+-------+---------+------+--------------------+----+-----+----+--------+
|  The Graduate|3000000|104945305|   7.6|[{'name': 'Lawren...|1967|34.98|null|    null|
+--------------+-------+---------+------+--------------------+----+-----+----+--------+



# Sources

# https://github.com/databricks/spark-xml
# https://jdbc.postgresql.org/download/
# https://spark.apache.org/docs/2.4.0/sql-data-sources-jdbc.html