In [1]:
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [2]:
from pyspark.sql.functions import to_json

In [3]:
# Ask PySpark to fetch the MongoDB Spark connector (Scala 2.11 build, version 2.2.0)
# through --packages. Presumably this has to be set before the SparkContext is
# created for it to take effect — TODO confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.0 pyspark-shell'

In [4]:
# Spark configuration for reading the `employee` collection of the `coreRh`
# MongoDB database, using a local master with all available cores.
#
# The connection URI embeds a username and password, so it can be supplied via
# the MONGO_INPUT_URI environment variable instead of living in the notebook.
# NOTE(review): the literal fallback keeps existing runs working unchanged;
# remove it once the variable is set wherever this notebook is executed.
mongo_input_uri = os.environ.get(
    "MONGO_INPUT_URI",
    "mongodb://accretioadmin:adminaccretio&2017@localhost:27017",
)

conf = (
    SparkConf()
    .setAppName("pyspark mongo")
    .setMaster("local[*]")
    .set("spark.mongodb.input.uri", mongo_input_uri)
    .set("spark.mongodb.input.database", "coreRh")
    .set("spark.mongodb.input.collection", "employee")
)


In [5]:
# Stop a SparkContext left over from an earlier run of this cell, if there is
# one, then start a fresh context from `conf`.
if 'sc' in locals():
    sc.stop()
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

# Load the collection configured in `conf` through the MongoDB connector's
# DataFrame source.
df = (
    sql_context.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)

# Contract

In [6]:
# Peek at the first row's nested `contract` struct to see which fields it carries.
df.select("contract").take(1)[0]

Row(contract=Row(_id=None, contract_duration=u'', contract_interim_organization=None, contract_nature=u'CNT_CDI', contract_planned_end_date=None, contract_start_date=datetime.datetime(1980, 3, 2, 1, 0), contract_type=u'CDI'))

In [7]:
# Bring the `contract` column of every row back to the driver as a list of Rows.
# NOTE(review): collect() materialises the whole column in driver memory.
df_contract = df.select("contract").collect()

In [8]:
# Extract `contract_type` from each collected row's nested `contract` struct.
# A row whose `contract` is null has no `.asDict()`, so it contributes None;
# this keeps `contracts` the same length and order as `df_contract`.
contracts = [
    row.contract.asDict()["contract_type"] if row.contract is not None else None
    for row in df_contract
]

In [9]:
# Spot-check entries 1 through 9 of the extracted contract types (index 0 is skipped by the slice).
contracts[1:10]

[u'CDI', u'CDI', u'CDI', u'CDI', u'CDI', u'CDI', u'CDI', u'CDI', u'CDI']

In [10]:
# Keep only the employee attributes that are exported further down.
df_emp = df.select(
    "registration_number",
    "first_name",
    "last_name",
    "sexe",
    "hobbies",
)

In [11]:
# Preview two rows; `hobbies` is still an array here and can be null.
df_emp.show(2)

+-------------------+----------+---------+-----+--------------------+
|registration_number|first_name|last_name| sexe|             hobbies|
+-------------------+----------+---------+-----+--------------------+
|               9999|    Admin | Accretio|SX_MA|[TRAVEL, MUSIC, C...|
|           DFG00030|   Blanche|Beauchamp|SX_FE|                null|
+-------------------+----------+---------+-----+--------------------+
only showing top 2 rows



In [12]:
from pyspark.sql import functions as F

In [13]:
# Replace null `hobbies` values with an empty array so every row holds an array
# before the values are joined into a string in the next step.
df_emp = df_emp.withColumn("hobbies", F.when(F.col("hobbies").isNull(),  F.array()).otherwise(F.col("hobbies")))

In [14]:
from pyspark.sql.functions import col, concat_ws, udf

# Flatten the `hobbies` array into one comma-separated string.
# The built-in concat_ws replaces a Python lambda UDF: it needs no Python worker
# round-trip per row and skips null elements rather than raising on them.
df_emp = df_emp.withColumn("hobbies", concat_ws(",", col("hobbies")))

In [15]:
# Confirm that `hobbies` is now a plain string column.
df_emp.printSchema()

root
 |-- registration_number: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- hobbies: string (nullable = true)



In [None]:
#df_emp.select("_id").show(1)

In [None]:
#df_emp = df_emp.withColumn("_id", col("_id").getField("oid"))

In [16]:
# Schema check repeated: `df_emp` has not been reassigned since the previous
# printSchema() call (the two cells in between are commented out).
df_emp.printSchema()

root
 |-- registration_number: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- hobbies: string (nullable = true)



In [17]:
# Preview one row to check the comma-joined `hobbies` value.
df_emp.show(1)

+-------------------+----------+---------+-----+--------------------+
|registration_number|first_name|last_name| sexe|             hobbies|
+-------------------+----------+---------+-----+--------------------+
|               9999|    Admin | Accretio|SX_MA|TRAVEL,MUSIC,CAME...|
+-------------------+----------+---------+-----+--------------------+
only showing top 1 row



In [18]:
from pyspark.sql import Row

In [19]:
# Build a one-column Spark DataFrame (column "contract") from the Python list
# of contract types.
# NOTE(review): `l_as_df` is not referenced again in the visible cells —
# confirm whether this cell is still needed.
l = sc.parallelize(contracts)
new_row = Row("contract")
l_as_df = l.map(new_row).toDF()

In [20]:
# Convert the Spark employee frame to a pandas DataFrame on the driver.
pandas_df = df_emp.toPandas()

In [21]:
# Attach the contract types as a new pandas column, matched purely by position.
# NOTE(review): `contracts` comes from a separate collect() of `df`, so this
# assumes both results list the employees in the same order — TODO confirm.
pandas_df["contract"] = contracts

In [22]:
# Show the first row, now including the `contract` column.
pandas_df.head(1)

Unnamed: 0,registration_number,first_name,last_name,sexe,hobbies,contract
0,9999,Admin,Accretio,SX_MA,"TRAVEL,MUSIC,CAMERA,FOOT",CDI


In [23]:
# List the column names; they are the same six names declared for the `employee` table below.
pandas_df.columns

Index([u'registration_number', u'first_name', u'last_name', u'sexe',
       u'hobbies', u'contract'],
      dtype='object')

In [24]:
import psycopg2

In [25]:
# Open a psycopg2 connection to the AccretioDW PostgreSQL database on localhost.
#
# The password can be supplied via the ACCRETIO_DW_PASSWORD environment
# variable so it does not have to be stored in the notebook.
# NOTE(review): the literal fallback keeps existing runs working unchanged;
# remove it once the variable is set wherever this notebook is executed.
conn = psycopg2.connect(
    host="localhost",
    port="5432",
    database="AccretioDW",
    user="postgres",
    password=os.environ.get("ACCRETIO_DW_PASSWORD", "123456789"),
)

In [26]:
# Cursor used to run the table-creation statement below.
cur = conn.cursor()

In [27]:
def create_table(cursor):
    """Create the `employee` table through `cursor` unless it already exists.

    The table is keyed on `registration_number` and every column is declared
    as VARCHAR(255). This function only executes the statement; it does not
    call commit and returns nothing.
    """
    # Fragments are concatenated verbatim, so the statement text sent to the
    # database is unchanged, including its internal spacing.
    ddl_fragments = (
        "CREATE TABLE IF NOT EXISTS employee ",
        "    (   registration_number VARCHAR(255) PRIMARY KEY, ",
        "        first_name VARCHAR(255) , ",
        "        last_name VARCHAR(255) , ",
        "        sexe VARCHAR(255) , ",
        "        hobbies VARCHAR(255) , ",
        "        contract VARCHAR(255) );",
    )
    cursor.execute("".join(ddl_fragments))

In [28]:
# Run the CREATE TABLE IF NOT EXISTS statement on the open cursor.
create_table(cur)

In [29]:
# Commit the transaction that holds the table-creation statement.
conn.commit()

In [30]:
def write_postgresql(df):
    """Build a multi-row INSERT statement for the `employee` table.

    Parameters
    ----------
    df : sized collection of records
        Anything supporting `len()`, e.g. a pandas DataFrame or a list of
        tuples. One `%s` placeholder is generated per record.

    Returns
    -------
    tuple
        `(insert_query, df)`: the SQL text with its placeholders, and the very
        object that was passed in.
    """
    # Work from the argument rather than a notebook-level global, so the
    # result always matches the frame the caller handed over. The records are
    # deliberately not printed, which would dump the whole frame.
    emp_seq = df

    records_list_template = ','.join(['%s'] * len(emp_seq))
    insert_query = "INSERT INTO employee ( registration_number, first_name, last_name, sexe,hobbies, contract) VALUES {}".format(records_list_template)

    return insert_query, emp_seq

In [31]:
from sqlalchemy import create_engine

# SQLAlchemy engine for the same AccretioDW database, used by pandas `to_sql`.
#
# The password can be supplied via the ACCRETIO_DW_PASSWORD environment
# variable so it does not have to be stored in the notebook.
# NOTE(review): the literal fallback keeps existing runs working unchanged;
# remove it once the variable is set. A password containing URL-reserved
# characters (such as '@' or '/') must be percent-encoded before it is placed
# in the URL.
dw_password = os.environ.get("ACCRETIO_DW_PASSWORD", "123456789")
engine = create_engine(
    'postgresql://postgres:{}@localhost:5432/AccretioDW'.format(dw_password)
)

In [32]:
# Append every row of the pandas frame to the `employee` table in batches of
# 1000 rows, without writing the DataFrame index.
# NOTE(review): the table declares `registration_number` as PRIMARY KEY, so
# re-running this cell presumably fails on duplicate keys — verify.
pandas_df.to_sql('employee', con = engine, if_exists = 'append', chunksize = 1000,index=False)