In [1]:
# Initialization
import os
import sys

# Point this kernel at the local Spark install. This has to run before pyspark is
# imported, because the import is resolved through the sys.path entries added below.
spark_home = "/home/talentum/spark"
py_lib = spark_home + "/python/lib"

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": py_lib,
    # For the two interpreter entries below, use /usr/bin/python2.7 to run on Python 2.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Each insert goes to the front of sys.path, so pyspark.zip ends up ahead of the
# py4j archive.
for archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, py_lib + archive)

# NOTE: list whichever extra packages the session needs here.
# Alternatives that were used previously:
#   '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
#   '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
#   '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [2]:
# Entry point for Spark 2.x: one Hive-enabled SparkSession for the whole notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    # To run on YARN, add .master("yarn") to this chain before getOrCreate().
    .getOrCreate()
)

# SparkContext handle taken from the session.
sc = spark.sparkContext

In [3]:
from pyspark.sql.types import StructField, StructType
from pyspark.sql.types import StringType, IntegerType,DoubleType
import pyspark.sql.functions as F
import numpy as np
import pandas as pd

In [4]:
# Stage the users CSV from the local shared folder into HDFS so Spark can read it.
import subprocess


def stage_file_in_hdfs(local_path: str, hdfs_path: str) -> bool:
    """Copy a local file into HDFS with ``hdfs dfs -put -f``.

    Args:
        local_path: File on the local filesystem to upload.
        hdfs_path: Destination path passed to ``hdfs dfs -put``; ``-f`` overwrites
            an existing destination.

    Returns:
        True only if the local file exists and the hdfs command exited with
        status 0. False otherwise; the reason is printed.
    """
    if not os.path.exists(local_path):
        print(f"ERROR: Local file not found at {local_path}")
        return False

    print("Found local file. Uploading to HDFS...")
    try:
        # Argument list instead of a shell string: the paths are passed verbatim
        # and are never re-parsed by a shell.
        upload = subprocess.run(["hdfs", "dfs", "-put", "-f", local_path, hdfs_path])
    except OSError as exc:
        # Raised when the hdfs executable itself cannot be started.
        print(f"ERROR: Could not run the hdfs command: {exc}")
        return False

    # Only report success when the command actually succeeded.
    if upload.returncode != 0:
        print(f"ERROR: HDFS upload failed with exit code {upload.returncode}")
        return False

    print("Upload complete.")
    return True


local_path = "/home/talentum/shared/dataSource/sd254_users.csv"
stage_file_in_hdfs(local_path, "/user/talentum/projectMaster/dataStaging/sd254_users.csv")

Found local file. Uploading to HDFS...
Upload complete.


In [5]:
# Explicit schema for the users CSV: column names and types are declared here
# instead of being inferred from the data.
# Each entry is (CSV column name, Spark type); every column is declared nullable.
# The three money columns are kept as strings (presumably they carry currency
# formatting; confirm against the file).
# NOTE(review): Zipcode is read as an integer, which would drop any leading zeros.
# Confirm that is acceptable for this dataset.
user_columns = [
    ('Person', StringType()),
    ('Current Age', IntegerType()),
    ('Retirement Age', IntegerType()),
    ('Birth Year', IntegerType()),
    ('Birth Month', IntegerType()),
    ('Gender', StringType()),
    ('Address', StringType()),
    ('Apartment', StringType()),
    ('City', StringType()),
    ('State', StringType()),
    ('Zipcode', IntegerType()),
    ('Latitude', DoubleType()),
    ('Longitude', DoubleType()),
    ('Per Capita Income - Zipcode', StringType()),
    ('Yearly Income - Person', StringType()),
    ('Total Debt', StringType()),
    ('FICO Score', IntegerType()),
    ('Num Credit Cards', IntegerType()),
]

schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in user_columns
])

In [6]:
# Load the staged CSV with the explicit schema; the first line of the file is a header.
# PERMISSIVE mode keeps malformed records instead of failing the read.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("mode", "PERMISSIVE")
    .csv('/user/talentum/projectMaster/dataStaging/sd254_users.csv')
)

In [7]:
# Parquet-safe column names: no spaces or special characters.
# The order must match the column order of df exactly, because toDF() renames by position.
new_columns = [
    "Person_ID", "Current_Age", "Retirement_Age", "Birth_Year", "Birth_Month", "Gender",
    "Address", "Apartment", "City", "State", "Zipcode", "Latitude", "Longitude",
    "Per_Capita_Income_Zipcode", "Yearly_Income_Person", "Total_Debt",
    "FICO_Score", "Num_Credit_Cards",
]

# Rename every column in one call.
df_renamed = df.toDF(*new_columns)

In [8]:
# Write the renamed DataFrame as Parquet into the dataStaging directory,
# replacing any output left by a previous run.
parquet_writer = df_renamed.write.mode("overwrite")
parquet_writer.parquet('/user/talentum/projectMaster/dataStaging/sd254_users.parquet')