In [7]:
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession

from dotenv import load_dotenv
from os import getenv

# load_dotenv() runs first so the getenv lookups below can see whatever it loaded.
load_dotenv()
sql_server, sql_database, sql_user, sql_password = (
    getenv(key) for key in ('SQL_SERVER', 'SQL_DATABASE', 'SQL_USER', 'SQL_PASSWD')
)

# Spark configuration: extra JVM packages to fetch at start-up and the
# credentials provider class used for the s3a filesystem.
conf = (
    SparkConf()
    .set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.2,com.microsoft.azure:spark-mssql-connector_2.12:1.2.0')
    .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.InstanceProfileCredentialsProvider')
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
# SPARK
# Load the three datasets from S3 as Spark DataFrames (first row is the header).
# These bind the same names as the PANDAS cell, so run one cell or the other.
# Keep the name <-> file pairing aligned with the PANDAS cell and the upload
# targets: "dayfreq" holds the word-days frequencies, "freq" the word frequencies.
dfsprouni = spark.read.csv("s3a://pi-g2-cliente/spark-prouni.csv", header=True)
dfstwdayfreq = spark.read.csv("s3a://pi-g2-cliente/spark-tweets-worddaysfreq.csv", header=True)
dfstwfreq = spark.read.csv("s3a://pi-g2-cliente/spark-tweets-wordfreq.csv", header=True)

In [8]:
# PANDAS
# Read the local CSV copies with pandas, in the same order as before.
dfprouni, dftwdayfreq, dftwfreq = [
    pd.read_csv(csv_path)
    for csv_path in (
        "client_data/prouni.csv",
        "client_data/tweets-worddaysfreq.csv",
        "client_data/tweets-wordfreq.csv",
    )
]

# Convert each pandas frame into a Spark DataFrame under the names the upload cell uses.
dfsprouni, dfstwdayfreq, dfstwfreq = [
    spark.createDataFrame(pandas_frame)
    for pandas_frame in (dfprouni, dftwdayfreq, dftwfreq)
]

In [9]:
def write_to_azure(df, table: str, mode: str = 'overwrite'):
    """Write a Spark DataFrame to a SQL Server table via the Spark MSSQL connector.

    Connection details come from the module-level ``sql_server``,
    ``sql_database``, ``sql_user`` and ``sql_password`` variables.

    Args:
        df: Spark DataFrame to persist (any object exposing ``.write``).
        table: Name of the destination table, passed as the ``dbtable`` option.
        mode: Save mode handed to ``DataFrameWriter.mode``. Defaults to
            ``'overwrite'``, which is what this function always used before.

    Raises:
        ValueError: If any connection setting is ``None`` or empty. Without
            this check the JDBC URL would silently contain the text "None".
    """
    settings = {
        'sql_server': sql_server,
        'sql_database': sql_database,
        'sql_user': sql_user,
        'sql_password': sql_password,
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            'Missing SQL connection settings: ' + ', '.join(missing)
        )

    writer = (
        df.write
        .format('com.microsoft.sqlserver.jdbc.spark')
        .mode(mode)
        .option('driver', 'com.microsoft.sqlserver.jdbc.SQLServerDriver')
        .option('url', f'jdbc:sqlserver://{sql_server};databaseName={sql_database};')
        .option('dbtable', table)
        .option('user', sql_user)
        .option('password', sql_password)
    )
    writer.save()

# Push each Spark DataFrame to its destination table, in the original order.
for spark_frame, table_name in (
    (dfsprouni, 'PROUNI'),
    (dfstwdayfreq, 'TWEET_DAYFREQ'),
    (dfstwfreq, 'TWEET_WORDFREQ'),
):
    write_to_azure(spark_frame, table_name)

                                                                                