# Example connections from Python and PySpark to Oracle and S3

## Python only connections

### Oracle to Python 
Requires the Oracle client libraries to be installed (under `/usr/local/lib`) and the `cx_Oracle` package.

VALIDATED:  

In [None]:
import getpass
import os

import cx_Oracle
import pandas as pd

In [None]:
# Build a per-table inventory (column count, row count, average row length)
# from Oracle's catalog views and leave it in the DataFrame `df`.

# Never hardcode the password in the notebook: take it from the
# ORACLE_PASSWORD environment variable, or prompt for it interactively.
oracle_password = os.environ.get("ORACLE_PASSWORD") or getpass.getpass("Oracle password for load_master: ")

# Establish the oracle connection
dsn_tns = cx_Oracle.makedsn('brsoda1-scan.corp.espn.pvt', 8685, service_name='EDWTEST')
conn = cx_Oracle.connect(user=r'load_master', password=oracle_password, dsn=dsn_tns)

try:
    c = conn.cursor()

    # Number of columns per table, counted from all_tab_columns
    sql_query = "select owner,table_name,count(*) from sys.all_tab_columns group by owner,table_name"
    c.execute(sql_query)
    results = c.fetchall()
    tmp0_df = pd.DataFrame(data=results, columns=["owner", "table_name", "num_columns"])

    # The row counts are available in all_tables
    sql_query = "select owner,table_name,num_rows,avg_row_len from all_tables"
    c.execute(sql_query)
    results = c.fetchall()
    tmp1_df = pd.DataFrame(data=results, columns=["owner", "table_name", "num_rows", "avg_row_len"])
finally:
    # Good citizens close their Oracle connections -- even when a query fails
    conn.close()

# Inner-join the two result sets on (owner, table_name); the key columns have
# the same names on both sides, so a single `on=` is enough.
df = pd.merge(left=tmp0_df, right=tmp1_df, on=["owner", "table_name"])
fields = ["owner", "table_name", "num_columns", "num_rows", "avg_row_len"]
df = df.loc[:, fields]

In [None]:
# Clean the table inventory in `df` and add an estimated size column.

# Drop system schema tables.
# (The mask must be built from `df` itself; `sizes_df` is not defined anywhere
# in this notebook.)  `.copy()` gives us an independent frame so the column
# assignments below do not trigger SettingWithCopyWarning.
system_owners = ["SYS", "SYSTEM", "XDB", "MDSYS"]
df = df.loc[~df["owner"].isin(system_owners)].copy()

# Tables with no rows are in all_tables but num_rows is null (-->0).
# Assign the filled result back: an in-place fillna on `df.loc[:, col]` works
# on a temporary Series and is not guaranteed to reach `df`.
df["avg_row_len"] = df["avg_row_len"].fillna(0)

# enforce integer values for num_rows (but not avg_row_len); plain column
# assignment replaces the column, so the integer dtype actually sticks.
df["num_rows"] = df["num_rows"].fillna(0).astype(int)

# Estimated size = rows * average row length, scaled by 1e6
# (presumably bytes -> MB; confirm the unit of avg_row_len).
df["Sizes_MB"] = df["num_rows"] * df["avg_row_len"] / 1000000.00

In [None]:
# Preview the first ten rows of the table inventory
df.head(n=10)