This is a collection of common Python code snippets for importing and exporting data from various common data sources.

Created by: Grace Choo

Date of Creation: 19 September 2022


# The Zen of Python, by Tim Peters

Beautiful is better than ugly.

Explicit is better than implicit.

Simple is better than complex.
Complex is better than complicated.

Flat is better than nested.
Sparse is better than dense.

Readability counts.

Special cases aren't special enough to break the rules.

Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.

In the face of ambiguity, refuse the temptation to guess.

There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.

Now is better than never.
Although never is often better than *right* now.

If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.

Namespaces are one honking great idea -- let's do more of those!


# Import Files

## Using Pandas

### Import only single file

In [None]:
import pandas as pd
# Read one '^'-delimited text file into a DataFrame.
filename = r"C:\Users\Documents\Downloads\titanic.txt"
# The original passed the undefined name `filenamename`, which raised NameError.
data = pd.read_csv(filename, sep='^')
# Preview the first rows (displayed as the cell output in a notebook).
data.head()

### Import Multiple Files and append into a single file

In [None]:
#First create a function/macro
def F_findFile(pattern, path):
    """Return the files below *path* whose name matches *pattern*.

    The folder tree rooted at *path* is walked recursively.

    Args:
        pattern: Shell-style wildcard matched against each file name,
            e.g. ``'POL_NONMOTOR_*.txt'``.
        path: Folder where the search starts.

    Returns:
        A list of file paths relative to *path*. A file directly inside
        *path* is returned as its bare name; a file in a sub-folder keeps
        the sub-folder prefix, so joining an entry back onto *path* always
        gives a path that exists. The list is empty when nothing matches.
    """
    # Imported here so the function works even if the cell that imports
    # these modules has not been run yet.
    import fnmatch
    import os

    result = []
    for root, _dirs, files in os.walk(path):
        for name in fnmatch.filter(files, pattern):
            # Keep the sub-folder part: the bare name alone cannot be
            # located again once the walk has descended below `path`.
            result.append(os.path.relpath(os.path.join(root, name), path))
    return result

import os

# Folder that holds the '^'-delimited text files to combine.
FilePath = r"C:\Users\Documents\Downloads"

# Names of all matching files found under FilePath.
ListofFiles = F_findFile('POL_NONMOTOR_*.txt', FilePath)

# Read every file into its own DataFrame. The list must exist before the
# loop appends to it; the original cell never created it (NameError).
AllFile = []
for file in ListofFiles:
    data = pd.read_csv(os.path.join(FilePath, file), sep='^')
    AllFile.append(data)

# pd.concat raises an unhelpful ValueError on an empty list, so report the
# real cause when no file matched the pattern.
if not AllFile:
    raise FileNotFoundError(
        "No file matching 'POL_NONMOTOR_*.txt' found under " + FilePath
    )

# Combine all imported data into one table.
AllFile = pd.concat(AllFile)

## import data end ##


## additional things that can be useful ##

# Preview only: read the first 5 rows of a file into a DataFrame, with no
# header row. `file` holds just the file name, so it is joined to FilePath
# (and read with the '^' delimiter) exactly like the import loop does.
data = pd.read_csv(FilePath + "\\" + file, sep='^', header=None, nrows=5)


# Remove all duplicate rows from the combined table.
AllFile = AllFile.drop_duplicates()

## Import Pickled Files
- Use this if you want to save smaller files. More efficient than CSV but less efficient than Parquet files.

In [None]:
import pickle
# Restore the Python object stored in the pickle file.
# NOTE: only unpickle files you trust - loading a pickle can execute code.
with open('pickled_fruit.pkl', mode='rb') as file:
    data = pickle.load(file)

# Show what was loaded.
print(data)

## Import Parquet Files
- Use this when you have to save a large dataset; Parquet is efficient in both space and time.
- Requires a Parquet engine to be installed, e.g. "pyarrow" or "fastparquet".

In [None]:
import pandas as pd

# Load the whole Parquet file into a DataFrame.
df = pd.read_parquet(path='test.parquet')

In [None]:
# To load only some of the columns, list them in `columns`.
df = pd.read_parquet(
    'test.parquet',
    columns=['COL1', 'COL2'],
)

## Import SAS files

In [None]:
# Import sas7bdat package
from sas7bdat import SAS7BDAT

# Open the SAS dataset and convert it to a DataFrame: df_sas
with SAS7BDAT('sales.sas7bdat') as sas_file:
    df_sas = sas_file.to_data_frame()

# Show the first rows of the DataFrame
print(df_sas.head())



## Import SQL Table (works only on MS Azure Databricks)

### Import from icekachangz server, pyodbc / read_sql method

In [None]:
import urllib
import pyodbc
import pandas as pd
from sqlalchemy import create_engine

In [None]:
%sh
# Register Microsoft's package source, then install the SQL Server ODBC
# driver (msodbcsql17), the unixODBC headers, pip and pyodbc.
# NOTE(review): the package list below is the Ubuntu 16.04 one - confirm it
# matches the OS release of the cluster.
curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
curl https://packages.microsoft.com/config/ubuntu/16.04/prod.list > /etc/apt/sources.list.d/mssql-release.list
apt-get update
# -y answers apt's confirmation prompt; without it the install can abort
# because a notebook cell has no terminal to answer from.
ACCEPT_EULA=Y apt-get -y install msodbcsql17
apt-get -y install unixodbc-dev
sudo apt-get install python3-pip -y
pip3 install --upgrade pyodbc

In [None]:
# Write a shell script that installs unixODBC and pyodbc to the cluster's
# init-script folder. Replace <YourClusterName> with the cluster's name.
# The script text starts right after the opening quotes so that the
# "#!/bin/bash" line is the very first line of the file; a shebang is only
# honoured there. The final True is presumably the overwrite flag - confirm
# against the dbutils.fs.put documentation.
dbutils.fs.put("/databricks/init/<YourClusterName>/pyodbc-install.sh","""#!/bin/bash
sudo apt-get update
sudo apt-get -q -y install unixodbc unixodbc-dev
sudo apt-get -q -y install python3-dev
/databricks/python/bin/pip install pyodbc
""", True)

# Print the name of every ODBC driver pyodbc can see. After the loop,
# `driver` still holds the last name printed.
for driver in pyodbc.drivers():
    print(driver)

In [None]:
# You can get this information from "Connection strings" > ODBC of your
# desired SQL database.
# Name the driver explicitly instead of relying on whatever value the
# driver-listing loop happened to leave behind in `driver`. This is the
# driver installed by the msodbcsql17 package.
driver = "{ODBC Driver 17 for SQL Server}"
server = "{Insert your SQL Server here}"                #SQL Server
port = "1433"
database = "{Insert your Database here}"                #SQL Database
username = "{Insert your Username here}"                #Username
password = "{Insert your password here}"                #Password

# Assemble the ODBC connection string; the port is appended to the server
# name after a comma.
cnxn_str = (
    f"DRIVER={driver};SERVER={server},{port};"
    f"Database={database};Uid={username};Pwd={password};"
)
cnxn = pyodbc.connect(cnxn_str)
cursor = cnxn.cursor()

In [None]:
# Pull the first 100 rows of the table into a pandas DataFrame.
data = pd.read_sql(
    sql="SELECT TOP(100) * FROM {Insert your data table here}",
    con=cnxn,
)

# Check the table: show its first five rows.
data.head(5)

###  Import from icekachangz server, spark method

In [None]:
# These values come from "Connection strings" > JDBC of your desired SQL
# database.
jdbcHostname = "{Insert your SQL Server here}"    # SQL Server
jdbcPort = 1433
jdbcDatabase = "{Insert your Database here}"      # SQL Database
jdbcUsername = "{Insert your Username here}"      # Username
jdbcPassword = "{Insert your password here}"      # Password

# Build the JDBC URL. The user name and password are embedded in the URL
# itself. (No need to change anything here.)
jdbcUrl = (
    f"jdbc:sqlserver://{jdbcHostname}:{jdbcPort};"
    f"database={jdbcDatabase};"
    f"user={jdbcUsername};"
    f"password={jdbcPassword}"
)

In [None]:
# SparkDF is a Spark table read over JDBC from the table named in "dbtable".
SparkDF = (
    spark.read.format("jdbc")
    .option("url", jdbcUrl)
    .option("dbtable", "{Insert Your Table here}")
    .load()
)
display(SparkDF)

## Import SQL Table (Can be used in jupyter notebook)

In [None]:
import pyodbc
import pandas as pd
import os, fnmatch

server = "{Insert your SQL Server here}"
database = "{Insert your Database here}"

# Connection string: Trusted_Connection=yes, so no user name or password
# is passed.
connection = pyodbc.connect(
    f'DRIVER={{SQL Server Native Client 11.0}};SERVER={server};'
    f'DATABASE={database};Trusted_Connection=yes;'
)
cursor = connection.cursor()

# Run the query and load the result into a DataFrame.
SQLQuery = "select * from {Insert Your Table here}"
df = pd.read_sql_query(SQLQuery, connection)
df.head()

## Import DBF File

In [None]:
import pandas as pd
from dbfread import DBF

# Location of the DBF file to import.
dbfFile = r"C:\Users\Documents\Downloads\{Insert your dbf file name here}.DBF"

# Import the DBF data as a DataFrame, one row per record. `DataFrame` was
# never imported by name, so it is reached through the `pd` alias.
NewDS = pd.DataFrame(iter(DBF(dbfFile)))

## Import Excel File

In [None]:
import pandas as pd

# Read one worksheet of the workbook into a DataFrame and print it.
df = pd.read_excel(
    r'Path where the Excel file is stored\File name.xlsx',
    sheet_name='your Excel sheet name',
)
print(df)

In [None]:
# To skip rows at the top of the sheet, pass `skiprows`.
df = pd.read_excel(
    r'Path where the Excel file is stored\File name.xlsx',
    sheet_name='your Excel sheet name',
    skiprows=4,
)

# Export Files

## to_csv Method

### .csv file

In [None]:
# Write the DataFrame to a .csv file without the index column.
filename = r"C:\Users\Documents\Downloads\titanic.csv"
df.to_csv(path_or_buf=filename, index=False)

# If the call above raises UnicodeEncodeError, state the encoding:
df.to_csv(path_or_buf=filename, index=False, encoding='utf-8')

### .txt file

In [None]:
# Write the DataFrame to a '^'-delimited text file. The name ends in .txt
# to match this section; the original reused the .csv name.
filename = r"C:\Users\Documents\Downloads\titanic.txt"
df.to_csv(filename, index=False, sep="^")

## open() method

In [None]:
filename = r"C:\Users\Documents\Downloads\titanic.csv"
# mode='w' opens the file for writing: it is created, or emptied if it
# already exists. The `with` block closes the file on exit; the original
# `file.close` lacked the call parentheses, so the file was never closed.
with open(filename, mode='w') as file:
    pass  # write the content here, e.g. file.write("some text")

## to_excel method

### These will replace existing files. Proceed with caution.

In [None]:
# Write the DataFrame, with its header row and without the index column,
# to an Excel workbook.
filename = r"C:\Users\Documents\Downloads\titanic.xlsx"
df.to_excel(
    filename,
    index=False,
    header=True,
)

In [None]:
# Same export, but with a chosen sheet name.
filename = r"C:\Users\Documents\Downloads\titanic.xlsx"
df.to_excel(
    filename,
    sheet_name='Sheet_name_1',
    index=False,
    header=True,
)

### to output new sheet without replacing other existing sheets

In [None]:
from openpyxl import load_workbook
import pandas as pd
filename = r"C:\Users\Documents\Downloads\titanic.xlsx"
# mode='a' opens the existing workbook for appending, so its other sheets
# are kept. Assigning `writer.book` by hand, as the original did, is not
# supported by current pandas releases. if_sheet_exists='replace' makes
# the cell safe to re-run: an existing 'DataListing' sheet is overwritten
# ('new' or 'error' are the other choices).
# The `with` block saves and closes the workbook on exit.
with pd.ExcelWriter(
    filename,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    Combine_all_List.to_excel(writer, index=False, sheet_name='DataListing')


## Export Pickle Files

In [None]:
# Path_Location is used as a plain prefix, so it has to end with a path
# separator.
pickle_path = Path_Location + "yourtablename.pkl"
df.to_pickle(pickle_path)

## Export Parquet Files

In [None]:
# Path_Location is used as a plain prefix, so it has to end with a path
# separator.
parquet_path = Path_Location + "yourtablename.parquet"
df.to_parquet(parquet_path)

## Export tables into SQL Server

### using Database Username and Password

In [None]:
# You can get this information from "Connection strings" > ODBC of your
# desired SQL database.
# Set the driver explicitly; the original only obtained `driver` as the
# value left over from the listing loop below (the last driver printed).
driver = "{ODBC Driver 17 for SQL Server}"
server = "{Insert your SQL Server here}"                #SQL Server
port = "1433"
database = "{Insert your Database here}"                #SQL Database
username = "{Insert your username here}"                #Username
password = "{Insert your password here}"                #Password

# To check which drivers are installed. A separate loop variable is used
# so the listing does not overwrite `driver`.
for installed_driver in pyodbc.drivers():
    print(installed_driver)

In [None]:
# Submodules are not guaranteed to be loaded by a bare `import urllib`.
import urllib.parse

# Build the ODBC connection string. The port goes after the server name,
# separated by a comma, so the `port` setting is actually used instead of
# a hard-coded PORT=1433 keyword.
odbc_str = (
    f'DRIVER={driver};SERVER={server},{port};UID={username};'
    f'DATABASE={database};PWD={password}'
)
# URL-encode the whole ODBC string so it can travel inside the SQLAlchemy
# URL as the odbc_connect parameter.
connect_str = 'mssql+pyodbc:///?odbc_connect=' + urllib.parse.quote_plus(odbc_str)
engine = create_engine(connect_str)

def to_sql(df, table, *, if_exists="append"):
    """Write *df* to the SQL table *table* through the module-level `engine`.

    Args:
        df: DataFrame to export.
        table: Name of the target SQL table.
        if_exists: What to do when the table already exists; passed
            straight to ``DataFrame.to_sql``. Defaults to ``"append"``,
            which adds the rows to the existing table.

    Rows are sent in chunks of 100 and the DataFrame index is not written.
    """
    df.to_sql(table, engine, if_exists=if_exists, index=False, chunksize=100)
    
    

In [None]:
# Execute the export of each table to the target SQL database.
to_sql(df=POL_MOTOR, table="POL_MOTOR")
to_sql(df=POL_NONMOTOR, table="POL_NONMOTOR")
to_sql(df=POL_LIFE, table="POL_LIFE")

### using Windows Authentication

In [None]:
import urllib
import pyodbc
import pandas as pd
from sqlalchemy import create_engine
# Submodules are not guaranteed to be loaded by a bare `import urllib`.
import urllib.parse

# You can get this information from "Connection strings" > ODBC of your
# desired SQL database.
server = "{Insert your SQL Server here}"                #SQL Server
database = "{Insert your Database here}"                #SQL Database

# Trusted_Connection=yes, so no user name or password is passed.
odbc_str = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f'SERVER={server};DATABASE={database};Trusted_Connection=yes;'
)
# URL-encode the whole ODBC string so it can travel inside the SQLAlchemy
# URL as the odbc_connect parameter.
connect_str = 'mssql+pyodbc:///?odbc_connect=' + urllib.parse.quote_plus(odbc_str)
engine = create_engine(connect_str)
def to_sql(df, table, *, if_exists="replace"):
    """Write *df* to the SQL table *table* through the module-level `engine`.

    Args:
        df: DataFrame to export.
        table: Name of the target SQL table.
        if_exists: What to do when the table already exists; passed
            straight to ``DataFrame.to_sql``. Defaults to ``"replace"``,
            which drops the existing table and recreates it.

    Rows are sent in chunks of 100 and the DataFrame index is not written.
    """
    df.to_sql(table, engine, if_exists=if_exists, index=False, chunksize=100)


# Execute the export of the table to the target SQL database.
to_sql(df=Mail03, table="FACT_MYA_EMAIL")
