In [15]:
import logging
import openpyxl
import re
import traceback
import pandas as pd
import requests as rq
from collections import defaultdict
from random import randint

### Define Constants

In [46]:
GOOGLE_SHEET_LINK = 'https://docs.google.com/spreadsheets/d/1u9xez7cCm2zTK65cn1WrtmKDK2FuqfckQ44PxGXfmLc/edit#gid=0'

EXCEL_FILENAME = 'google_sheet_excel.xlsx'

SQL_IMPLEMENTATION = 'postgres' ### postgres, mysql, sqlserver

SHOW_PRINT = False

ERROR_LOG_STRING_FORMAT = '''
    {section_title}
    ---
    {description}
    ---
    {error_message}
''' + '='*60 + '\n\n'

In [47]:
INCONSISTENT_COLUMNS = defaultdict(
    lambda : {'datetime' : 'datetime'},
    {
        'postgres': {'datetime' : 'timestamp'}
    }
)

DTYPES_MAPPING = {
    'object' : 'varchar(255)',
    'int' : 'int',
    'float' : 'float'
}

DTYPES_MAPPING = {**DTYPES_MAPPING, **INCONSISTENT_COLUMNS[SQL_IMPLEMENTATION]}
if SHOW_PRINT:
    print(DTYPES_MAPPING)

### Logging Helper

In [12]:
def error_logging_helper(log_filename:str = 'all_logs.log', logger_name:str = 'error_logger'):
    '''
    helper function for logging errors
    '''
    logger = logging.getLogger(logger_name)
    handler = logging.FileHandler(
        filename=log_filename,
        mode='a+'
    )
    log_format = logging.Formatter('%(asctime)s - %(message)s')
    handler.setLevel(logging.INFO)
    handler.setFormatter(log_format)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    print(logger.getEffectiveLevel())
    
    return logger

logger = error_logging_helper('all_logs.log')

20


### Download Google Sheet as Excel

In [43]:
def download_xlsx_from_google_sheet():
    '''
    function for downloading google sheet as xlsx file
    
    Returns: boolean value based on success/failure
    '''
    return_bool_value = True
    
    if 'docs.google.com' not in GOOGLE_SHEET_LINK and 'spreadsheets' not in GOOGLE_SHEET_LINK:
        logger.error(
            ERROR_LOG_STRING_FORMAT.format(
                section_title = 'Error occured in downloading google sheet. Please check google sheet link',
                description = GOOGLE_SHEET_LINK,
                error_message = GOOGLE_SHEET_LINK
            )
        )
        return_bool_value = False
    else:
        try:
            ### extract id and try downloading google sheet
            google_sheet_id = re.search(
                pattern = r'(\/d\/)(\w+)(\/)',
                string = GOOGLE_SHEET_LINK
            ).group(2)

            sheet_download_rq = rq.get(
                url = 'https://docs.google.com/spreadsheets/u/0/d/' + google_sheet_id + "/export",
                params = {
                    'format' : 'xlsx',
                    'id' : google_sheet_id
                },
                stream = True
            )
    
            ### if permission error log error else create xlsx file
            if 'sign in' in sheet_download_rq.text.lower():
                logger.error(
                    ERROR_LOG_STRING_FORMAT.format(
                        section_title = 'Error occured in downloading google sheet. Please check google sheet link',
                        description = GOOGLE_SHEET_LINK,
                        error_message = sheet_download_rq.url
                    )
                )
                return_bool_value = False
            else:
                with sheet_download_rq as f:
                    with open(EXCEL_FILENAME, 'wb') as output:
                        output.write(f.content)
                    
                return_bool_value = True
            
        except Exception as e:
            logger.error(
                ERROR_LOG_STRING_FORMAT.format(
                    section_title = 'Error occured in downloading google sheet. Please check google sheet link: ' + GOOGLE_SHEET_LINK,
                    description = str(e),
                    error_message = str(traceback.format_exc())
                )
            )
            
            return_bool_value = False
            
    return return_bool_value

### Open Downloaded Excel and Convert to DataFrame

In [6]:
def convert_sheets_to_dfs(workbook:openpyxl.workbook.workbook.Workbook, sheet_name:str):
    '''
    takes the sheetname and openpyxl workbook and returns a dataframe 
    and dataypes of each column
    
    Inputs:
        workbook: openpyxl workbook variable obtained using openpyxl.load_workbook()
        sheet_name: name of the sheet
    
    Returns: dataframe df and df.dtypes
    '''
    sheet_data = workbook[sheet_name].values
    df = pd.DataFrame(sheet_data, columns = next(sheet_data))
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    
    if SHOW_PRINT:
        print(df.head(3), end="\n\n")
        print(df.dtypes, end="\n\n")
    
    return df, df.dtypes

### Helper Functions for Generating Queries

In [7]:
def generate_create_table_query(dtype_df:pd.core.series.Series, table_name:str):
    '''
    function for generating the create ddl query
    
    Inputs:
        - dtype_df: pandas series with dtypes obtained form the `convert_sheets_to_dfs` function
        - table_name: name of the table to be created
        
    Returns:
        - string with create ddl query
    '''
    
    create_table_structure = '''
    create table {table_name} (
    {columns_dtypes}
    );
    '''
    
    complete_colname_dtype_string = ''

    for col, dtype in dtype_df.iteritems():
        cleaned_dtype = re.search(r'([a-zA-Z]+)', str(dtype)).group()
        colname_dtype_string = '\t' + col + ' ' + DTYPES_MAPPING[cleaned_dtype] + ',\n'
        complete_colname_dtype_string += colname_dtype_string

    complete_colname_dtype_string = complete_colname_dtype_string.rsplit(
        sep=',',
        maxsplit=1
    )[0]
    
    create_table_query_populated = create_table_structure.format(
        table_name = table_name,
        columns_dtypes = complete_colname_dtype_string
    )
    
    print(create_table_query_populated)
    
    return create_table_query_populated

In [8]:
def generate_insert_values_query(df:pd.DataFrame, table_name:str):
    '''
    function for generating the `insert into table values` query
    
    Inputs:
        - df: pandas dataframe obtained from the `convert_sheets_to_dfs` function
        - table_name: name of the table to be created
        
    Returns:
        - string with insert query
    '''
    
    insert_query_structure = '''
    insert into 
        {table_name} ({col_names}) 
    values
    {values_query_string}
    ;
    
    select * from {table_name};
    '''
    
    ### create a new col in the dataframe with all other columns concatenated as string
    df['final_col'] = ''
    
    for col in df.dtypes.index:
        if col != 'final_col':
            df['final_col'] += "'" + df[col].astype(str) + "', "
            
    df['final_col'] = df['final_col'].apply(lambda x: "\t(" + x.rsplit(sep=',', maxsplit=1)[0] + ")")
    
    insert_values_query_string = ',\n'.join(df['final_col'].values)
    colnames_string = ', '.join(df.drop('final_col', axis=1).dtypes.index)
    
    insert_query_populated = insert_query_structure.format(
        table_name = table_name,
        col_names = colnames_string,
        values_query_string = insert_values_query_string
    )
    
    print(insert_query_populated)
    
    return insert_query_populated

### Connect Everything Together

In [48]:
download_check = download_xlsx_from_google_sheet()

if download_check:
    wb = openpyxl.load_workbook(EXCEL_FILENAME)
    print(wb.sheetnames)
    
    for sheet_name in wb.sheetnames:
        ### clean up sheet name to make a valid table name
        table_name = sheet_name.replace(' ', '')
        table_name = ''.join(
            re.findall('(\w+)', table_name)
        )
        if table_name == '':
            table_name = 'table_'+str(randint(1, 100000))

        print("###"*15, " ", sheet_name, ":", table_name, " ", "###"*15)

        df, dtype_df = convert_sheets_to_dfs(wb, sheet_name)

        print("~~~"*15)
        create_table_query_populated = generate_create_table_query(dtype_df, table_name)
        insert_query_populated = generate_insert_values_query(df, table_name)
        print("~~~"*15)

        print("###"*35, end="\n\n")
else:
    print(download_check)

['!@#%^&(($%', 'table 2', 'table 3']
#############################################   !@#%^&(($% : table_85971   #############################################
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    create table table_85971 (
    	merchant_id varchar(255),
	merchant_name varchar(255),
	state varchar(255),
	city varchar(255),
	mer_score float,
	date timestamp
    );
    

    insert into 
        table_85971 (merchant_id, merchant_name, state, city, mer_score, date) 
    values
    	('m1', 'm1', 'Delhi', 'Delhi', '1.0', '2022-01-01'),
	('m2', 'm2', 'Delhi', 'Delhi', '2.0', '2022-01-02'),
	('m3', 'm3', 'Delhi', 'Delhi', '5.0', '2022-01-03'),
	('m4', 'm4', 'Delhi', 'Delhi', '6.0', '2022-01-04'),
	('m5', 'm5', 'Delhi', 'Delhi', '7.0', '2022-01-05'),
	('m6', 'm6', 'Delhi', 'Delhi', '8.0', '2022-01-06'),
	('m7', 'm7', 'Delhi', 'Delhi', '9.0', '2022-01-07'),
	('m8', 'm8', 'Delhi', 'Delhi', '10.0', '2022-01-08'),
	('m9', 'm9', 'Delhi', 'Delhi', '11.0', '2022-01-09'),
	('m10', 'm10', 