In [1]:
import os


import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd


from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
import json



In [3]:
# import sys, subprocess
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'oauth2client'])

In [4]:
# Initialize the Google Sheets client.
# The service-account key-file path (SHEET_CREDS) and the spreadsheet URL
# (SHEET_URL) are read from environment variables rather than hard-coded.
scope = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]
credential = ServiceAccountCredentials.from_json_keyfile_name(
    os.environ["SHEET_CREDS"],
    scope,
)
client = gspread.authorize(credential)
# `sh` is the `sheet1` worksheet of the spreadsheet found at SHEET_URL.
spreadsheet = client.open_by_url(os.environ["SHEET_URL"])
sh = spreadsheet.sheet1

In [5]:
# Fetch every data row of the worksheet as a list of records (one dict per
# row, keyed by the sheet's header names), then display the first record as a
# quick sanity check.
sheet = sh.get_all_records()
sheet[0]

{'COMAPANY': 'Swiss Re ',
 'First Name': 'Amit',
 'Last Name': 'Arora',
 'TITLE': 'Head - Digital Transformations & Process Re-Engineering (Claims)',
 'SALES NAV URL': 'https://www.linkedin.com/sales/people/ACwAAAAwPl0BJvV9sgYFXJDX3KMB0MZcvioGl7o,NAME_SEARCH,L9y9?_ntb=pUdPbnhMQl%2B0L5rF9shozA%3D%3D',
 'LINKEDIN URL': '',
 'Timestamp': '11/23/2022 1:55:29'}

In [6]:
# Build a pandas DataFrame from the row records fetched from the sheet
# (one row per record, one column per sheet header).
df = pd.DataFrame(sheet)

In [7]:
# Preview the first 5 rows to confirm the sheet loaded with the expected columns.
df.head()

Unnamed: 0,COMAPANY,First Name,Last Name,TITLE,SALES NAV URL,LINKEDIN URL,Timestamp
0,Swiss Re,Amit,Arora,Head - Digital Transformations & Process Re-En...,https://www.linkedin.com/sales/people/ACwAAAAw...,,11/23/2022 1:55:29
1,Swiss Re,Tarun,Kohli,Managing Director & Head - New Propositions,https://www.linkedin.com/sales/people/ACwAAACF...,https://www.linkedin.com/in/tarunkohli/,11/23/2022 1:55:29
2,Swiss Re,Rolf,Lux,"Head Strategy & Operations, Global Business So...",https://www.linkedin.com/sales/people/ACwAAAfd...,https://ch.linkedin.com/in/rolf-lux-81b24b38,11/23/2022 1:55:29
3,Swiss Re,Rita,Müller,Head of P&C Business Management UK and Ireland...,https://www.linkedin.com/sales/people/ACwAABj7...,https://www.linkedin.com/in/rita-m%C3%BCller-1...,11/23/2022 1:55:29
4,Swiss Re,Rolf,Bachmann,Head Global Dialogue,https://www.linkedin.com/sales/people/ACwAAAtz...,https://ch.linkedin.com/in/rolfbachmann,11/23/2022 1:55:29


In [8]:
# Replace the sheet headers with snake_case names.
# The assignment is positional, so this list must match the sheet's column
# order exactly; note that "Timestamp" becomes "operator_timestamp".
new_column_names = [
    "company",
    "first_name",
    "last_name",
    "title",
    "sales_nav_url",
    "linkedin_url",
    "operator_timestamp",
]
df.columns = new_column_names

In [9]:
# Confirm the new column names were applied.
df.columns

Index(['company', 'first_name', 'last_name', 'title', 'sales_nav_url',
       'linkedin_url', 'operator_timestamp'],
      dtype='object')

In [10]:
# Preview the frame with the renamed columns.
df.head()

Unnamed: 0,company,first_name,last_name,title,sales_nav_url,linkedin_url,operator_timestamp
0,Swiss Re,Amit,Arora,Head - Digital Transformations & Process Re-En...,https://www.linkedin.com/sales/people/ACwAAAAw...,,11/23/2022 1:55:29
1,Swiss Re,Tarun,Kohli,Managing Director & Head - New Propositions,https://www.linkedin.com/sales/people/ACwAAACF...,https://www.linkedin.com/in/tarunkohli/,11/23/2022 1:55:29
2,Swiss Re,Rolf,Lux,"Head Strategy & Operations, Global Business So...",https://www.linkedin.com/sales/people/ACwAAAfd...,https://ch.linkedin.com/in/rolf-lux-81b24b38,11/23/2022 1:55:29
3,Swiss Re,Rita,Müller,Head of P&C Business Management UK and Ireland...,https://www.linkedin.com/sales/people/ACwAABj7...,https://www.linkedin.com/in/rita-m%C3%BCller-1...,11/23/2022 1:55:29
4,Swiss Re,Rolf,Bachmann,Head Global Dialogue,https://www.linkedin.com/sales/people/ACwAAAtz...,https://ch.linkedin.com/in/rolfbachmann,11/23/2022 1:55:29


In [11]:
# Remove leading/trailing whitespace from every column except
# operator_timestamp, which is parsed into a datetime in a later cell.
#
# Only genuine `str` values are stripped. `Series.str.strip()` silently turns
# non-string cells (e.g. numbers) into NaN on object columns and raises
# AttributeError on non-object columns, which would either lose data or abort
# the notebook. Non-string values are passed through unchanged instead.
for col in df.columns:
    if col != "operator_timestamp":
        df[col] = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

In [12]:
# Preview the frame after whitespace stripping.
df.head()

Unnamed: 0,company,first_name,last_name,title,sales_nav_url,linkedin_url,operator_timestamp
0,Swiss Re,Amit,Arora,Head - Digital Transformations & Process Re-En...,https://www.linkedin.com/sales/people/ACwAAAAw...,,11/23/2022 1:55:29
1,Swiss Re,Tarun,Kohli,Managing Director & Head - New Propositions,https://www.linkedin.com/sales/people/ACwAAACF...,https://www.linkedin.com/in/tarunkohli/,11/23/2022 1:55:29
2,Swiss Re,Rolf,Lux,"Head Strategy & Operations, Global Business So...",https://www.linkedin.com/sales/people/ACwAAAfd...,https://ch.linkedin.com/in/rolf-lux-81b24b38,11/23/2022 1:55:29
3,Swiss Re,Rita,Müller,Head of P&C Business Management UK and Ireland...,https://www.linkedin.com/sales/people/ACwAABj7...,https://www.linkedin.com/in/rita-m%C3%BCller-1...,11/23/2022 1:55:29
4,Swiss Re,Rolf,Bachmann,Head Global Dialogue,https://www.linkedin.com/sales/people/ACwAAAtz...,https://ch.linkedin.com/in/rolfbachmann,11/23/2022 1:55:29


In [13]:
def localize_timestamp(df, time_zone):
    """Attach a timezone to naive datetimes without shifting the clock time.

    Args:
        df: Despite the name, a datetime-like pandas Series (anything that
            exposes the ``.dt`` accessor), not a DataFrame.
        time_zone: Timezone to assign, in any form accepted by
            ``Series.dt.tz_localize``.

    Returns:
        The result of ``tz_localize`` called with ``ambiguous="infer"``.
    """
    datetime_accessor = df.dt
    localized = datetime_accessor.tz_localize(time_zone, ambiguous="infer")
    return localized


def convert_timestamp(df, time_zone):
    """Re-express timezone-aware datetimes in another timezone.

    Args:
        df: Despite the name, a datetime-like pandas Series (anything that
            exposes the ``.dt`` accessor), not a DataFrame.
        time_zone: Target timezone, in any form accepted by
            ``Series.dt.tz_convert``.

    Returns:
        The result of ``tz_convert`` for the target timezone.
    """
    datetime_accessor = df.dt
    converted = datetime_accessor.tz_convert(time_zone)
    return converted

In [42]:
def null_validation(df):
    """Return the required columns that contain missing values.

    A value counts as missing when it is null (``None``/``NaN``/``NaT``) or
    when it is an empty or whitespace-only string. The string check matters
    because empty sheet cells reach the frame as ``''`` rather than as nulls,
    so ``isnull()`` alone would never report them.

    Args:
        df: DataFrame that contains all of the required columns
            (``operator_timestamp``, ``title``, ``company``, ``first_name``,
            ``last_name``).

    Returns:
        A list of the required column names that failed the check, in the
        order listed above; empty when every required column is populated.

    Raises:
        KeyError: If a required column is absent from ``df``.
    """

    not_null_cols = ["operator_timestamp", "title", "company", "first_name", "last_name"]
    failed_validation = []
    for col in not_null_cols:
        values = df[col]
        # Non-string values (timestamps, numbers, NaN) are never "blank";
        # nulls among them are caught by isnull() below.
        is_blank = values.map(
            lambda value: isinstance(value, str) and not value.strip()
        ).astype(bool)
        if (values.isnull() | is_blank).any():
            failed_validation.append(col)
    return failed_validation
            
        
            

In [47]:
def datetime_type_validation(df):
    """Report whether ``operator_timestamp`` FAILS the UTC-datetime check.

    Note the inverted sense: the return value is ``True`` when validation
    fails, and callers treat a truthy result as an error.

    The column passes when its dtype is a timezone-aware datetime whose
    timezone is ``UTC``. The storage resolution (``ns``, ``us``, ...) is
    deliberately ignored, so a valid UTC column is not rejected only because
    it is not nanosecond-based.

    Args:
        df: DataFrame containing an ``operator_timestamp`` column.

    Returns:
        ``True`` if the column is not a UTC timezone-aware datetime column,
        ``False`` if it is.

    Raises:
        KeyError: If ``operator_timestamp`` is absent from ``df``.
    """

    dtype = df["operator_timestamp"].dtype
    is_utc_datetime = isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) == "UTC"
    return not is_utc_datetime

    
        


In [14]:
# Parse the sheet's timestamp strings (e.g. '11/23/2022 1:55:29') into pandas
# datetimes. No format is given, so pandas infers it; the result is
# timezone-naive until it is localized in a later cell.
df["operator_timestamp"] = pd.to_datetime(df["operator_timestamp"])

In [15]:
# Inspect the parsed timestamps (still timezone-naive at this point).
df["operator_timestamp"].head()

0   2022-11-23 01:55:29
1   2022-11-23 01:55:29
2   2022-11-23 01:55:29
3   2022-11-23 01:55:29
4   2022-11-23 01:55:29
Name: operator_timestamp, dtype: datetime64[ns]

In [16]:
# The parsed timestamps carry no timezone. Interpret them as Africa/Lagos
# wall-clock time, then express them in UTC.
lagos_timestamps = localize_timestamp(df["operator_timestamp"], "Africa/Lagos")
df["operator_timestamp"] = convert_timestamp(lagos_timestamps, "UTC")

In [17]:
# Confirm the column is now timezone-aware and expressed in UTC.
df['operator_timestamp'].head()

0   2022-11-23 00:55:29+00:00
1   2022-11-23 00:55:29+00:00
2   2022-11-23 00:55:29+00:00
3   2022-11-23 00:55:29+00:00
4   2022-11-23 00:55:29+00:00
Name: operator_timestamp, dtype: datetime64[ns, UTC]

In [18]:
# Preview the frame with the UTC timestamps in place.
df.head()

Unnamed: 0,company,first_name,last_name,title,sales_nav_url,linkedin_url,operator_timestamp
0,Swiss Re,Amit,Arora,Head - Digital Transformations & Process Re-En...,https://www.linkedin.com/sales/people/ACwAAAAw...,,2022-11-23 00:55:29+00:00
1,Swiss Re,Tarun,Kohli,Managing Director & Head - New Propositions,https://www.linkedin.com/sales/people/ACwAAACF...,https://www.linkedin.com/in/tarunkohli/,2022-11-23 00:55:29+00:00
2,Swiss Re,Rolf,Lux,"Head Strategy & Operations, Global Business So...",https://www.linkedin.com/sales/people/ACwAAAfd...,https://ch.linkedin.com/in/rolf-lux-81b24b38,2022-11-23 00:55:29+00:00
3,Swiss Re,Rita,Müller,Head of P&C Business Management UK and Ireland...,https://www.linkedin.com/sales/people/ACwAABj7...,https://www.linkedin.com/in/rita-m%C3%BCller-1...,2022-11-23 00:55:29+00:00
4,Swiss Re,Rolf,Bachmann,Head Global Dialogue,https://www.linkedin.com/sales/people/ACwAAAtz...,https://ch.linkedin.com/in/rolfbachmann,2022-11-23 00:55:29+00:00


In [19]:

# DB_URI = os.environ["DB_URI"]
# engine = create_engine(DB_URI, pool_pre_ping=True)

In [20]:
# with engine.connect() as conn:

#     for i in range(len(df)):
#         try:
#             # Try inserting the row
#             df.iloc[i:i + 1].to_sql(
#                 "lead_export", # table
#                 conn,
#                 schema="public",
#                 if_exists="append",
#                 method=None,  # single row per insert
#                 index=False,
#             )
#         except exc.IntegrityError:
#             # Ignore duplicates
#             pass

In [21]:
# Drop the two URL columns in place so they are not part of the export below.
df.drop(columns=["sales_nav_url", "linkedin_url"], inplace=True)

In [46]:
# Null validation: collect the required columns that contain missing values.
# A failure is only printed; it does not stop the notebook.
null_validations = null_validation(df)
if len(null_validations) > 0:
    print("Failed Vaildations on:", null_validations)
    
    

In [50]:
# datetime validation
# datetime_type_validation returns True when the check FAILS, i.e. when
# operator_timestamp is not a UTC datetime column. A failure is only printed;
# it does not stop the notebook.

dt_validation = datetime_type_validation(df)
if dt_validation:
    print("Datetime validation failed on: operator_timestamp")

In [49]:
# Export the cleaned leads to lead_data.json (relative to the current working
# directory) as a JSON array with one object per row (orient="records").
# NOTE(review): no date_format is passed, so operator_timestamp is serialized
# using pandas' default date encoding. Confirm the consumer of this file
# expects that representation.
df.to_json("lead_data.json", orient="records")

## Next steps: continue in lead_export_part2.ipynb