In [1]:
#pip install tiktoken

In [2]:
#pip install chardet

In [3]:
#pip install requests

In [4]:
#pip install html2text

In [5]:
#pip install langchain

In [6]:
#pip install ipywidgets

In [7]:
import csv
import requests
from datetime import datetime
from ipywidgets import widgets, Output, Layout, VBox, HBox
import html2text
from IPython.display import display, clear_output, HTML
import re

import creds
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

In [8]:
# Set up GUI elements

# File picker limited to a single CSV file
upload_button = widgets.FileUpload(description='Upload File', accept='.csv', multiple=False)

# Action buttons: one starts the scraping step, the other the summarization step
scrap_button = widgets.Button(description='Scrap Data')
summary_button = widgets.Button(description='Summarize Data')

# Column pickers for the map and reduce prompts; their options start empty
map_selector = widgets.SelectMultiple(
    description='Map:',
    options=[],
    layout=Layout(width='40%'),
)
reduce_selector = widgets.SelectMultiple(
    description='Reduce:',
    options=[],
    layout=Layout(width='40%'),
)

# Free-text editors holding the map and reduce prompt templates
prompt_input_map = widgets.Textarea(
    description='Prompt (M):',
    placeholder='Enter prompt text (Map)...',
    value='',
    layout=Layout(width='100%', height='150px'),
)
prompt_input_reduce = widgets.Textarea(
    description='Prompt (R):',
    placeholder='Enter prompt text (Reduce)...',
    value='',
    layout=Layout(width='100%', height='150px'),
)

# Temperature control: starts at 0.7, ranges over [0, 1] in steps of 0.01
slider = widgets.FloatSlider(description='Temperature:', value=0.7, min=0, max=1, step=0.01)

# Output area that the button handlers print into
output = Output()

# Holds the latest upload payload; None until a file has been uploaded
uploaded_data = None

In [9]:
# Set up Langchain elements
# Splitter that cuts long scraped pages into pieces so a single request
# does not run into the model's token limit.
# Chunks are capped at 8000 (chunk_size) and consecutive chunks do not overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)

In [10]:
def check_usage():
    """Print the OpenAI usage data reported for today's date.

    Sends a GET request to the ``/v1/usage`` endpoint, authenticated with
    ``creds.openai_api_key``, prints the decoded JSON payload and then the
    content of its ``data`` field.

    When the payload has no ``data`` entry (for example an error response)
    or the entry is empty, a short notice is printed instead of failing.

    Returns:
        None. All results are reported through ``print``.
    """
    headers = {'Authorization': f'Bearer {creds.openai_api_key}'}
    url = 'https://api.openai.com/v1/usage'

    # The endpoint is queried for the current local date only.
    params = {'date': datetime.now().strftime('%Y-%m-%d')}

    response = requests.get(url, headers=headers, params=params)

    # Decode the body once and reuse it.
    payload = response.json()
    print(payload)

    usage_data = payload.get('data') if isinstance(payload, dict) else None
    if not usage_data:
        # Error payloads carry no 'data' key, and an idle day yields an empty list.
        print("No usage data returned for", params['date'])
        return

    print(usage_data)


In [11]:
# Set up functions
# Read CSV file and returns the data
def read_csv(file_path, encoding='utf-8'):
    """Load a CSV file into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file to read. The first row is used as
            the header.
        encoding: Text encoding used to decode the file.

    Returns:
        One dict per data row, keyed by column name. Empty when the file
        has no data rows.
    """
    # newline='' is what the csv module documentation asks for: without it,
    # line breaks inside quoted fields are rewritten by universal-newline
    # translation before the parser sees them.
    with open(file_path, 'r', encoding=encoding, newline='') as csv_file:
        return list(csv.DictReader(csv_file))

In [12]:
# Scrapes and saves the data to the output file
def scrape_and_save_data(data):
    """Scrape each row's website and write the enriched rows to ``scraped_data.csv``.

    Every row dict is updated in place with a ``'Scraped Content'`` entry
    obtained from ``scrape_website``. The rows are then written, header
    included, to ``scraped_data.csv`` in the current working directory.

    Args:
        data: List of row dicts. Each must contain a ``'Company Website'``
            key holding the URL to scrape.

    Raises:
        KeyError: If a row has no ``'Company Website'`` entry.
    """
    company_website_column = 'Company Website'
    new_column_name = 'Scraped Content'
    output_csv_path = 'scraped_data.csv'

    with output:
        clear_output(wait=True)

        if not data:
            # Without a first row there are no field names to write, so no
            # file is produced.
            print("No rows to scrape.")
            return

        for row in data:
            row[new_column_name] = scrape_website(row[company_website_column])

        # Every row received the new column above, so the first row's keys
        # describe the full header.
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=list(data[0].keys()))
            writer.writeheader()
            writer.writerows(data)

In [13]:
# Scrapes the website using scraperapi and returns the scraped data
def scrape_website(url):
    """Fetch a page through ScraperAPI and return it as single-line plain text.

    The HTML returned by the API is converted to text with html2text, URLs
    are removed, and newlines and tabs are replaced by the two-character
    sequences ``\\n`` and ``\\t`` so the result fits in one CSV cell.

    Args:
        url: Address of the page to scrape.

    Returns:
        The cleaned page text, or ``""`` when the response body or the
        converted text is blank.
    """
    payload = {'api_key': creds.scrapper_api_key, 'url': url}
    # NOTE(review): this plain-http endpoint carries the API key unencrypted;
    # confirm whether the https endpoint can be used instead.
    scraped_data = requests.get('http://api.scraperapi.com', params=payload)

    # Return early on a blank body; the converted text is only defined below.
    if not scraped_data.text.strip():
        return ""

    text_maker = html2text.HTML2Text()
    # HTML2Text reads lower-case instance attributes; the upper-case names
    # are module-level defaults in html2text.config and have no effect when
    # set on an instance. ignore_links is the instance-level counterpart of
    # IGNORE_ANCHORS.
    text_maker.ignore_links = True
    text_maker.skip_internal_links = True
    text_maker.ignore_images = True
    text_maker.unicode_snob = True
    text_maker.escape_snob = True
    scraped_text_data = text_maker.handle(scraped_data.text)

    if not scraped_text_data.strip():
        return ""

    # Strip URLs while real whitespace still ends them. Escaping newlines
    # first would glue a trailing URL to the following line, and "\S+" would
    # then swallow that text too.
    without_urls = re.sub(r"http\S+", "", scraped_text_data)
    return without_urls.replace('\n', '\\n').replace('\t', '\\t')

In [14]:
# Uses langchain's map-reduce to summarize and generate the personalized emails
def summarize_and_save_data(data, map_prompt, combine_prompt):
    """Generate a personalized email for every row with a map-reduce chain.

    Both prompts are turned into ``PromptTemplate`` objects whose input
    variables are the ``{placeholder}`` names found in the prompt text.
    For each row, the ``'Scraped Content'`` text is split into documents
    and passed to the chain together with the row's value for every
    placeholder other than ``text``. The chain output is printed and also
    stored in the row under ``'Personalized Email'``, with URLs removed
    and newlines/tabs replaced by the two-character sequences ``\\n`` and
    ``\\t``.

    Nothing is written to disk here; rows are updated in place.

    Args:
        data: List of row dicts. Each needs a ``'Scraped Content'`` entry
            and one entry per column placeholder used in the prompts.
        map_prompt: Prompt template text for the map step.
        combine_prompt: Prompt template text for the combine step.

    Raises:
        KeyError: If a row lacks ``'Scraped Content'`` or a column named
            in either prompt.
    """
    # The sampling temperature is read from the GUI slider at call time.
    llm = OpenAI(model='text-davinci-003', temperature=slider.value, openai_api_key=creds.openai_api_key)

    placeholder_pattern = r'\{([^}]+)\}'

    # dict.fromkeys drops repeated placeholders while keeping first-seen order.
    map_input_variables = list(dict.fromkeys(re.findall(placeholder_pattern, map_prompt)))
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=map_input_variables)

    combine_input_variables = list(dict.fromkeys(re.findall(placeholder_pattern, combine_prompt)))
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=combine_input_variables)

    chain = load_summarize_chain(llm,
                         chain_type="map_reduce",
                         map_prompt=map_prompt_template,
                         combine_prompt=combine_prompt_template,
                         verbose=True
                       )

    # Columns the chain needs from each row: placeholders of BOTH prompts.
    # 'text' is skipped because its value is not taken from the row.
    row_columns = [
        column
        for column in dict.fromkeys(map_input_variables + combine_input_variables)
        if column != 'text'
    ]

    new_column_name = 'Personalized Email'

    for company in data:
        page_data = company['Scraped Content']

        chunks = text_splitter.split_text(page_data)
        docs = text_splitter.create_documents(chunks)

        # Build the inputs afresh per row so nothing leaks from the previous one.
        chain_inputs = {column: company[column] for column in row_columns}
        chain_inputs["input_documents"] = docs

        result = chain(chain_inputs)

        print (result['output_text'])
        print ("\n\n")

        # Remove URLs before escaping newlines; otherwise a URL at the end of
        # a line would drag the start of the next line away with it.
        without_urls = re.sub(r"http\S+", "", result['output_text'])
        company[new_column_name] = without_urls.replace('\n', '\\n').replace('\t', '\\t')

In [15]:
# Calls the scrape_and_save_data method when Scrap Data button is clicked
def on_scrap_button_click(button):
    """Scrape the uploaded CSV when the "Scrap Data" button is clicked.

    Does nothing until a file has been uploaded. Otherwise the uploaded
    file's name is resolved, the file of that name is read with
    ``read_csv`` (so it must be reachable from the working directory), and
    the rows are handed to ``scrape_and_save_data``. Progress messages are
    printed into the ``output`` widget.

    Args:
        button: The clicked button (unused).
    """
    # Check first, so an empty upload never reaches next(iter(...)).
    if not uploaded_data or not upload_button.value:
        return

    # ipywidgets 7 exposes the upload as a dict keyed by file name, while
    # ipywidgets 8 exposes a tuple of per-file records carrying a 'name'.
    first_upload = next(iter(upload_button.value))
    input_csv_path = first_upload if isinstance(first_upload, str) else first_upload['name']

    with output:
        clear_output(wait=True)
        print("Scraping data...")
        scrape_and_save_data(read_csv(input_csv_path))
        print("Data scraped and saved!")

In [16]:
# Calls the summarize_and_save_data method when Summarize Data is clicked
def on_summary_button_click(button):
    """Generate emails for the uploaded CSV when "Summarize Data" is clicked.

    Does nothing until a file has been uploaded. Otherwise the uploaded
    file's name is resolved and the file of that name is read with
    ``read_csv``. The rows are passed to ``summarize_and_save_data``
    together with the current map and reduce prompt texts, then written
    to ``personalized_email_data.csv`` in the working directory. Progress
    messages are printed into the ``output`` widget.

    Args:
        button: The clicked button (unused).
    """
    # Check first, so an empty upload never reaches next(iter(...)).
    if not uploaded_data or not upload_button.value:
        return

    # ipywidgets 7 exposes the upload as a dict keyed by file name, while
    # ipywidgets 8 exposes a tuple of per-file records carrying a 'name'.
    first_upload = next(iter(upload_button.value))
    input_csv_path = first_upload if isinstance(first_upload, str) else first_upload['name']

    with output:
        clear_output(wait=True)
        print("Summarizing data...")

        data = read_csv(input_csv_path)
        if not data:
            # No first row means no header to write; stop before touching the file.
            print("No rows found in the uploaded file.")
            return

        summarize_and_save_data(data, prompt_input_map.value, prompt_input_reduce.value)

        output_csv_path = 'personalized_email_data.csv'
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            # Keys are read after summarization so the header includes any
            # column that step added to the rows.
            writer = csv.DictWriter(csv_file, fieldnames=list(data[0].keys()))
            writer.writeheader()
            writer.writerows(data)

        print("Data summarized and saved!")

In [17]:
# Once the file is uploaded, this method shows the file column names in the column selectors
def show_columns(button):
    """React to a change of the upload widget's value.

    Stores the new upload payload in the module-level ``uploaded_data``
    and enables the "Scrap Data" button only while an upload is present.
    With an upload present, the header row of the file with the uploaded
    name is read (so the file must be reachable from the working
    directory). Both column selectors are then offered those column names
    plus the extra ``'text'`` entry.

    Args:
        button: Change notification dict from the upload widget; only its
            ``'new'`` entry is used.
    """
    global uploaded_data
    uploaded_data = button['new']

    has_upload = bool(upload_button.value)
    scrap_button.disabled = not has_upload
    if not has_upload:
        return

    # ipywidgets 7 exposes the upload as a dict keyed by file name, while
    # ipywidgets 8 exposes a tuple of per-file records carrying a 'name'.
    first_upload = next(iter(upload_button.value))
    input_csv_path = first_upload if isinstance(first_upload, str) else first_upload['name']

    with open(input_csv_path, 'r', encoding='utf-8', newline='') as csv_file:
        # The default keeps an empty file from raising StopIteration.
        headers = next(csv.reader(csv_file), [])

    # Assign the final option list once per selector so each change
    # notification fires a single time.
    column_options = tuple(headers) + ('text',)
    map_selector.options = column_options
    reduce_selector.options = column_options

In [18]:
# Sets the prompt_input_map variable whenever there is a change in the map column selector
def on_map_selector_change(change):
    """Write the selected map columns into the map prompt as placeholders.

    Each selected column name is wrapped in braces and the results are
    joined with ``", "``, replacing the current text of the map prompt
    box. An empty selection leaves the prompt untouched.

    Args:
        change: Change notification dict; its ``'new'`` entry holds the
            selected column names.
    """
    # Read the selection from the notification itself, the same way the
    # reduce-selector handler does, rather than from the global widget.
    selected_columns = change['new']
    if selected_columns:
        prompt_input_map.value = ', '.join(f"{{{column}}}" for column in selected_columns)
        

In [19]:
# Sets the prompt_input_reduce variable whenever there is a change in the reduce column selector
def on_reduce_selector_change(change):
    """Write the selected reduce columns into the reduce prompt as placeholders.

    Each name in ``change['new']`` is wrapped in braces and the results are
    joined with ``", "``, replacing the current text of the reduce prompt
    box. An empty selection leaves the prompt untouched.
    """
    chosen = change['new']
    if not chosen:
        return
    prompt_input_reduce.value = ', '.join(map("{{{}}}".format, chosen))

In [20]:
def on_prompt_change(change):
    """Enable the Summarize button only while both prompt boxes hold text."""
    both_filled = bool(prompt_input_map.value) and bool(prompt_input_reduce.value)
    summary_button.disabled = not both_filled

In [21]:
# Both action buttons start out disabled
scrap_button.disabled = True
summary_button.disabled = True

# Set up observers: value changes of the widgets drive the handlers above
upload_button.observe(show_columns, names='value')
map_selector.observe(on_map_selector_change, names='value')
reduce_selector.observe(on_reduce_selector_change, names='value')
prompt_input_map.observe(on_prompt_change, names='value')
prompt_input_reduce.observe(on_prompt_change, names='value')

# Button clicks start the scraping and summarization steps
scrap_button.on_click(on_scrap_button_click)
summary_button.on_click(on_summary_button_click)

# Side-by-side rows for the two column selectors and the two prompt boxes
column_selectors = HBox([map_selector, reduce_selector])
prompt_inputs = HBox([prompt_input_map, prompt_input_reduce])

# Optional usage check, left switched off:
# check_usage()

# Render the temperature slider first, then the rest of the GUI, top to bottom
display(
    slider,
    upload_button,
    scrap_button,
    summary_button,
    column_selectors,
    prompt_inputs,
    output,
)

FloatSlider(value=0.7, description='Temperature:', max=1.0, step=0.01)

FileUpload(value=(), accept='.csv', description='Upload File')

Button(description='Scrap Data', disabled=True, style=ButtonStyle())

Button(description='Summarize Data', disabled=True, style=ButtonStyle())

HBox(children=(SelectMultiple(description='Map:', layout=Layout(width='40%'), options=(), value=()), SelectMul…

HBox(children=(Textarea(value='', description='Prompt (M):', layout=Layout(height='150px', width='100%'), plac…

Output()