In [None]:
# Mount Google Drive at /content/drive; a later cell reads its input CSV from
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the packages listed in scraping_requirements.txt.
%pip install -r scraping_requirements.txt

In [None]:
#@title Scraping module for obtaining the textual dataset
#@markdown As the title explains, this module was used to scrape as many advertising slogans as possible from the website "www.thinkslogans.com"; they will then be used to train the text-generation model.<br>
#@markdown The scraping was done by parsing the results of the HTTP requests with BeautifulSoup. The requests were directed to several slogan categories available on the website and, along with each slogan, the brand name and the category were also collected.<br>The last step was to save the result in a .csv file.
import requests
import csv
import sys
import time
import re

from bs4 import BeautifulSoup

# Slogan categories to scrape; each name is inserted into the category URL.
category_list = [
  'airline',
  'alcohol',
  'bank',
  'beverage',
  'candy',
  'car-brand',
  'cereal',
  'coffee',
  'computers',
  'electronic-products',
  'fast-food',
  'motorcycle',
  'perfume',
  'toothpaste',
]
# Number of result pages requested for every category.
max_page_count = 12
# CSV file the scraped rows are written to.
output_file = 'sloganlist.csv'

def remove_html_tags(text):
  """Strip markup and list-formatting residue from ``text``.

  Removes, in a single pass: anything between ``<`` and ``>`` on one line,
  a newline surrounded by single spaces, square brackets, en dashes,
  comma-space pairs and a space followed by an apostrophe.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with every match of the pattern deleted.
  """
  junk_pattern = r"<.*?>| \n |\[|\]|–|, | \'"
  return re.compile(junk_pattern).sub('', text)



def get_data(url, timeout=30, retry_delay=1):
  """Download ``url`` with browser-like headers and return the body as text.

  The request is attempted once; if it raises a ``requests`` exception the
  function sleeps for ``retry_delay`` seconds and tries exactly one more time.
  A non-200 status is reported on stdout but the body is still returned, so
  the caller decides what to do with it.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up on an attempt.
    retry_delay: Seconds to sleep before the single retry.

  Returns:
    The decoded response body (``response.text``).

  Raises:
    requests.exceptions.RequestException: If the retry fails as well.
  """
  # In case of Status 403 (Forbidden), wait for some time (maybe hours) before retrying
  headers = {
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36",
    "Sec-Fetch-Dest": "document",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9"
  }

  try:
    response = requests.get(url, headers=headers, timeout=timeout)
  except requests.exceptions.RequestException:
    # Single retry after a short pause; a second failure propagates.
    time.sleep(retry_delay)
    response = requests.get(url, headers=headers, timeout=timeout)

  # Report unexpected statuses using the response already in hand instead of
  # issuing another request just to read the status code.
  if response.status_code != 200:
    print("HTTP status " + str(response.status_code) + " for " + url)
  return response.text

def _parse_slogan_rows(entries_markup, category):
  """Turn the stringified ``div.entry`` markup of one page into CSV rows."""
  cleaned = remove_html_tags(entries_markup).strip()
  rows = []
  # Chunks are separated by blank lines. Only chunks with at least two lines
  # are kept: the first line becomes the slogan column and the second the
  # company column (presumably how the site lays out each entry -- verify
  # against the live markup).
  for chunk in cleaned.split("\n\n"):
    if "\n" not in chunk:
      continue
    lines = chunk.split("\n")
    rows.append([lines[0], category, lines[1]])
  return rows


def collect_data(category_list=None, max_page_count=0):
  """Scrape every requested category page and return the parsed rows.

  For each category, pages ``1..max_page_count`` are downloaded with
  ``get_data``, parsed with BeautifulSoup, and the ``div`` elements of class
  ``entry`` are converted into rows.

  Args:
    category_list: Category names used to build the URLs. ``None`` is treated
      as an empty list.
    max_page_count: Number of pages to request per category.

  Returns:
    A list of ``[slogan, category, company]`` lists.
  """
  rows = []
  print(max_page_count)
  print(category_list)
  # Iterating over the None default would raise TypeError.
  for category in category_list or []:
    print('Category:'+category)
    base_url = "https://www.thinkslogans.com/slogans/advertising-slogans/"+str(category)+"-slogans/"
    for page in range(1, max_page_count + 1):
      # The first page lives at the category root; later pages under page/N/.
      if page == 1:
        url = base_url
      else:
        url = base_url + "page/"+str(page)+"/"
      print("Page Number:"+str(page))

      soup = BeautifulSoup(get_data(url), 'html.parser')
      entries = soup.find_all('div', {'class': 'entry'})
      # All entries of the page are cleaned as one string.
      rows.extend(_parse_slogan_rows(str(entries), category))
  return rows

def write_data(data, output_file):
  """Write the scraped rows to ``output_file`` as a CSV file.

  The file is overwritten. Its first line is the header
  ``Slogan,Category,Company``, followed by one line per row of ``data``.

  Args:
    data: Iterable of rows, each an iterable of three values in the order
      slogan, category, company.
    output_file: Path of the CSV file to create.

  Returns:
    None.
  """
  # newline='' lets the csv module manage line endings; the explicit UTF-8
  # encoding keeps non-ASCII slogan text from depending on the platform's
  # default encoding.
  with open(output_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # header of the csv
    writer.writerow(['Slogan', 'Category', 'Company'])
    # contents
    writer.writerows(data)

def main():
  """Scrape the configured categories and save the rows to the output file.

  Uses the module-level ``category_list``, ``max_page_count`` and
  ``output_file`` settings.

  Returns:
    Always ``0``.
  """
  write_data(collect_data(category_list, max_page_count), output_file)
  return 0

# Run the scraper when this cell (or exported script) is executed directly.
# sys.exit() is only called for a non-zero status: raising SystemExit inside a
# notebook kernel marks the cell as failed and interrupts "Run All", even when
# the status is 0.
if __name__ == '__main__':
  exit_code = main()
  if exit_code:
    sys.exit(exit_code)

In [None]:
import pandas as pd
#@markdown The "sloganlist.csv" file still had to be preprocessed before it could be used, so one more step was carried out. <br> Since we want our model to be trained on a question-response type of task,
#@markdown the dataset had to be composed of questions of the type <i>"What could it be a good advertising slogan for a company called " + <b>company name</b> + " which operates in the " + <b>company category</b> + " field?"</i><br> The response is the slogan itself.

def build_question_answer_dataset(slogans):
  """Build the question/answer training frame from the scraped slogans.

  Args:
    slogans: DataFrame with ``Slogan``, ``Category`` and ``Company`` columns.

  Returns:
    A DataFrame with a ``question`` column (the prompt built from company and
    category) and an ``answer`` column (the slogan), indexed from 0.
  """
  # Rows with a missing field cannot form a meaningful prompt, and string
  # concatenation with NaN raises TypeError, so they are dropped first.
  complete = slogans.dropna(subset=['Slogan', 'Category', 'Company'])
  # Vectorized concatenation builds every question at once instead of growing
  # the frame one row at a time.
  questions = (
    "What could it be a good advertising slogan for a company called "
    + complete['Company'].astype(str)
    + " which operates in the "
    + complete['Category'].astype(str)
    + " field?"
  )
  qa_frame = pd.DataFrame({"question": questions, "answer": complete['Slogan']})
  return qa_frame.reset_index(drop=True)


dataset = pd.read_csv("/content/drive/MyDrive/sloganlist.csv")
model_friendly_dataset = build_question_answer_dataset(dataset)

model_friendly_dataset.to_csv("preprocessed.csv", index=False)


In [None]:
#@title Scraping for the image dataset
#@markdown Moving on to the second part, we wanted to see whether an image generation model would be able to produce an advertisement banner
#@markdown starting from the same parameters used for the textual part, adding the slogan produced by the previous model to the input prompt. To do this, we used the following code to scrape around 5000 advertisement images.

import requests

import os

# Folder that receives the downloaded images. It is created here because
# open() does not create missing directories.
image_dir = "/content/advertisements"
os.makedirs(image_dir, exist_ok=True)

# Probe every {folder}/{index}.jpg address and save the ones that exist.
# A single session reuses the connection across the many small requests.
with requests.Session() as session:
  for i in range(0, 5):
    for j in range(0, 100000):
      base_url = f'https://people.cs.pitt.edu/~mzhang/image_ads/{i}/{j}.jpg'
      try:
        # One request yields both the status and the image bytes.
        response = session.get(base_url, timeout=30)
      except requests.exceptions.RequestException as error:
        # A transient network failure skips this image instead of aborting
        # the whole run.
        print("immagine non ritrovata: " + base_url + " (" + str(error) + ")")
        continue

      # Only a 200 response is an image; a plain "not 404" check would also
      # save error pages (403, 500, ...) as .jpg files.
      if response.status_code == 200:
        print("Immagine ottenuta: " + base_url)
        filepath = image_dir + "/ad_" + str(i) + "_" + str(j) + ".jpg"
        with open(filepath, 'wb') as handler:
          handler.write(response.content)
      else:
        print("immagine non ritrovata: " + base_url)