In [None]:
from bs4 import BeautifulSoup as bs 
import requests
import csv

<h4>Collect emails from a website and store the results into a csv file</h4>

### 1st website

In [None]:
# Specify the website and download it into the 'soup' variable.
website = 'URL HERE' # place your link on the quotes

# A timeout keeps this cell from hanging forever if the server never answers
# (requests waits indefinitely by default).
response = requests.get(website, timeout=30)
# Stop here on an HTTP error (404, 500, ...) instead of quietly parsing the
# server's error page and finding zero emails in it.
response.raise_for_status()
source = response.text

# Parse the downloaded html with the lxml parser.
soup = bs(source, 'lxml')

In [None]:
# Render the parsed document as indented html and show it.
# Reading it over is a quick check that the page was downloaded correctly.
formatted_html = soup.prettify()
print(formatted_html)

In [None]:
# Create (or extend) the csv file that stores the emails.
#   - 'a' appends to an existing csv. A 'w', for example, would write over
#     the existing content.
#   - newline='' lets the csv module control the line endings itself, which
#     is how the csv documentation says the file should be opened.
#   - the 'with' block closes the file even if the loop fails part-way.
with open('emails_scraped.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # In append mode the position starts at the end of the file, so 0 means
    # the file is brand new. Only then is the header written; re-running this
    # cell no longer drops an extra 'email' row in the middle of the data.
    if csv_file.tell() == 0:
        csv_writer.writerow(['email'])

    # loop through all the emails and store them onto csv
    for link in soup.find_all('a', class_='email'):
        email = link.text                  # this extracts the text from the html code
        print(email)                       # this just shows what the code is doing
        csv_writer.writerow([email])       # the code writes the email in a new csv row

### 2nd website

In [None]:
# specify website and download into 'soup' variable
website = 'URL HERE'
# A timeout avoids waiting forever on a silent server, and raise_for_status()
# stops the cell on an HTTP error instead of parsing an error page.
response = requests.get(website, timeout=30)
response.raise_for_status()
source = response.text
soup = bs(source, 'lxml')

# On this site the email lives in the link target ('href'), written as
# 'mailto:someone@example.com', rather than in the visible link text.
mailto_prefix = 'mailto:'

# Open the csv in append mode. newline='' is what the csv module expects, and
# the 'with' block closes the file even if something fails inside the loop.
with open('emails_scraped.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # loop through all the emails and store them onto csv
    for link in soup.find_all('a', class_='btn btn-main btn-sm email'):
        href = link.get('href')         # notice this doesn't use .text; .get() returns None if there is no href

        # Skip links with no href, or whose href is not a mailto link. Slicing
        # those blindly would crash on None or write a mangled value.
        if not href or not href.lower().startswith(mailto_prefix):
            continue

        email = href[len(mailto_prefix):]   # drop the leading 'mailto:' (7 characters)
        print(email)
        csv_writer.writerow([email])

### 3rd Website

#### This website has some null values in it, which stop the loop I used in the previous examples. Instead, I added handling that skips these values and continues on to the next email.

In [None]:
# specify website and download into 'soup' variable
website = 'URL HERE'
# Give up after 30 seconds instead of waiting forever on a silent server.
response = requests.get(website, timeout=30)
# Raise on HTTP errors (404, 500, ...) so an error page is never parsed as if
# it were the real content.
response.raise_for_status()
source = response.text
soup = bs(source, 'lxml')
# Uncomment the next line to inspect the downloaded html.
#print(soup.prettify())

In [None]:
# Append the emails from this page to the csv file.
# newline='' is what the csv module expects, and the 'with' block closes the
# file even if something fails inside the loop.
mailto_prefix = 'mailto:'

with open('emails_scraped.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # loop through all the emails and store them onto csv
    for div in soup.find_all('div', class_='tertiary'):
        # Some of the 'div' tags contain no 'a' tag at all; find() returns
        # None for those. They are skipped explicitly rather than through a
        # catch-all 'except', which would also hide real errors such as a
        # failed write to the csv file.
        link = div.find('a')
        if link is None:
            continue

        # .get() returns None when the tag has no href, where square brackets
        # would raise a KeyError.
        href = link.get('href')
        if not href or not href.lower().startswith(mailto_prefix):
            continue                    # not an email link, move on to the next div

        email = href[len(mailto_prefix):]   # drop the leading 'mailto:'
        print(email)
        csv_writer.writerow([email])

### 4th Website 

In [None]:
# specify website and download into 'soup' variable
website = 'URL HERE'
# requests has no default timeout, so set one to keep the cell from hanging.
response = requests.get(website, timeout=30)
# Fail right away on an HTTP error status rather than scraping an error page.
response.raise_for_status()
source = response.text
soup = bs(source, 'lxml')
# Uncomment the next line to inspect the downloaded html.
#print(soup.prettify())

In [None]:
# Append the emails from this page to the csv file.
# The 'with' block guarantees the file is closed, and newline='' lets the csv
# module manage line endings as its documentation requires.
mailto_prefix = 'mailto:'

with open('emails_scraped.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # loop through all the emails and store them onto csv
    for div in soup.find_all('div', class_='details'):
        link = div.find('a')            # None when this div has no 'a' tag
        if link is None:
            continue

        href = link.get('href')         # None when the 'a' tag has no href
        # Only mailto links hold an email. Anything else is skipped explicitly
        # instead of relying on a catch-all 'except' to hide the failure.
        if not href or not href.lower().startswith(mailto_prefix):
            continue

        email = href[len(mailto_prefix):]   # drop the leading 'mailto:'
        print(email)
        csv_writer.writerow([email])

### 5th website

#### This was the most challenging code. This website had multiple 'a' tags inside the parent 'div' tag, and only one of them (in each 'div') contained the email in the 'href' section. Thus, web scraping would only return the values in which the email was positioned on the first 'a' tag, and this was the case for only SOME of the emails.

#### So, instead of scraping the information directly as I did in all previous examples, I did the following:
#### 1. I placed all the 'a' tags in a list, which would automatically save as tuples
#### 2. I converted the tuples into lists elements so that I could manipulate the values
#### 3. Converted each element into a string
#### 4. Verified that the word "mailto" was somewhere in that element
#### 5. If it was in the element, then it would slice out the first 36 characters and the last 76 characters and add it to the csv file
#### 6. If not, then print "no". This helped me verify that the code is doing what I intend for it to do
#### 7. Finally, the loop increased 'i' by 1 so it could move on to the next element

In [None]:
# specify website and download into 'soup' variable
website = 'YOUR URL'
# Bound the wait with a timeout; by default requests would wait indefinitely.
response = requests.get(website, timeout=30)
# Surface HTTP errors here so the scraping cell never runs on an error page.
response.raise_for_status()
source = response.text
soup = bs(source, 'lxml')
# Uncomment the next line to inspect the downloaded html.
#print(soup.prettify())

In [None]:
# Append the emails from this page to the csv file.
#
# Every 'a' tag on the page is examined, because the email link is not always
# the first 'a' tag inside its parent 'div'. The email is read straight from
# the tag's 'href' attribute. This replaces an earlier approach that:
#   - started its counter at 1, so the very first link was never checked;
#   - only stopped when the counter ran past the end of the list, hiding the
#     resulting IndexError (and any other error) behind a catch-all 'except';
#   - cut the email out of the tag's full html text with fixed offsets
#     ([36:-76]), which only works while the markup has that exact length.
mailto_prefix = 'mailto:'

# newline='' is what the csv module expects; 'with' closes the file for us
# even if something fails inside the loop.
with open('emails_scraped.csv', 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # find_all() returns the matching tags in document order
    for link in soup.find_all('a'):
        href = link.get('href') or ''   # tags without an href count as "not an email"

        if href.lower().startswith(mailto_prefix):
            # Drop the 'mailto:' prefix, plus anything after a '?', which a
            # mailto link may use for extras such as '?subject=...'.
            email_isolated = href[len(mailto_prefix):].split('?', 1)[0]
            csv_writer.writerow([email_isolated])
            print(email_isolated)
        else:
            # shows that non-email links are being seen and skipped
            print('no')