In [2]:
import pandas as pd
import numpy as np
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

In [3]:
# Load the Slack member export into a DataFrame. The path is relative, so the CSV
# is looked up in the directory the notebook kernel is running from.
members_csv_path = "slack-ieee-ucb-members.csv"
members_csv = pd.read_csv(filepath_or_buffer=members_csv_path)
members_csv

Unnamed: 0,username,email,status,billing-active,has-2fa,has-sso,userid,fullname,displayname,expiration-timestamp
0,jared,rulison@berkeley.edu,Deactivated,0,0,0,U1J8N2NSF,Jared Rulison,jared,
1,vicky11z,vicky11z@berkeley.edu,Deactivated,0,0,0,U1J9V8VRC,Vicky Zhang,vicky11z,
2,ieeeucb,ieeeucb@gmail.com,Primary Owner,0,0,0,U1J9ZTAJ3,IEEE Account,ieeeucb,
3,kevinma,kevinma.sd@berkeley.edu,Member,0,0,0,U1JAHG117,Kevin Ma,kevinma,
4,cporter,cporter15@berkeley.edu,Deactivated,0,0,0,U1KSG2XLH,Chase Porter,cporter,
...,...,...,...,...,...,...,...,...,...,...
784,omairkhan,omairkhan@berkeley.edu,Member,1,0,0,U01J6DP124B,Omair Khan,Omair Khan,
785,leobookey,leobookey@berkeley.edu,Member,1,0,0,U01JB3Q4FNF,Leo Bookey,Leo Bookey,
786,rexliu3,rexliu3@berkeley.edu,Member,1,0,0,U01JGSCP96J,Rex Liu,Rex Liu,
787,zoehinks,zoehinks@berkeley.edu,Member,1,0,0,U01JSFZ6RK5,Robin Hinks,Robin Hinks,


In [4]:
# Narrow the export down to the rows of interest:
#   * drop every account whose status is 'Deactivated'
#   * drop every account whose billing-active flag is 1
is_enabled = members_csv['status'] != 'Deactivated'
is_not_billing_active = members_csv['billing-active'] != 1
members_csv = members_csv[is_enabled & is_not_billing_active]

members_csv

Unnamed: 0,username,email,status,billing-active,has-2fa,has-sso,userid,fullname,displayname,expiration-timestamp
2,ieeeucb,ieeeucb@gmail.com,Primary Owner,0,0,0,U1J9ZTAJ3,IEEE Account,ieeeucb,
3,kevinma,kevinma.sd@berkeley.edu,Member,0,0,0,U1JAHG117,Kevin Ma,kevinma,
13,daminig,daminig@berkeley.edu,Member,0,0,0,U2AEXK1R9,Damini Grover,daminig,
20,brent,brentyi@berkeley.edu,Admin,0,0,0,U2D1SNSUE,Brent Yi,brent,
24,billz12oz,william_zhao@berkeley.edu,Member,0,0,0,U2D2AHPCZ,Billy Zhao,billz12oz,
...,...,...,...,...,...,...,...,...,...,...
777,eustyn_trinh,eustyn_trinh@berkeley.edu,Member,0,0,0,U01DN805KEG,Waterstar1,Waterstar1,
779,richard95lee,richard95lee@berkeley.edu,Member,0,0,0,U01E93SKUPL,Richard Lee,Richard Lee,
780,scheduler,botuser-T1JAA6H2N-B01EPJZ3EHH@slack-bots.com,Bot,0,0,0,U01F220B6U9,Message Scheduler,,
781,xr,xr@berkeley.edu,Member,0,0,0,U01F89GEPS5,Ray Xi,Ray Xi,


In [5]:
# Working table for the directory check: just the identifying columns, copied so that
# adding a column here does not touch members_csv.
member_activity = members_csv[['username', 'email', 'status', 'fullname']].copy()
# Placeholder for the lookup result. Nothing in this cell fills it in, so start with a
# real missing value rather than the string 'nan': the string only *looks* missing, and
# isna()/notna() would report every row as already populated.
member_activity['in_directory'] = pd.NA
member_activity

Unnamed: 0,username,email,status,fullname,in_directory
2,ieeeucb,ieeeucb@gmail.com,Primary Owner,IEEE Account,
3,kevinma,kevinma.sd@berkeley.edu,Member,Kevin Ma,
13,daminig,daminig@berkeley.edu,Member,Damini Grover,
20,brent,brentyi@berkeley.edu,Admin,Brent Yi,
24,billz12oz,william_zhao@berkeley.edu,Member,Billy Zhao,
...,...,...,...,...,...
777,eustyn_trinh,eustyn_trinh@berkeley.edu,Member,Waterstar1,
779,richard95lee,richard95lee@berkeley.edu,Member,Richard Lee,
780,scheduler,botuser-T1JAA6H2N-B01EPJZ3EHH@slack-bots.com,Bot,Message Scheduler,
781,xr,xr@berkeley.edu,Member,Ray Xi,


In [37]:
# an example of when the email exists in the directory
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
# No trailing newline: the check should depend on the message itself, not on whatever
# whitespace the page happens to put after it.
does_not_exist = 'No matches to your search. Please try again.'
test_URL_1 = "https://www.berkeley.edu/directory/results?search-term=indianjit%40berkeley.edu"
# Without a timeout requests waits indefinitely if the server never answers.
test_page_1 = requests.get(test_URL_1, headers=headers, timeout=30)
# Fail loudly on an HTTP error (e.g. 403) instead of parsing the error page as if it
# were a search result.
test_page_1.raise_for_status()

test_soup_1 = BeautifulSoup(test_page_1.content, 'html.parser')
search_results_1 = test_soup_1.find_all('section', {'class': 'search-results'})

# Each results section prints False when it holds a real match and True when it holds
# the "no matches" message, followed by the section's text.
for result in search_results_1:
    print(does_not_exist in result.text)
    print(result.text)

False

Indianjit Singh
Emailindianjit@berkeley.edu
UID1718546



In [36]:
# an example of when the email does not exist in the directory
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
does_not_exist = 'No matches to your search. Please try again.'
test_URL_2 = 'https://www.berkeley.edu/directory/results?search-term=brentyi%40berkeley.edu'
# Without a timeout requests waits indefinitely if the server never answers.
test_page_2 = requests.get(test_URL_2, headers=headers, timeout=30)
# Fail loudly on an HTTP error (e.g. 403) instead of parsing the error page as if it
# were a search result.
test_page_2.raise_for_status()

test_soup_2 = BeautifulSoup(test_page_2.content, 'html.parser')
search_results_2 = test_soup_2.find_all('section', {'class': 'search-results'})

# Each results section prints True when it holds the "no matches" message and False
# otherwise, followed by the section's text.
for result in search_results_2:
    print(does_not_exist in result.text)
    print(result.text)

True

No matches to your search. Please try again.



In [9]:
# Look up every member email in the cal directory and record the outcome in
# member_activity['in_directory'], stalling between requests to avoid 403 forbidden.
base_URL = "https://www.berkeley.edu/directory/results"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
# Text the results page shows when a search finds nobody.
does_not_exist = 'No matches to your search. Please try again.'
request_timeout = 30  # seconds to wait for the server before giving up on one lookup
request_delay = 5     # seconds to pause between lookups

for row_label, email in member_activity['email'].items():
    if not isinstance(email, str):
        # A missing email cannot be searched for; leave in_directory as it is.
        continue

    try:
        # params= makes requests percent-encode the whole address, not only the '@'.
        page = requests.get(
            base_URL,
            params={'search-term': email},
            headers=headers,
            timeout=request_timeout,
        )
        page.raise_for_status()
    except requests.RequestException as error:
        # Report the failure and move on so one bad lookup does not discard the
        # results already written for earlier rows.
        print(f"lookup failed for {email}: {error}")
    else:
        print(page.url)
        soup = BeautifulSoup(page.content, 'html.parser')
        search_results = soup.find_all('section', {'class': 'search-results'})
        if search_results:
            # Listed unless a results section carries the "no matches" message.
            member_activity.loc[row_label, 'in_directory'] = not any(
                does_not_exist in section.text for section in search_results
            )
        else:
            # Unexpected page layout: do not guess, leave the row unchanged.
            print(f"no search-results section for {email}; in_directory left unchanged")

    time.sleep(request_delay)

https://www.berkeley.edu/directory/results?search-term=ieeeucb%40gmail.com
https://www.berkeley.edu/directory/results?search-term=kevinma.sd%40berkeley.edu
https://www.berkeley.edu/directory/results?search-term=daminig%40berkeley.edu


KeyboardInterrupt: 