# Table of Contents

* [Initialize](#Initialize)
* [Scraper](#Scrapper)
* [Clean Data](#Clean-Data)
* [Store as CSV](#Store-as-CSV)
* [Appendix](#Appendix)
  * [Load Data from Roster Dictionary](#Load-From-Roster)
  * [Load Data from CSV](#Load-From-CSV)

# Initialize

**Jump-to**: [Table of Contents](#Table-of-Contents) | [Scraper](#Scrapper) | [Clean Data](#Clean-Data) | [Store as CSV](#Store-as-CSV) | [Load Data from Roster Dictionary](#Load-From-Roster) | [Load Data from CSV](#Load-From-CSV)

In [None]:
import pandas as pd
import numpy as np
import requests
import random
import time
import datetime as dt
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

from env import muser, mpass
# Visible confirmation that every import above (including the credentials
# pulled from env) succeeded
print('Init')

# Scrapper

**Jump-to**: [Table of Contents](#Table-of-Contents) | [Scraper](#Scrapper) | [Clean Data](#Clean-Data) | [Store as CSV](#Store-as-CSV) | [Load Data from Roster Dictionary](#Load-From-Roster) | [Load Data from CSV](#Load-From-CSV)

In [None]:
# --- Browser setup (notification preference snippet credited to pythonjar) ---
# Sets Chrome's notification content-setting preference to 2.
# NOTE(review): 2 is presumably the "block" value, which keeps the permission
# pop-up from covering the page - confirm against the Chrome preference docs.
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("prefs", prefs)

# Location of chromedriver.exe on this machine
PATH = 'C:/Program Files (x86)/chromedriver.exe'
# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version - confirm it is still accepted before upgrading Selenium.
driver = webdriver.Chrome(PATH, options=chrome_options)

# --- Log in ---
driver.get("http://www.facebook.com")

# CSS locators for the pieces of the login form
email_locator = (By.CSS_SELECTOR, "input[name='email']")
pass_locator = (By.CSS_SELECTOR, "input[name='pass']")
submit_locator = (By.CSS_SELECTOR, "button[type='submit']")

# Give each login box up to 10 seconds to become clickable
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(email_locator))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(pass_locator))

# Empty each field, then type in the matching credential from env
for field, secret in ((username, muser), (password, mpass)):
    field.clear()
    field.send_keys(secret)

# Give the submit button up to 5 seconds to become clickable, then press it
login_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable(submit_locator))
login_button.click()

# Fixed 30-second pause so the two-factor security code can be entered by hand
time.sleep(30)

# Extra random pause (4-10 s) so the landing page can finish loading before
# navigating to the group members page.
# Note - if facebook limits this behavior, simulating clicks might be necessary
time.sleep(random.randrange(4, 11))

# Accumulator for the scraped member records (a list of dictionaries)
rosters = []

# Members page of the group to scrape; replace the URL with the group you want to look at
group = 'https://www.facebook.com/groups/tribunecommons/members/'

# Open the members page and let it settle for a random 5-9 seconds
driver.get(group)
time.sleep(random.randrange(5, 10))
last_height = driver.execute_script("return document.body.scrollHeight")

# Keep scrolling until the page height stops growing, i.e. the whole member
# list has been loaded, before the page source is pulled
reached_bottom = False
while not reached_bottom:
    # Jump to the current bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Random 6-11 second pause so the next batch of members can load
    time.sleep(random.randrange(6, 12))

    # An unchanged height after scrolling means nothing new was loaded
    new_height = driver.execute_script("return document.body.scrollHeight")
    reached_bottom = new_height == last_height
    last_height = new_height
    
    
html = driver.page_source
# Parse the fully scrolled page with the built-in html parser
soup = BeautifulSoup(html, "html.parser")
# Collect every div card that holds one member's information.
# Note the class values are likely to change, check it yourself.
divs = soup.find_all('div', class_='ow4ym5g4 auili1gw rq0escxv j83agx80 buofh1pr g5gj957u i1fnvgqd oygrvhab cxmmr5t8 hcukyx3x kvgmc6g5 nnctdnn4 hpfvmrgz qt6c0cv9 jb3vyjys l9j0dhe7 du4w35lb bp9cbjyn btwxx1t3 dflh9lhu scb9dxdr')

# Every record from this scrape carries the same date, so compute it once
# instead of once per card
date = pd.to_datetime('today').date()

# Walk each div (data card) and pull the member's name out of it.
# Check each class label before running as Facebook changes these often
for div in divs:
    names = div.find_all('a', class_='oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p')

    # A card may contain no matching link (name stays blank) or several
    # (the last one wins, as before)
    person = names[-1].text if names else ''

    # One dictionary per member card, appended to the roster list
    dicob = {'name': person,
             'active': date}
    rosters.append(dicob)
# Short fixed pause after scraping before the browser is shut down
time.sleep(8)

# Shut the browser down. quit() ends the whole WebDriver session (all windows
# plus the chromedriver process); close() only closed the current window and
# left the driver process running.
driver.quit()

# Clean Data

**Jump-to**: [Table of Contents](#Table-of-Contents) | [Scraper](#Scrapper) | [Clean Data](#Clean-Data) | [Store as CSV](#Store-as-CSV) | [Load Data from Roster Dictionary](#Load-From-Roster) | [Load Data from CSV](#Load-From-CSV)

In [None]:
# Names to leave out of the retention data: all moderators/admins ("modmins")
# and blocked people. This list is applied when the roster is turned into a DF.
# Note - matching is by name only, so two people who share a name but have
# different facebook IDs are not told apart. This can be fixed later
ignore = [
    'Anthony Rivera Straine',
    'Frank Straine',
    'CrowdSourced Tribune',
    'Mateo DeGall',
    'Shawn Dixon',
    'Jennifer P. Travis',
    'Nikolette Adams',
    'Lynn Paluga',
    'Andrew Harris',
    'David Marc Grant',
    'Aaron Bratton',
    'Peter William Essig',
    'Vanessa Bibb-Cook',
    'Ellen Grizwold',
    'Ed Cummings',
    'Kim Eckhoff',
    'Nozofmary Ham',
]

# Show how many names are on the list
len(ignore)

In [None]:
# Convert the roster list to a pandas data frame and drop duplicate rows.
# Naming the columns explicitly keeps 'name'/'active' present even when the
# scrape came back empty (e.g. Facebook changed its class names), so the
# filter below yields an empty frame instead of raising KeyError.
retention = pd.DataFrame(rosters, columns=['name', 'active']).drop_duplicates()
# Drop any row whose name is 'Learn More', blank (''), or on the ignore list
excluded_names = ['Learn More', '', *ignore]
retention = retention[~retention['name'].isin(excluded_names)]
# Renumber the rows from 0; the old index values do not matter
retention = retention.reset_index(drop=True)
retention

# Store as CSV

This finalizes the retention DF (for now). From here we can store the results as a usable CSV.

**Jump-to**: [Table of Contents](#Table-of-Contents) | [Scraper](#Scrapper) | [Clean Data](#Clean-Data) | [Store as CSV](#Store-as-CSV) | [Load Data from Roster Dictionary](#Load-From-Roster) | [Load Data from CSV](#Load-From-CSV)

In [None]:
# Load the previously saved retention history. The "Unnamed: 0" column is the
# row index that to_csv wrote alongside the data, so it is discarded.
current = pd.read_csv('Retention.csv').drop(columns=["Unnamed: 0"])
current

In [None]:
# Stack the saved history on top of today's scrape and renumber the rows from 0
retention = pd.concat([current, retention], ignore_index=True)
retention

In [None]:
# Overwrite the stored history with the combined frame. The row index is
# written too (pandas' default); the loading cells drop it again as "Unnamed: 0".
retention.to_csv('Retention.csv', index=True)

----

# Appendix

### Load From Roster

Use the cell below if you've already acquired the roster List and want to turn it into a CSV

**Jump-to**: [Table of Contents](#Table-of-Contents) | [Scraper](#Scrapper) | [Clean Data](#Clean-Data) | [Store as CSV](#Store-as-CSV) | [Load Data from Roster Dictionary](#Load-From-Roster) | [Load Data from CSV](#Load-From-CSV)

In [None]:
##########################################################
### cell for recapturing dataframe from stored rosters ###
##########################################################

# Names to strip out: 'Learn More' entries, your own name, and blanks ('')
unwanted_names = ['Learn More', 'Anthony Rivera Straine', '']

# Turn the roster list into a data frame without duplicate whole rows
recruit = pd.DataFrame(rosters).drop_duplicates()
# Remove the unwanted names and renumber the rows from 0 (old index is irrelevant)
recruit = recruit[~recruit['name'].isin(unwanted_names)].reset_index(drop=True)

# Persist the result, row index included
recruit.to_csv("Recruitment.csv")

### Load From CSV

Use the cell below if you've already acquired the CSV and want to make changes to it

**Jump-to**: [Table of Contents](#Table-of-Contents) | [Scraper](#Scrapper) | [Clean Data](#Clean-Data) | [Store as CSV](#Store-as-CSV) | [Load Data from Roster Dictionary](#Load-From-Roster) | [Load Data from CSV](#Load-From-CSV)

In [None]:
######################################################
### cell for recapturing dataframe from stored csv ###
######################################################

# Read the stored history, discard the saved row index ("Unnamed: 0"), and
# order the rows by member name and then by the 'active' value as stored
current = (
    pd.read_csv('Retention.csv')
    .drop(columns=["Unnamed: 0"])
    .sort_values(by=["name", "active"])
    .reset_index(drop=True)
)
# Optional constant flag column; the pivot cell at the end of the notebook
# reads a 'retained' column
#current['retained'] = 1
# Parse 'active' as dates and collapse each one to its calendar month
current['active'] = pd.to_datetime(current['active']).dt.to_period('M')
current

In [None]:
# Within each member's rows, look one row back and one row ahead in 'active'
active_by_name = current.groupby(['name'])[['active']]
# Value from the member's preceding row (missing on their first row)
type1 = active_by_name.shift(1).rename(columns={'active': 'previous_active'})
# Value from the member's following row (missing on their last row)
type2 = active_by_name.shift(-1).rename(columns={'active': 'next_active'})
# Put the original columns and both shifted columns side by side
analysis = pd.concat([current, type1, type2], axis=1)
analysis

In [None]:
# Spot-check: show every analysis row for one member
analysis.loc[analysis['name'] == "Chris Soria"]

In [None]:
# Reshape to one row per member and one column per distinct 'active' value,
# with cells taken from the 'retained' column.
# NOTE(review): the only visible assignment of 'retained' (in the load-from-CSV
# cell) is commented out - confirm the column exists before running this cell.
pivotz = current.pivot(index='name', columns='active', values='retained')
# Members flagged 1 in the 2021-05-04 column that have no entry in the
# 2021-06-23 column or in the 2021-07-22 column.
# NOTE(review): these labels are day-level strings, while the load-from-CSV cell
# converts 'active' to monthly periods - confirm the column labels still match.
pivotz[(pivotz['2021-05-04'] == 1) & ((pivotz['2021-06-23'].isnull()) | (pivotz['2021-07-22'].isnull()))]

----