# Task 
- Download the CSV file from the link- https://drive.google.com/file/d/1PLYwrGn5YApyWU2QpjbdhM6tea0HuGq7/view?usp=sharing
- Scrape the Twitter profile with Python Selenium or Beautiful Soup
- Details needed are - `Bio, Following Count, Followers Count, Location, Website(If available)`
- The program should create a CSV file with the above columns.

The code should be well commented and optimized; extra marks will be awarded for that.
Make a short video (under 10 minutes) explaining and running the task.

You don't need to show long-running code.

Upload your code to a GitHub public repository
Submit the video and the GitHub link for the same in the submission form


Scrape the Twitter profiles with Python Selenium from the links stored in the CSV file. Obtain `Bio, Following Count, Followers Count, Location, Website` from each profile where available.

In [3]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


In [4]:
def launch_chrome(link, headless=False):
    """Start a Chrome session, open ``link`` in it and return the driver.

    Args:
        link: URL passed to ``driver.get`` once the browser is up.
        headless: When truthy, Chrome is started with ``--headless``.

    Returns:
        The ``webdriver.Chrome`` instance, already navigated to ``link``.
        The caller is responsible for closing it.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument('--headless')

    # "detach" is meant to keep the browser from closing when this
    # function returns (see the ChromeDriver capability docs).
    chrome_options.add_experimental_option("detach", True)

    # webdriver_manager resolves a chromedriver binary and returns its path.
    chromedriver_path = ChromeDriverManager().install()
    browser = webdriver.Chrome(
        options=chrome_options,
        service=Service(chromedriver_path),
    )

    browser.get(link)
    return browser


In [5]:
def get_element(driver, path, timeout):
    """Wait for the element at XPath ``path`` and return it, or ``None`` on timeout.

    Args:
        driver: Selenium driver to search in.
        path: XPath expression locating the element.
        timeout: Maximum time handed to ``WebDriverWait`` before giving up.

    Returns:
        The element returned by the wait, or ``None`` if the wait raised
        ``TimeoutException`` (a message and the XPath are printed in that case).

    Raises:
        Any exception other than ``TimeoutException`` raised by the wait.
        The previous ``return`` inside ``finally`` silently discarded these,
        including ``KeyboardInterrupt``, so a run could not be interrupted
        and real driver failures were recorded as missing fields.
    """
    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, path)))
    except TimeoutException:
        print("Timed out waiting for page to load and finding the required web element.")
        print(path)
        return None


## Get links from the CSV file

In [6]:
# Profile URLs to scrape, taken from the first column of twitter_links.csv.
links = []

# newline='' is what the csv module documentation asks for when opening a
# file for csv.reader, so that embedded newlines are handled correctly.
with open('twitter_links.csv', 'r', newline='') as links_csv:
    for row in csv.reader(links_csv):
        # csv.reader yields an empty list for a blank line; indexing row[0]
        # there would raise IndexError. Rows whose first cell is blank are
        # skipped as well, since there is nothing to visit.
        if row and row[0].strip():
            links.append(row[0].strip())

print(links)

['https://twitter.com/GTNUK1', 'https://twitter.com/whatsapp', 'https://twitter.com/aacb_CBPTrade', 'https://twitter.com/aacbdotcom', 'https://twitter.com/@AAWindowPRODUCT', 'https://www.twitter.com/aandb_kia', 'https://twitter.com/ABHomeInc', 'https://twitter.com/Abrepro', 'http://www.twitter.com', 'https://twitter.com/ACChristofiLtd', 'https://twitter.com/aeclothing1', 'http://www.twitter.com/', 'https://twitter.com/AETechnologies1', 'http://www.twitter.com/wix', 'https://twitter.com/AGInsuranceLLC']


In [7]:
# Absolute XPath prefix shared by every field looked up below.
_PROFILE_HEADER = (
    '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div[1]'
    '/div/div[3]/div/div/div/div'
)

# Field name -> absolute XPath of the element holding that field.
# The keys are also used as CSV column headers, so their order matters.
xpaths = {
    field: _PROFILE_HEADER + tail
    for field, tail in (
        ('BIO', '/div[3]'),
        ('LOCATION', '/div[4]/div/span[1]/span'),
        ('FOLLOWING', '/div[5]/div[1]/a/span[1]/span'),
        ('FOLLOWERS', '/div[5]/div[2]/a/span[1]/span'),
        ('WEBSITE', '/div[4]/div/a/span'),
    )
}

In [8]:
def get_details(drive):
    """Collect the text of every field listed in ``xpaths`` from the open page.

    The fields are read in the iteration order of ``xpaths``, which is the
    same order used to build the CSV header from ``xpaths.keys()``. Deriving
    both from the one mapping keeps values and column names aligned even if
    a field is added, removed or reordered.

    Args:
        drive: Selenium driver already showing the profile page.

    Returns:
        A list with one entry per key of ``xpaths``: the element's ``text``,
        or ``None`` when ``get_element`` did not find the element.
    """
    details = []
    for path in xpaths.values():
        element = get_element(drive, path, timeout=10)
        # get_element signals "not found" with None; test for exactly that
        # rather than relying on the element object's truthiness.
        details.append(element.text if element is not None else None)

    print(details)
    return details

In [10]:
# Column names for the scraped values, in the iteration order of xpaths.
fields = list(xpaths.keys())

# Open with 'w', not 'a': this cell writes the header row itself, so
# appending would add a second header and duplicate rows on every re-run.
with open('twitter_data.csv', 'w', encoding='utf8', newline='') as twt:

    # csv.writer quotes fields as needed, including bios with newlines.
    write = csv.writer(twt)
    write.writerow(["Sr. No.", "LINK"] + fields)

    for i, link in enumerate(links):

        tdriver = launch_chrome(link)
        try:
            details = get_details(tdriver)
        finally:
            # Always shut the browser down, even if scraping this profile
            # raised; otherwise each failure would leave a browser running.
            tdriver.quit()

        write.writerow([i+1, link]+details)

['Providing Entertainment & Travel to Commercial Radio. Reaching 28.9M weekly listeners. Winners of The Arqiva National Sales Team of the Year 2010, 2011 & 2016', 'London, England', '463', '126', 'gtn.uk.com/index.php']
['Happy #PrideMonth \n#CrossingCultures Ep 2, out now ', 'California', '2', '4.8M', 'bit.ly/3IRGfXH']
Timed out waiting for page to load and finding the required web element.
//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div[1]/div/div[3]/div/div/div/div/div[4]/div/a/span
['Customs Broker', 'Florida, USA', '125', '31', None]
['A & A Freight | Warehousing | Customs Brokerage | Helping people ship across borders.', 'Worldwide', '4,078', '665', 'aacb.com']
['A commercial glass and glazing company serving the window industry in New England since 1954. #SafetyQualityService', 'Malden, MA', '90', '76', 'aawindowproducts.com']
['A&B Kia is a Kia dealer in Benwood, WV. Stay connected to exceed expectations. Build strong relationships. Drive the best with us.', 'Benwood,

**The timeout message appears when a profile has no website link, so the WEBSITE element is never found.**

In [11]:
import pandas as pd

**Escape newline characters in the bio, which otherwise break the row format in the CSV**

In [19]:
# Re-read the scraped file so the BIO column can be rewritten on one line.
df = pd.read_csv('twitter_data.csv')

# Replace each real newline with the two characters backslash + n.
# regex=False pins literal matching: the default of `regex` differs between
# pandas versions, and as a regex replacement '\\n' would mean a newline.
df["BIO"] = df["BIO"].str.replace('\n', '\\n', regex=False)

# index=False: the file already has its own "Sr. No." column.
df.to_csv("twitter_data.csv", index=False)