# ECL company ticker lookup and GuruFocus financial data download

In [1]:
import datetime
import io
import os
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

# Browser-like HTTP headers sent with every GuruFocus search request.
# HTTP header names are case-insensitive, so the mapping must contain a single
# user-agent entry; only the Chrome/115 value (the later of the two entries that
# used to be listed) is kept.
headers = {
    'authority': 'www.google.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
}

In [2]:
# Punctuation stripped from company names: backslash, slash, dot, comma, double quote.
pattern = re.compile(r"[\\/.,\"]")


def clear_company_name(company_name):
    """Normalise a company name into a query-string-safe search term.

    Steps: lower-case the name, cut everything from the first backslash,
    slash or opening parenthesis onwards, remove the punctuation matched by
    ``pattern``, join the remaining words with '+', and percent-encode every
    other reserved character.

    Args:
        company_name: Raw company name.

    Returns:
        The cleaned name with words separated by single '+' characters and
        characters such as '&' or '#' percent-encoded, so it can be placed
        directly after ``text=`` in a URL.
    """
    company_name = company_name.lower()
    company_name = re.sub(r"[\\/(].*", '', company_name)
    company_name = pattern.sub('', company_name)
    # split()/join trims the ends and collapses whitespace runs, so removing
    # punctuation between words can no longer produce '++' in the result.
    company_name = '+'.join(company_name.split())
    # Keep '+' as the word separator; escape everything else that would be
    # interpreted as URL syntax (e.g. the '&' in "at&t").
    return quote(company_name, safe='+')

In [None]:
def get_datetime_now():
    """Return the current local date and time formatted as 'YYYY-MM-DD_HH-MM-SS'."""
    current_moment = datetime.datetime.now()
    return format(current_moment, "%Y-%m-%d_%H-%M-%S")

In [None]:
# Load ECL.csv from the working directory, using its first column as the index.
# NOTE(review): `dataset` is not referenced again in the cells visible here — confirm it is still needed.
dataset = pd.read_csv('ECL.csv', index_col=0)

In [None]:
# Timestamp suffix of the input files produced by the initial-processing step.
# It is kept as a string in the same "%Y-%m-%d_%H-%M-%S" layout this notebook uses
# for its own output file names; strptime validates the literal, strftime turns it
# back into the file-name form (no spaces or colons).
# NOTE(review): presumably the input files were written with this layout — confirm.
datetime_of_initial_processing = datetime.datetime.strptime(
    '2021-10-01_00-00-00', "%Y-%m-%d_%H-%M-%S"
).strftime("%Y-%m-%d_%H-%M-%S")

# Load the two inputs whose file names carry the initial-processing timestamp.
# `prediction_subset` is later filtered on its 'cik' column; `unique_companies`
# is later iterated as (cik, company) rows.
prediction_subset = pd.read_csv(f'ECL_{datetime_of_initial_processing}.csv', index_col=0)
unique_companies = pd.read_csv(f'ECL_unique_companies_{datetime_of_initial_processing}.csv', index_col=0)

In [None]:
# Look up a GuruFocus ticker for every unique company and record the first (and,
# if present, second) NYSE/NAS/DELISTED match on the rows of `prediction_subset`
# that share the company's CIK.
tqdm.pandas()

found_companies_count = 0

# Companies for which the search returned nothing usable.
companies_with_not_found_tickers_list = []
# Companies whose lookup raised; the matching exception objects go to exceptions_list.
companies_with_not_found_tickers_with_exception = []
exceptions_list = []

pbar = tqdm(unique_companies.values.tolist())

for cik, company in pbar:

    cleared_company_name = clear_company_name(company)
    url = f'https://www.gurufocus.com/reader/_api/_search?v=1.4.13&text={cleared_company_name}'

    try:

        try:
            # The timeout keeps a single stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=30)
            response_json = response.json()
        except Exception as ex:
            companies_with_not_found_tickers_with_exception.append({'cik': cik, 'company': company})
            exceptions_list.append(ex)
            time.sleep(0.7)
            continue

        time.sleep(0.2)

        if len(response_json) == 0:
            # Nothing found: retry once with legal suffixes normalised. The
            # patterns only match a whole '+'-separated word, so e.g. '+corp'
            # or '+income' are left untouched.
            retry_company_name = re.sub(r'\+(?:inc|llc)(?=\+|$)', '', cleared_company_name)
            retry_company_name = re.sub(r'\+co(?=\+|$)', '+company', retry_company_name)
            retry_company_name = retry_company_name.strip()

            if retry_company_name != cleared_company_name:
                cleared_company_name = retry_company_name
                url = f'https://www.gurufocus.com/reader/_api/_search?v=1.4.13&text={cleared_company_name}'
                try:
                    response = requests.get(url, headers=headers, timeout=30)
                    response_json = response.json()
                except Exception as ex:
                    companies_with_not_found_tickers_with_exception.append({'cik': cik, 'company': company})
                    exceptions_list.append(ex)
                    time.sleep(0.7)
                    continue
                # Same pause as after the first request.
                time.sleep(0.2)

            if len(response_json) == 0:
                companies_with_not_found_tickers_list.append({'cik': cik, 'company': company})
                continue

        is_company_found = False
        matched_type_counter = 0
        cik_rows = prediction_subset['cik'] == cik

        for entry in response_json:
            if entry['type'] not in ('stock', 'delisted'):
                # Stop at the first non-stock entry once at least one match was stored.
                if matched_type_counter > 0:
                    break
                continue

            exchange = entry['data']['exchange']
            if exchange not in ('NYSE', 'NAS', 'DELISTED'):
                continue

            ticker = entry['data']['symbol']
            stockid = entry['data']['stockid']

            if matched_type_counter == 0:
                prediction_subset.loc[cik_rows, 'ticker'] = ticker
                prediction_subset.loc[cik_rows, 'exchange'] = exchange
                prediction_subset.loc[cik_rows, 'gurufocus-stockid'] = stockid

                is_company_found = True
                found_companies_count += 1
                pbar.set_description(f"Found companies count: {found_companies_count}")
            else:
                prediction_subset.loc[cik_rows, 'second-match-ticker'] = ticker
                prediction_subset.loc[cik_rows, 'second-match-exchange'] = exchange
                prediction_subset.loc[cik_rows, 'second-match-gurufocus-stockid'] = stockid

            matched_type_counter += 1
            # Only the first two matches are kept; later ones must not
            # overwrite the second-match columns.
            if matched_type_counter == 2:
                break

        if not is_company_found:
            companies_with_not_found_tickers_list.append({'cik': cik, 'company': company})

    except Exception as ex:
        # Unexpected payload shape (missing keys, non-list JSON, ...): log and move on.
        companies_with_not_found_tickers_with_exception.append({'cik': cik, 'company': company})
        exceptions_list.append(ex)
        time.sleep(0.7)

companies_with_not_found_tickers_df = pd.DataFrame(companies_with_not_found_tickers_list)
companies_with_not_found_tickers_with_exception_df = pd.DataFrame(companies_with_not_found_tickers_with_exception)

In [None]:
# Report how many companies were matched, unmatched, or failed with an exception.
print(
    f"Number of companies with found tickers: {found_companies_count}",
    f"Number of companies with not found tickers: {len(companies_with_not_found_tickers_list)}",
    f"Number of companies with exceptions: {len(companies_with_not_found_tickers_with_exception_df)}",
    sep="\n",
)

In [None]:
# Persist the lookup results; all three files share one timestamp suffix.
datetime_now = get_datetime_now()

lookup_outputs = {
    f'ECL_with_ticker_{datetime_now}.csv': prediction_subset,
    f'ECL_companies_with_not_found_tickers_{datetime_now}.csv': companies_with_not_found_tickers_df,
    f'ECL_companies_with_not_found_tickers_with_exception_{datetime_now}.csv': companies_with_not_found_tickers_with_exception_df,
}
for output_path, output_frame in lookup_outputs.items():
    output_frame.to_csv(output_path)

## Download data from gurufocus.com
#### Download data for companies with one ticker (then for companies with two tickers)

In [None]:
# Reload a previously saved lookup result (hard-coded run timestamp) so the
# download steps below can run without repeating the ticker search.
prediction_subset = pd.read_csv('ECL_with_ticker_2024-04-14_15-20-33.csv', index_col=0)

In [None]:
# Collapse `prediction_subset` to one row per CIK, taking pandas' 'last'
# aggregate of each listed column and renaming the hyphenated columns to
# underscore form.
per_cik_aggregations = {
    'company': ('company', 'last'),
    'ticker': ('ticker', 'last'),
    'second_match_ticker': ('second-match-ticker', 'last'),
    'gurufocus_stockid': ('gurufocus-stockid', 'last'),
    'second_match_gurufocus_stockid': ('second-match-gurufocus-stockid', 'last'),
}
rows_grouped_by_cik = prediction_subset.groupby('cik')
prediction_subset_with_gurufocus_data = rows_grouped_by_cik.agg(**per_cik_aggregations).reset_index()

In [None]:
# Keep the companies that have a first ticker but no second match.
has_first_ticker = prediction_subset_with_gurufocus_data['ticker'].notna()
lacks_second_ticker = prediction_subset_with_gurufocus_data['second_match_ticker'].isna()
companies_with_one_ticker = prediction_subset_with_gurufocus_data[has_first_ticker & lacks_second_ticker]
print(len(companies_with_one_ticker))
print(companies_with_one_ticker.head(10))

In [None]:
# Reduce the single-ticker companies to (cik, ticker, gurufocus_stockid) rows;
# the download loop unpacks the rows in exactly this column order.
one_ticker_aggregations = {
    'ticker': ('ticker', 'last'),
    'gurufocus_stockid': ('gurufocus_stockid', 'last'),
}
one_ticker_rows_by_cik = companies_with_one_ticker.groupby('cik')
companies_with_one_ticker_grouped_by_cik = one_ticker_rows_by_cik.agg(**one_ticker_aggregations).reset_index()
print(len(companies_with_one_ticker_grouped_by_cik))
print(companies_with_one_ticker_grouped_by_cik.head(10))

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
from bs4 import BeautifulSoup
import json

# HTML ids of the financial tables read from each GuruFocus financials page,
# in the order in which they are concatenated. Every id shares one prefix.
table_ids = [
    f'financial_table_{section}'
    for section in (
        'per_share_data',
        'ratios',
        'income_statement',
        'balance_sheet',
        'cashflow_statement',
        'valuation_ratios',
        'valuation_and_quality',
    )
]

# Default Edge options; passed to every webdriver.Edge(...) session created below.
options = webdriver.EdgeOptions()

login_url = 'https://www.gurufocus.com/login'

# GuruFocus credentials are read from the environment so they never live in the
# notebook. Set GURUFOCUS_USERNAME and GURUFOCUS_PASSWORD before starting the
# kernel; if they are missing the values are empty strings and the login cells
# will fail. Any password that was ever committed here should be rotated.
payload = {
    "username": os.environ.get("GURUFOCUS_USERNAME", ""),
    "password": os.environ.get("GURUFOCUS_PASSWORD", ""),
}

def process_df(df):
    """Return a cleaned copy of one parsed financial table.

    The second column (position 1) is removed, as is every column whose name
    starts with 'Unnamed'. The input frame is left untouched.

    Args:
        df: DataFrame parsed from one financial table.

    Returns:
        A new DataFrame without the second column and without 'Unnamed*' columns.
    """
    # Select by position so that a duplicated label cannot remove more than
    # the one column at position 1, and so the caller's frame is not mutated.
    kept_positions = [position for position in range(df.shape[1]) if position != 1]
    trimmed = df.iloc[:, kept_positions]
    # astype(str) keeps the check working if some labels are not strings.
    is_unnamed = trimmed.columns.astype(str).str.contains('^Unnamed')
    return trimmed.loc[:, ~is_unnamed]

In [None]:
# Log in to GuruFocus, then download the financial tables of every company that
# has exactly one ticker. Per company: the raw page source is saved, the company
# name shown on the page is stored on `prediction_subset`, and the tables listed
# in `table_ids` are concatenated into one CSV.
driver = webdriver.Edge(options=options)
try:
    # login
    driver.get(login_url)

    username = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'username')))
    password = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))
    username.clear()
    password.clear()
    username.send_keys(payload['username'])
    password.send_keys(payload['password'])
    password.send_keys(Keys.RETURN)

    # The 'Articles' link is used as the signal that the post-login page has loaded.
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.LINK_TEXT, 'Articles')))
    time.sleep(1)

    # Each entry is [cik, stockid, ticker, error message].
    not_found_financial_data_with_exception = []
    not_found_financial_data_with_webdriver_timeout_exception = []
    success_count = 0

    pbar = tqdm(companies_with_one_ticker_grouped_by_cik.values.tolist())
    try:
        for cik, ticker, stockid in pbar:

            url = f'https://www.gurufocus.com/stock/{stockid}/financials'

            try:
                driver.get(url)
                time.sleep(0.7)

                try:
                    # Wait (up to 10 s each) until every expected table is in the DOM.
                    for table_id in table_ids:
                        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.ID, table_id)))
                except Exception as ex:
                    not_found_financial_data_with_webdriver_timeout_exception.append([cik, stockid, ticker, str(ex)])
                    continue

                page_source = driver.page_source
                with open(f'gurufocus-immediate-response-for-stockid/page_source-{cik}-{stockid}_{ticker}.txt', 'w', encoding='utf-8') as f:
                    f.write(page_source)

                soup = BeautifulSoup(page_source, 'html.parser')

                gurufocus_company_name = soup.find('div', {'id': 'stock-header'}).find('div').text

                prediction_subset.loc[prediction_subset['cik'] == cik, 'gurufocus-company-name'] = gurufocus_company_name

                page_tables_ids = [table.get('id') for table in soup.find_all('table') if table.get('id')]

                # Pages exposing none of the expected tables are skipped silently.
                if any(table_id in page_tables_ids for table_id in table_ids):
                    table_frames = []
                    for table_id in table_ids:
                        table = soup.find(id=table_id)
                        if table is None:
                            # Handled by the outer except: the company is logged as failed.
                            raise ValueError(f'table {table_id} is missing from the page')
                        # read_html expects a file-like object; literal HTML strings are deprecated.
                        df = pd.read_html(io.StringIO(str(table)), skiprows=1, header=0)[0]
                        table_frames.append(process_df(df))

                    merged_df = pd.concat(table_frames).reset_index(drop=True)
                    merged_df.to_csv(f'financial_data/{cik}-{stockid}_{ticker}.csv')

                    success_count += 1
                    pbar.set_description(f"Success: {success_count}")

            except Exception as ex:
                not_found_financial_data_with_exception.append([cik, stockid, ticker, str(ex)])

    finally:
        # A fresh timestamp keeps this cell independent of state left by earlier
        # cells and avoids overwriting the lookup result file.
        datetime_now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        prediction_subset.to_csv(f'ECL_with_ticker_{datetime_now}.csv')

        with open('not_found_financial_data_with_exception.json', 'w') as file:
            json.dump(not_found_financial_data_with_exception, file)

        with open('not_found_financial_data_with_webdriver_timeout_exception.json', 'w') as file:
            json.dump(not_found_financial_data_with_webdriver_timeout_exception, file)

finally:
    # quit() ends the whole WebDriver session; close() would only close the window.
    driver.quit()

In [4]:
# Log in to GuruFocus and download the financial tables of one hard-coded stock
# (PCG). Unlike the batch cells, a wait timeout is only logged and parsing is
# still attempted on whatever the page contains.
# NOTE(review): this cell writes to the same two JSON log files as the batch
# download cell and will overwrite them — confirm that is intended.
driver = webdriver.Edge(options=options)
try:
    # login
    driver.get(login_url)

    username = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'username')))
    password = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))
    username.clear()
    password.clear()
    username.send_keys(payload['username'])
    password.send_keys(payload['password'])
    password.send_keys(Keys.RETURN)

    # The 'Articles' link is used as the signal that the post-login page has loaded.
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.LINK_TEXT, 'Articles')))
    time.sleep(1)

    # Each entry is [cik, stockid, ticker, error message].
    not_found_financial_data_with_exception = []
    not_found_financial_data_with_webdriver_timeout_exception = []
    success_count = 0

    try:
        stockid = 'US08YY'
        ticker = 'PCG'
        cik = 75488
        # "exchange": "NYSE",

        url = f'https://www.gurufocus.com/stock/{stockid}/financials'

        try:
            driver.get(url)
            time.sleep(0.7)

            try:
                # Wait (up to 10 s each) until every expected table is in the DOM.
                for table_id in table_ids:
                    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.ID, table_id)))
            except Exception as ex:
                not_found_financial_data_with_webdriver_timeout_exception.append([cik, stockid, ticker, str(ex)])

            page_source = driver.page_source
            with open(f'gurufocus-immediate-response-for-stockid/page_source-{cik}-{stockid}_{ticker}.txt', 'w', encoding='utf-8') as f:
                f.write(page_source)

            soup = BeautifulSoup(page_source, 'html.parser')

            page_tables_ids = [table.get('id') for table in soup.find_all('table') if table.get('id')]

            # A page exposing none of the expected tables is skipped silently.
            if any(table_id in page_tables_ids for table_id in table_ids):
                table_frames = []
                for table_id in table_ids:
                    table = soup.find(id=table_id)
                    if table is None:
                        # Handled by the outer except: the stock is logged as failed.
                        raise ValueError(f'table {table_id} is missing from the page')
                    # read_html expects a file-like object; literal HTML strings are deprecated.
                    df = pd.read_html(io.StringIO(str(table)), skiprows=1, header=0)[0]
                    table_frames.append(process_df(df))

                merged_df = pd.concat(table_frames).reset_index(drop=True)
                merged_df.to_csv(f'final_financial_data/{cik}-{stockid}_{ticker}.csv')

                # No progress bar exists in this cell, so only the counter is updated;
                # touching a bar left over from another cell would log a successful
                # download as a failure.
                success_count += 1

        except Exception as ex:
            not_found_financial_data_with_exception.append([cik, stockid, ticker, str(ex)])

    finally:
        with open('not_found_financial_data_with_exception.json', 'w') as file:
            json.dump(not_found_financial_data_with_exception, file)

        with open('not_found_financial_data_with_webdriver_timeout_exception.json', 'w') as file:
            json.dump(not_found_financial_data_with_webdriver_timeout_exception, file)

finally:
    # quit() ends the whole WebDriver session; close() would only close the window.
    driver.quit()

NameError: name 'prediction_subset' is not defined

In [None]:
# Report how many downloads failed, split by failure kind.
print(
    f"Number of companies with not found financial data with exception: {len(not_found_financial_data_with_exception)}",
    f"Number of companies with not found financial data with webdriver timeout exception: {len(not_found_financial_data_with_webdriver_timeout_exception)}",
    sep="\n",
)

#### Get companies with two tickers

In [None]:
# Companies for which the search produced a second matching ticker.
has_second_ticker = prediction_subset_with_gurufocus_data['second_match_ticker'].notna()
companies_with_two_tickers = prediction_subset_with_gurufocus_data[has_second_ticker]
print(f"Number of companies with two tickers: {len(companies_with_two_tickers)}")
print(companies_with_two_tickers.head(10))

In [None]:
# Flatten the two-ticker companies into one row per (company, ticker) pair.
# `second_match` tells which of the two search matches a row came from.
# assign()/rename() return new frames, so nothing is written into a slice of
# `companies_with_two_tickers` (which triggers SettingWithCopyWarning).
companies_first_tickers = companies_with_two_tickers[
    ['cik', 'company', 'ticker', 'gurufocus_stockid']
].assign(second_match=False)

companies_second_tickers = (
    companies_with_two_tickers[['cik', 'company', 'second_match_ticker', 'second_match_gurufocus_stockid']]
    .rename(columns={
        'second_match_ticker': 'ticker',
        'second_match_gurufocus_stockid': 'gurufocus_stockid',
    })
    .assign(second_match=True)
)

companies_with_two_tickers_flatten = pd.concat([companies_first_tickers, companies_second_tickers])

print(len(companies_with_two_tickers_flatten))
print(companies_with_two_tickers_flatten.head(10))

In [None]:
# Order the flattened rows by ticker and show the last ten of them.
companies_with_two_tickers_flatten = companies_with_two_tickers_flatten.sort_values('ticker')
print(companies_with_two_tickers_flatten.iloc[-10:])

In [None]:
# Load the timeout log of an earlier two-ticker run; its entries are
# [cik, stockid, ticker, error message] lists. Name the columns and keep only
# the identifying ones so the rows can be retried.
not_found_financial_data_with_webdriver_timeout_exception_df = pd.read_json('not_found_financial_data_with_webdriver_timeout_exception_two_ticker_case_2024-04-16_22-22-08.json')
not_found_financial_data_with_webdriver_timeout_exception_df.columns = ['cik', 'gurufocus_stockid', 'ticker', 'exception']
not_found_financial_data_with_webdriver_timeout_exception_df = not_found_financial_data_with_webdriver_timeout_exception_df[
    ['cik', 'gurufocus_stockid', 'ticker']
]
print(not_found_financial_data_with_webdriver_timeout_exception_df.head())

In [None]:
# Reload the flattened two-ticker table saved by an earlier run (hard-coded
# timestamp) and preview its first ten rows.
companies_with_two_tickers_flatten = pd.read_csv('companies_with_two_tickers_flatten_2024-04-16_22-22-08.csv', index_col=0)
print(companies_with_two_tickers_flatten.head(10))

In [None]:
# Log in to GuruFocus, then retry the two-ticker downloads that previously
# timed out. Per row: the raw page source is saved, the company name shown on
# the page is stored on `companies_with_two_tickers_flatten`, and the tables
# listed in `table_ids` are concatenated into one CSV.
driver = webdriver.Edge(options=options)
try:
    # login
    driver.get(login_url)

    username = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'username')))
    password = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))
    username.clear()
    password.clear()
    username.send_keys(payload['username'])
    password.send_keys(payload['password'])
    password.send_keys(Keys.RETURN)

    # The 'Articles' link is used as the signal that the post-login page has loaded.
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.LINK_TEXT, 'Articles')))
    time.sleep(1)

    # Each entry is [cik, stockid, ticker, error message].
    not_found_financial_data_with_exception = []
    not_found_financial_data_with_webdriver_timeout_exception = []
    success_count = 0

    pbar = tqdm(not_found_financial_data_with_webdriver_timeout_exception_df.values.tolist())
    try:
        for cik, stockid, ticker in pbar:

            url = f'https://www.gurufocus.com/stock/{stockid}/financials'

            try:
                driver.get(url)
                time.sleep(0.7)

                try:
                    # Wait (up to 10 s each) until every expected table is in the DOM.
                    for table_id in table_ids:
                        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.ID, table_id)))
                except Exception as ex:
                    not_found_financial_data_with_webdriver_timeout_exception.append([cik, stockid, ticker, str(ex)])
                    continue

                page_source = driver.page_source
                with open(f'gurufocus-immediate-response-for-stockid-two-tickers-case/page_source-{cik}-{stockid}_{ticker}.txt', 'w', encoding='utf-8') as f:
                    f.write(page_source)

                soup = BeautifulSoup(page_source, 'html.parser')

                gurufocus_company_name = soup.find('div', {'id': 'stock-header'}).find('div').text

                # A CIK appears once per ticker here, so the stock id is needed to pick the row.
                mask = (companies_with_two_tickers_flatten['cik'] == cik) & (companies_with_two_tickers_flatten['gurufocus_stockid'] == stockid)
                companies_with_two_tickers_flatten.loc[mask, 'gurufocus-company-name'] = gurufocus_company_name

                page_tables_ids = [table.get('id') for table in soup.find_all('table') if table.get('id')]

                # Pages exposing none of the expected tables are skipped silently.
                if any(table_id in page_tables_ids for table_id in table_ids):
                    table_frames = []
                    for table_id in table_ids:
                        table = soup.find(id=table_id)
                        if table is None:
                            # Handled by the outer except: the row is logged as failed.
                            raise ValueError(f'table {table_id} is missing from the page')
                        # read_html expects a file-like object; literal HTML strings are deprecated.
                        df = pd.read_html(io.StringIO(str(table)), skiprows=1, header=0)[0]
                        table_frames.append(process_df(df))

                    merged_df = pd.concat(table_frames).reset_index(drop=True)
                    merged_df.to_csv(f'financial_data_two_tickers_case/{cik}-{stockid}_{ticker}.csv')

                    success_count += 1
                    pbar.set_description(f"Success: {success_count}")

            except Exception as ex:
                not_found_financial_data_with_exception.append([cik, stockid, ticker, str(ex)])

    finally:
        # Results are written even if the loop is interrupted; every file of
        # this run shares one timestamp suffix.
        datetime_now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        companies_with_two_tickers_flatten.to_csv(f'companies_with_two_tickers_flatten_{datetime_now}.csv')

        with open(f'not_found_financial_data_with_exception_two_ticker_case_{datetime_now}.json', 'w') as file:
            json.dump(not_found_financial_data_with_exception, file)

        with open(f'not_found_financial_data_with_webdriver_timeout_exception_two_ticker_case_{datetime_now}.json', 'w') as file:
            json.dump(not_found_financial_data_with_webdriver_timeout_exception, file)

finally:
    # quit() ends the whole WebDriver session; close() would only close the window.
    driver.quit()

In [None]:
# Report how many retried downloads failed, split by failure kind.
print(
    f"Number of companies with not found financial data with exception: {len(not_found_financial_data_with_exception)}",
    f"Number of companies with not found financial data with webdriver timeout exception: {len(not_found_financial_data_with_webdriver_timeout_exception)}",
    sep="\n",
)