# Sample Source Code

## Pandas HTML + BeautifulSoup

Roughly 600+ web requests (one per listing page).

In [None]:
!pip install requests-random-user-agent

In [None]:
# Display how many CPUs the runtime reports.
import multiprocessing

n_logical_cpus = multiprocessing.cpu_count()
n_logical_cpus

In [6]:

import requests
# Never referenced by name; presumably it changes the default User-Agent of
# `requests` as an import side effect -- the check below is how that is verified.
import requests_random_user_agent

# Session-based alternative (kept for reference):
#s = requests.Session()
#print(s.headers['User-Agent'])

# Without a session: ask httpbin to echo back the User-Agent header that was sent.
# A timeout is required, otherwise an unresponsive server blocks the cell forever.
resp = requests.get('https://httpbin.org/user-agent', timeout=10)
resp.raise_for_status()  # fail loudly instead of decoding an error page as JSON
print(resp.json()['user-agent'])

Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36


In [7]:
import concurrent.futures
import requests
import requests_random_user_agent
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

def download_sourceCode_url(url):
  """Scrape one listing page into a DataFrame with an extra 'Meta_Url' column.

  The first HTML table that pandas finds on the page is parsed (its first
  column becomes the index). For every data row, the link inside the first
  cell is prefixed with the site root and stored in 'Meta_Url'.

  Parameters
  ----------
  url : str
      Address of the listing page to download.

  Returns
  -------
  pandas.DataFrame
      The parsed table plus the 'Meta_Url' column. An empty DataFrame is
      returned when the server answers with an HTTP error status (4xx/5xx).

  Raises
  ------
  requests.RequestException
      On connection problems or when the server does not answer in time.
  ValueError
      When the page has no table, or the number of table rows found does not
      match the number of rows pandas parsed.
  """
  from io import StringIO  # local import keeps the function self-contained

  rq = requests.get(url, timeout=30)

  # The previous bare `exit` was a no-op expression, so 403/404 pages fell
  # through to the parser. Skip every error status explicitly instead.
  if not rq.ok:
    return pd.DataFrame()

  main_data = rq.text

  # dataset from table (wrapped in StringIO: passing literal HTML is deprecated)
  df_tempSourceCode = pd.read_html(StringIO(main_data), index_col=0)[0]

  main_soup = BeautifulSoup(main_data, 'html.parser')
  # Skip the header row and keep exactly as many rows as pandas parsed,
  # instead of relying on a hard-coded upper bound.
  data_rows = main_soup.find_all('tr')[1:len(df_tempSourceCode) + 1]

  # Head url for meta_url
  head_Url = 'https://www.programmableweb.com'

  list_urlSourceCode = []
  for row in data_rows:
    first_cell = row.find('td')
    link = first_cell.find('a', href=True) if first_cell is not None else None
    # Read the attribute from the parsed tag rather than slicing serialized
    # HTML: this survives extra attributes and does not keep `&amp;` escapes.
    # A row without a link yields the bare site root, as before.
    list_urlSourceCode.append(head_Url + (link['href'] if link is not None else ''))

  df_tempSourceCode['Meta_Url'] = list_urlSourceCode

  return df_tempSourceCode


def download_sourceCode_bulk_url(story_urls, max_workers=16):
  """Download many listing pages concurrently and stack them into one DataFrame.

  Every URL is handed to `download_sourceCode_url` on a single shared thread
  pool. The previous version opened a new pool for each chunk of URLs and
  waited for it to finish before starting the next chunk, so the effective
  concurrency was the chunk size rather than the pool size. It also relied on
  `DataFrame.append`, which was removed in pandas 2.0.

  Parameters
  ----------
  story_urls : iterable of str
      Listing-page URLs to download.
  max_workers : int, optional
      Upper bound on the number of concurrent downloads (default 16).

  Returns
  -------
  pandas.DataFrame
      The per-page frames concatenated in the order of `story_urls`, each
      keeping its own index. Empty when there are no URLs or when no page
      produced any rows.

  Raises
  ------
  Exception
      Any exception raised while downloading a page is re-raised here.
  """
  urls = list(story_urls)
  if not urls:
    return pd.DataFrame()

  n_workers = max(1, min(max_workers, len(urls)))
  with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
    # `map` yields results in input order and re-raises worker exceptions
    frames = list(executor.map(download_sourceCode_url, urls))

  # Pages that were skipped come back as empty frames; leave them out
  frames = [frame for frame in frames if not frame.empty]
  if not frames:
    return pd.DataFrame()

  return pd.concat(frames)


In [None]:

# Listing pages to scrape. An earlier note puts the real page count at around
# 615; TODO: parameterize this and check that each page actually contains data.
N_LISTING_PAGES = 200
LISTING_URL = 'https://www.programmableweb.com/category/all/sample-source-code?page='

sourceCode_urls = [LISTING_URL + str(page) for page in range(N_LISTING_PAGES)]

# No placeholder DataFrame beforehand: if the download fails, the name must not
# silently point at an empty frame.
df_sourceCode = download_sourceCode_bulk_url(sourceCode_urls)
df_sourceCode

In [None]:
# Show every row that has at least one exact duplicate (all copies are listed).
is_repeated = df_sourceCode.duplicated(keep=False)
df_sourceCode[is_repeated]

## Meta URL Processing

Roughly 15k+ web requests (one per `Meta_Url`) — consider splitting the work into batches.

**TODO:** parallelize these requests with threads.


In [None]:
# Add three blank text columns to the frame
for new_column in ('Source Code', 'Repository', 'Languages'):
    df_sourceCode[new_column] = ""

In [None]:
# Visit the Meta_Url of every row and copy its description and spec fields
# into the DataFrame.

# Keyword searched for (case-insensitively) inside a spec label -> column it fills.
spec_keyword_to_column = {
    "repository": 'Repository',
    "link to source code": 'Source Code',
    "categories": 'Category',
    "languages": 'Languages',
}

# Cells are addressed by position. At this point the frame is indexed by the
# first column of the scraped table, so `df[col][i]` with an integer `i` depends
# on a deprecated positional fallback, and chained assignment may write to a
# temporary copy instead of the frame.
column_position = {
    name: df_sourceCode.columns.get_loc(name)
    for name in ['Meta_Url', 'Description', *spec_keyword_to_column.values()]
}

for i in range(len(df_sourceCode)):

    meta_url = df_sourceCode.iat[i, column_position['Meta_Url']]

    try:
        rq = requests.get(meta_url, timeout=30)
    except requests.RequestException as error:
        # One unreachable page should not abort thousands of requests
        print("Skipping row", i, meta_url, error)
        continue

    # Skip every HTTP error status, not only 403/404
    if not rq.ok:
        continue

    meta_soup = BeautifulSoup(rq.text, 'html.parser')

    # Update Description from the meta url; the existing value is kept when the
    # page has no description block (previously it was replaced by '').
    description_div = meta_soup.find('div', class_='tabs-header_description')
    if description_div is not None:
        meta_description = str(description_div).partition('">')[2].partition('</')[0]
        df_sourceCode.iat[i, column_position['Description']] = meta_description

    meta_specs = meta_soup.find('div', class_='section specs')
    if meta_specs is None:
        continue

    for lab in meta_specs.select("label"):
        value_node = lab.find_next_sibling()
        if value_node is None:
            continue

        label_text = lab.text.lower()
        # Every keyword is checked independently, as in the original chain of ifs
        for keyword, column in spec_keyword_to_column.items():
            if keyword in label_text:
                df_sourceCode.iat[i, column_position[column]] = value_node.text


In [None]:
# Re-scraping only the listing pages (600+ requests) and comparing Meta_Url values
# is a cheaper way to detect updates on the source site than repeating all 15k+ requests.
# Move the current index into a regular column, then take an independent copy
# so the original frame (Meta_Url included) stays available for such comparisons.
df_sourceCode.reset_index(drop=False, inplace=True)
df_export_sourceCode = df_sourceCode.copy(deep=True)

In [None]:
# Remove Meta_Url from the analysis copy (open question: is this column needed for the analysis?)
df_export_sourceCode.drop(columns=['Meta_Url'], inplace=True)
df_export_sourceCode

## Export

In [None]:
from datetime import datetime
from pathlib import Path

# One date stamp shared by all four files, so a run that straddles midnight
# cannot produce mismatched names. It is stored under its own name instead of
# rebinding `datetime`, which previously shadowed the imported class.
export_stamp = datetime.now().strftime('%d_%m_%Y')

# pandas does not create missing directories, so make sure the target exists
export_dir = Path(r'/content/DataFrame')
export_dir.mkdir(parents=True, exist_ok=True)

## Export the data for analysis
# To CSV (index True 0,1,2...)
df_export_sourceCode.to_csv(export_dir / ('sample_source_code_' + export_stamp + '.csv'), index = True, header = True)

# To JSON (columns format index True 0,1,2...)
df_export_sourceCode.to_json(export_dir / ('sample_source_code_' + export_stamp + '.json'))


## Export the original + Meta_Url

# To CSV (index True 0,1,2...)
df_sourceCode.to_csv(export_dir / ('original_sample_source_code_' + export_stamp + '.csv'), index = True, header = True)

# To JSON (columns format index True 0,1,2...)
df_sourceCode.to_json(export_dir / ('original_sample_source_code_' + export_stamp + '.json'))