In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pdb
import time
import re

%matplotlib inline

from urllib.request import urlopen
from bs4 import BeautifulSoup
from requests import get

from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse as parse_date
import pdb
import pytz
utc=pytz.UTC

# ACTION PLAN

1)  START WITH THE REGION LINK.  Every regional capital has its own base results URL.  Examples:
https://www.tusclasesparticulares.com/profesores-ingles/madrid-capital.aspx
https://www.tusclasesparticulares.com/profesores-ingles/bilbao

2) GET REGION URLS.  Starting from the base URL, use "get_region_urls" to get the URLs of all results pages for that specific region.

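3) GET PROFILE URLS FROM THE RESULTS PAGE AT EACH REGION URL.  Use "get_profile_urls".  Input is the URL of one results page.  Output is a list of dictionaries, one per ad on the page, each holding the profile URL, the results page URL, the page rank, the ad status and a has-picture boolean.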

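4) SCRAPE INDIVIDUAL PROFILE URLS.  Input is one dictionary per profile (profile URL + the extra information from the results page).  Each profile is scraped into a dictionary, and the list of those dictionaries is turned into one dataframe per region.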



In [2]:
def scrape_region(base_region_url, region):
    """Scrape every profile listed for one region and return them as a DataFrame.

    Parameters
    ----------
    base_region_url : str
        Base results URL of the region; it is handed to ``get_region_urls``.
        NOTE(review): ``get_region_urls`` is not defined in the visible part of
        this notebook -- presumably it returns the URLs of all results pages;
        confirm before relying on this function.
    region : str
        Label written to the ``base_region`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per dictionary returned by ``profile_extraction``, plus the
        columns ``date_scraped`` (UTC timestamp taken once, after scraping)
        and ``base_region``.
    """
    #2) GET REGION URLS: list of results-page URLs for the region
    region_urls = get_region_urls(base_region_url)

    #3) GET PROFILE URLS FROM RESULTS PAGE ON EACH REGION URL.
    #one flat list of profile dictionaries across ALL results pages of the region
    profile_dicts = []
    for results_page in region_urls:
        #get_profile_urls takes the already-fetched response as second argument,
        #so the page is requested here and passed along
        profile_dicts.extend(get_profile_urls(results_page, get(results_page)))

    #4) SCRAPE INDIVIDUAL PROFILE URLS.
    #input is the list of ALL profile dicts for the region, output is one dataframe
    print('length of profile_dicts:')
    print(len(profile_dicts))
    list_of_scrapped_profiles = []
    for profile_dictionary in profile_dicts:
        print('scraping this profile URL:')
        print(profile_dictionary['profile_url'])
        print('from this url results page:')
        print(profile_dictionary['results_page_url'])
        print('---------------')

        #scrape profile URL and collect the dictionary with ALL info for the profile
        list_of_scrapped_profiles.append(profile_extraction(profile_dictionary))

    #turn list of dicts into dataframe, add scrape date and region label
    df = pd.DataFrame(list_of_scrapped_profiles)
    df['date_scraped'] = datetime.now(pytz.utc)
    df['base_region'] = region
    return df

In [3]:
#INPUT - list of dicts, one per profile (profile URL, results page URL, page rank, results ad status, picture flag)
#MIDDLE - calls profile_extraction on each dict and appends the scraped dictionary to a list
#RETURN - dataframe with one row per scraped profile

#example of one input dict:
# {'profile_url': 'https://www.tusclasesparticulares.com/profesores/vizcaya/clases-ingles-ninos-adultos-bilbao-1932040',
#   'results_page_url': 'https://www.tusclasesparticulares.com/profesores-ingles/bilbao?pagina=8',
#   'page_rank': 1,
#   'results_ad_status': 'trr aitem int',
#   'has_picture': True}

def scrape_profiles_by_region(profile_dicts):
    """Scrape each profile in ``profile_dicts`` and return the results as a DataFrame.

    Parameters
    ----------
    profile_dicts : list of dict
        One dictionary per profile; each must contain at least the keys
        ``'profile_url'`` and ``'results_page_url'`` (both are printed here)
        and is passed unchanged to ``profile_extraction``.

    Returns
    -------
    pandas.DataFrame
        One row per dictionary returned by ``profile_extraction``, plus a
        ``date_scraped`` column holding a single UTC timestamp taken after
        all profiles were scraped.
    """
    list_of_scrapped_profiles = []
    for profile_dictionary in profile_dicts:
        #log before scraping so the offending URL is visible if the scrape raises
        print('scraping this profile URL:')
        print(profile_dictionary['profile_url'])
        print('from this url results page:')
        print(profile_dictionary['results_page_url'])
        print('---------------')
        list_of_scrapped_profiles.append(profile_extraction(profile_dictionary))

    df = pd.DataFrame(list_of_scrapped_profiles)
    df['date_scraped'] = datetime.now(pytz.utc)
    #return the frame that carries date_scraped (a fresh frame would drop it)
    return df

In [4]:
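#INPUT - URL of a single results page for a region (region_url) and the response already fetched for that page (html)
    #EXAMPLE- region_url = 'https://www.tusclasesparticulares.com/profesores-ingles/bilbao?pagina=8'
#RETURN - list of dicts, one per ad on the page (profile URL, results page URL, page rank, results ad status, picture flag, response)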

def get_profile_urls(region_url, html=None):
    """Collect one dictionary per ad found on a single results page.

    Parameters
    ----------
    region_url : str
        URL of the results page; stored in every returned dictionary as
        ``'results_page_url'``.
    html : response object, optional
        Response already fetched for ``region_url``; only its ``.text`` is
        read here. When omitted, the page is fetched with ``get(region_url)``.

    Returns
    -------
    list of dict
        One dictionary per ``div`` whose class matches ``^trr aitem``, in page
        order, with the keys ``'profile_url'`` (``'EMPTY'`` when no link could
        be found), ``'results_page_url'``, ``'page_rank'`` (1-based position
        on the page), ``'results_ad_status'`` (the div's classes joined by a
        space), ``'has_picture'`` and ``'html'`` (the response object).
    """
    if html is None:
        html = get(region_url)
    #pause between pages to keep the request rate down
    time.sleep(1)
    soup = BeautifulSoup(html.text, 'html.parser')

    base_url = 'https://www.tusclasesparticulares.com'
    landing_page_dict_list = []
    ad_elements = soup.find_all('div', {'class' : re.compile(r'^trr aitem')})
    for page_rank, element in enumerate(ad_elements, start=1):
        results_ad_status = ' '.join(element['class'])

        #NOTE(review): the ad counts as having a picture when the
        #'spr_parrilla fotoparr' span is ABSENT -- presumably that span is the
        #placeholder shown for ads without a photo; confirm against the site.
        has_picture = element.find('span', {'class' : 'spr_parrilla fotoparr'}) is None

        #reset for every ad so a missing link can never reuse the previous ad's URL
        profile_url = 'EMPTY'
        link = element.find('div', {'class' : 'rightcontent'})
        if link is not None and link.a is not None:
            scrapped_url = link.a.get("href")
            if scrapped_url:
                profile_url = base_url + scrapped_url

        profe_dict = {
            'profile_url' : profile_url,
            'results_page_url' : region_url,
            'page_rank' : page_rank,
            'results_ad_status' : results_ad_status,
            'has_picture' : has_picture,
            'html' : html}
        print('PROFILE DICT JUST SCRAPED FROM RESULTS PAGE:')
        print(profe_dict)
        landing_page_dict_list.append(profe_dict)
    return landing_page_dict_list

In [10]:
#Extract data from each profile URL
import socket


def url_open_for_profile_extraction(url, return_dict):
    """Open ``url`` and hand the response back through ``return_dict``.

    The response returned by ``urlopen`` is stored under the key ``'html'`` so
    the caller can look it up by name. Nothing is returned.

    NOTE(review): the result used to be stored under the response object
    itself as key, which a caller cannot look up; ``'html'`` is presumably
    what was intended -- confirm if any caller reads ``return_dict``.
    """
    return_dict['html'] = urlopen(url)

def _last_match_value(soup, tag, attrs, transform, default='EMPTY'):
    """Return ``transform(item)`` for the last ``tag`` element matching ``attrs``.

    Every match is transformed in document order and the last result wins;
    ``default`` is returned when nothing matches.
    """
    value = default
    for item in soup.find_all(tag, attrs):
        value = transform(item)
    return value


def profile_extraction(profile_dictionary):
    """Scrape one profile page and return its fields as a flat dictionary.

    The page is first opened with ``urlopen`` and parsed with the ``lxml``
    parser. If that times out, or the parsed page contains no
    ``div#detsubheader``, the page is fetched again with ``get`` and parsed
    with ``html.parser``; ``'alternative_scraping'`` records which path
    produced the data.

    Parameters
    ----------
    profile_dictionary : dict
        Must contain ``'profile_url'``, ``'page_rank'``,
        ``'results_ad_status'``, ``'has_picture'`` and ``'results_page_url'``;
        all but the first are copied unchanged into the result.

    Returns
    -------
    dict
        Scraped fields; a field whose element is missing from the page is set
        to ``'EMPTY'`` (``'COULD NOT SCRAPE TITLE'`` for the title, ``0`` for
        the rating count). ``'html'`` holds the response object that was
        parsed last.

    Raises
    ------
    Exceptions raised by ``urlopen`` other than ``TimeoutError`` /
    ``socket.timeout`` are not handled here and propagate to the caller, as do
    exceptions raised by the fallback ``get`` request.
    """
    url = profile_dictionary['profile_url']

    html = None
    soup = None
    alternative_scraping = False
    try:
        html = urlopen(url)
    except (TimeoutError, socket.timeout):
        print('TimeoutError or socket.timeout')
    time.sleep(1)
    if html is not None:
        try:
            #the response body is read while parsing, so a timeout can surface here too
            soup = BeautifulSoup(html, 'lxml')
        except (TimeoutError, socket.timeout):
            print('timeout: The read operation timed out')

    #TEACHER NAME and RATING COUNT
    name_elements = [] if soup is None else soup.find_all('div', {'id' : 'detsubheader'})
    if len(name_elements) == 0:
        #use alternative scraping method for everything:
        #overwrite soup with the get(url) method
        alternative_scraping = True
        html = get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        name_elements = soup.find_all('div', {'id' : 'detsubheader'})

    #defaults used when the header block is missing even after the fallback
    teacher_link = None
    found_rating = None
    is_academy = False
    for item in name_elements:
        teacher_link = item.find('a', {'id' : 'alnkname'})
        found_rating = item.find('span', {'itemprop' : 'ratingCount'})
        #presence of this tag means they are an academy
        is_academy = item.find('b', {'id' : 'bhome'}) is not None

    if teacher_link is not None:
        #a link was found as the teacher's name: grab it and note they have a profile page
        teacher_name = teacher_link.text.strip()
        teacher_has_profile_page = True
    else:
        #teacher name is NOT a link (teacher has no profile page)
        name_element = soup.find('div', {'id' : 'detsubheader'})
        try:
            teacher_name = name_element.text.strip()
        except AttributeError:
            teacher_name = 'AttributeError: NoneType object has no attribute text'
        teacher_has_profile_page = False

    rating_count = found_rating.text if found_rating is not None else 0

    #TITLE
    title_elements = soup.find_all('div', {'class' : 'detinfotit'})
    if len(title_elements) == 0:
        ad_title = 'COULD NOT SCRAPE TITLE'
    else:
        ad_title = title_elements[0].text.strip()

    #TEACHING SUBJECT (example: Inglés) -- third whitespace-separated token
    teaching_subject = _last_match_value(
        soup, 'p', {'id' : 'pClasesde'},
        lambda item: item.text.replace("\n","").replace("\r","").split()[2])

    #GEO LOCATION -- everything after the first token
    province = _last_match_value(
        soup, 'p', {'id' : 'pProvincia'},
        lambda item: ' '.join(item.text.split()[1:]))

    #CLASS LEVEL -- everything after the first three tokens
    class_level_para = _last_match_value(
        soup, 'div', {'id' : 'dvPara'},
        lambda item: ' '.join(item.text.split()[3:]))

    #CLASS LEVEL SECOND METHOD -- everything after the first token
    class_level_niveles = _last_match_value(
        soup, 'div', {'id' : 'dvNiveles'},
        lambda item: ' '.join(item.text.split()[1:]))

    #METHOD -- text after the first carriage return
    method = _last_match_value(
        soup, 'div', {'id' : 'dvMetodos'},
        lambda item: item.text.split('\r')[1].strip())

    #PRICE
    price = _last_match_value(
        soup, 'div', {'id' : 'dvPrecio'},
        lambda item: item.text.replace("\n","").replace("\r",""))

    #DESCRIPTION
    description = _last_match_value(
        soup, 'div', {'class' : 'detcntsection c5'},
        lambda item: item.text.strip())

    #PROFE AD STATUS - BASIC, VERIFIED, PLUS
    status_element = soup.find('p', {'class' : 'mgbottom5 fs16 bold'})
    profe_ad_status = 'EMPTY' if status_element is None else status_element.text

    #CREATE DICT WITH ALL ENTRIES AND RETURN DICTIONARY
    scraped_profile={
        'url': url,
        'ad_title' : ad_title,
        'teaching_subject' : teaching_subject,
        'province' : province,
        'class_level_para' : class_level_para,
        'class_level_niveles' : class_level_niveles,
        'method' : method,
        'price' : price,
        'teacher_name' : teacher_name,
        'rating_count' : rating_count,
        'profe_ad_status' : profe_ad_status,
        'description' : description,
        #add in data passed from results page
        'page_rank' : profile_dictionary['page_rank'],
        'results_ad_status' : profile_dictionary['results_ad_status'],
        'has_picture' : profile_dictionary['has_picture'],
        'results_page_url' : profile_dictionary['results_page_url'],
        'alternative_scraping' : alternative_scraping,
        'teacher_has_profile_page' : teacher_has_profile_page,
        'is_academy' : is_academy,
        'html' : html
       }
    print('RETURNING PROFILE DATA')
    return scraped_profile



    

In [None]:
#SCRAPING AUTONOMOUS CAPITALS ONE BY ONE
import socket

#reference point for the elapsed-time report printed after each region
start_time = time.time()

#region name -> base results URL for that region.
#The region name is later used as the label in the 'base_region' column and as
#the name of the per-region DataFrame variable, so keys must be valid identifiers.
#Trailing "#n" comments give the zero-based position of the entry, which is
#what region_start refers to when restarting the scrape part-way through.
region_lookup = {'soria' : 'https://www.tusclasesparticulares.com/profesores-ingles/soria.aspx',
                 'teruel': 'https://www.tusclasesparticulares.com/profesores-ingles/Teruel.aspx',
                 'segovia' : 'https://www.tusclasesparticulares.com/profesores-ingles/segovia.aspx',
                 'huesca' : 'https://www.tusclasesparticulares.com/profesores-ingles/huesca.aspx',
                 'cuenca' : 'https://www.tusclasesparticulares.com/profesores-ingles/cuenca.aspx',
                 'avila' : 'https://www.tusclasesparticulares.com/profesores-ingles/avila.aspx',
                 'merida' : 'https://www.tusclasesparticulares.com/profesores-ingles/Merida.aspx',
                 'zamora' : 'https://www.tusclasesparticulares.com/profesores-ingles/zamora.aspx',

                 'ciudad_real' : 'https://www.tusclasesparticulares.com/profesores-ingles/ciudad-real.aspx',
                 'palencia' : 'https://www.tusclasesparticulares.com/profesores-ingles/palencia.aspx',
                 'pontevedra' : 'https://www.tusclasesparticulares.com/profesores-ingles/pontevedra.aspx',

                 'toledo' : 'https://www.tusclasesparticulares.com/profesores-ingles/toledo.aspx',
                 'guadalajara' : 'https://www.tusclasesparticulares.com/profesores-ingles/guadalajara.aspx',
                 'ceuta' : 'https://www.tusclasesparticulares.com/profesores-ingles/ceuta.aspx',#13

                 'melilla' : 'https://www.tusclasesparticulares.com/profesores-ingles/Melilla.aspx',#14
                 'caceres' : 'https://www.tusclasesparticulares.com/profesores-ingles/caceres.aspx',
                 'santiago_de_compostela' : 'https://www.tusclasesparticulares.com/profesores-ingles/santiago-de-compostela.aspx', #16
                 'lugo' : 'https://www.tusclasesparticulares.com/profesores-ingles/lugo.aspx',
                 'gerona' : 'https://www.tusclasesparticulares.com/profesores-ingles/girona.aspx',
                 'orense' : 'https://www.tusclasesparticulares.com/profesores-ingles/orense.aspx',
                 'jaen' : 'https://www.tusclasesparticulares.com/profesores-ingles/jaen.aspx',

                 'cadiz' : 'https://www.tusclasesparticulares.com/profesores-ingles/cadiz.aspx',#21
                 'leon' : 'https://www.tusclasesparticulares.com/profesores-ingles/leon.aspx',
                 'tarragona' : 'https://www.tusclasesparticulares.com/profesores-ingles/tarragona.aspx',
                 'lerida' : 'https://www.tusclasesparticulares.com/profesores-ingles/lerida.aspx', #24
                 'salamanca' : 'https://www.tusclasesparticulares.com/profesores-ingles/salamanca.aspx', #25
                 'huelva' : 'https://www.tusclasesparticulares.com/profesores-ingles/huelva.aspx',

                 'badajoz' : 'https://www.tusclasesparticulares.com/profesores-ingles/badajoz.aspx', #27
                 'logrono' : 'https://www.tusclasesparticulares.com/profesores-ingles/logrono.aspx',
                 'castellon_de_la_plana' : 'https://www.tusclasesparticulares.com/profesores-ingles/castellon-de-la-plana.aspx',

                 'santander' : 'https://www.tusclasesparticulares.com/profesores-ingles/santander.aspx', #30
                 'albacete' : 'https://www.tusclasesparticulares.com/profesores-ingles/albacete.aspx',#31
                 'burgos' : 'https://www.tusclasesparticulares.com/profesores-ingles/burgos.aspx',

                 'san_sebastian' : 'https://www.tusclasesparticulares.com/profesores-ingles/san-sebastian.aspx', #33
                 'almeria' : 'https://www.tusclasesparticulares.com/profesores-ingles/almeria.aspx',
                 'pamplona' : 'https://www.tusclasesparticulares.com/profesores-ingles/pamplona.aspx',

                 'tenerife' : 'https://www.tusclasesparticulares.com/profesores-ingles/santa-cruz-de-tenerife.aspx', #36
                 'oviedo' : 'https://www.tusclasesparticulares.com/profesores-ingles/oviedo.aspx',
                 'granada' : 'https://www.tusclasesparticulares.com/profesores-ingles/granada.aspx', #38
                 'la_coruna' : 'https://www.tusclasesparticulares.com/profesores-ingles/a-coruna.aspx', #39
                 'vitoria' : 'https://www.tusclasesparticulares.com/profesores-ingles/vitoria.aspx',
                 'valladolid' : 'https://www.tusclasesparticulares.com/profesores-ingles/valladolid.aspx',
                 'cordoba' : 'https://www.tusclasesparticulares.com/profesores-ingles/cordoba.aspx', #42

                 'alicante' : 'https://www.tusclasesparticulares.com/profesores-ingles/alicante.aspx', #43
                 'bilbao' : 'https://www.tusclasesparticulares.com/profesores-ingles/bilbao.aspx',
                 'las_palmas_de_gran_canaria' : 'https://www.tusclasesparticulares.com/profesores-ingles/las-palmas-de-gran-canaria.aspx',
                 'palma' : 'https://www.tusclasesparticulares.com/profesores-ingles/palma-de-mallorca.aspx',
                 'murcia' : 'https://www.tusclasesparticulares.com/profesores-ingles/murcia.aspx',

                 'malaga' : 'https://www.tusclasesparticulares.com/profesores-ingles/malaga.aspx', #48
                 'zaragoza' : 'https://www.tusclasesparticulares.com/profesores-ingles/zaragoza.aspx', #49
                 'sevilla' : 'https://www.tusclasesparticulares.com/profesores-ingles/sevilla.aspx',
                 'valencia' : 'https://www.tusclasesparticulares.com/profesores-ingles/valencia.aspx', #51
                 'barcelona' : 'https://www.tusclasesparticulares.com/profesores-ingles/barcelona.aspx',
                 'madrid' : 'https://www.tusclasesparticulares.com/profesores-ingles/madrid.aspx',
                }

from urllib.error import HTTPError

#names of the regions whose DataFrame has been built; kept across re-runs of
#this cell so an interrupted scrape does not lose track of finished regions
try:
    df_created_list
except NameError:
    df_created_list = []

end_loop = len(region_lookup)
#specify region to start in: 0 = first in list (soria)
region_start = 0

#build the ordered name list once instead of on every iteration
region_names = list(region_lookup.keys())

for i in range(region_start, end_loop):
    print('i is: ' + str(i))

    region_name = region_names[i]
    base_region_url = region_lookup[region_name]
    print('region_name is ' + region_name)
    print('base_region_url is ' + base_region_url)

    #---- results pages: collect one dict per ad, page after page ----
    results_page_profile_dicts = []
    counter = 1
    current_page = base_region_url

    #Keep requesting "?pagina=N" until the response URL differs from the URL
    #that was requested.
    #NOTE(review): presumably the site redirects page numbers past the last
    #page, which is what ends this loop -- confirm; there is no other exit.
    while True:
        html = get(current_page)
        if html.url != current_page:
            #no more pagination, results-page scraping is finished
            break
        results_page_profile_dicts.extend(get_profile_urls(current_page, html))
        counter += 1
        current_page = base_region_url + '?pagina=' + str(counter)

    #---- profile pages: scrape every profile found above ----
    list_of_scrapped_profiles = []
    #profile dicts that could not be scraped, kept for inspection / retry
    bad_url_profiles = []
    for profile_dictionary in results_page_profile_dicts:

        print('scraping this profile URL:')
        print(profile_dictionary['profile_url'])
        print('from this url results page:')
        print(profile_dictionary['results_page_url'])
        print('---------------')

        #handle failures per profile so one bad page does not throw away the
        #rest of the region; KeyboardInterrupt is deliberately not caught so
        #the run can be stopped
        try:
            scrapped_profile = profile_extraction(profile_dictionary)
        except (HTTPError, UnboundLocalError) as e:
            print('could not scrape profile, skipping: ' + repr(e))
            bad_url_profiles.append(profile_dictionary)
            continue
        list_of_scrapped_profiles.append(scrapped_profile)

    #build_DF
    temp_df = pd.DataFrame(list_of_scrapped_profiles)
    temp_df['date_scraped'] = datetime.now(pytz.utc)
    temp_df['base_region'] = region_name

    #publish the frame as a notebook-level variable named after the region;
    #the CSV export cell refers to these variables by name
    vars()[region_name] = temp_df
    print('df created for region of: ' + region_name)
    df_created_list.append(region_name)

    elapsed_time = time.time() - start_time
    print('elapsed time is:')
    print(elapsed_time/60)

#NOTE(review): purpose of this print is unclear -- looks like a leftover; confirm and remove
print('  madrid')


In [110]:
#IF THE SCRAPE STOPPED PART-WAY: report how many profile dicts were collected
#for the current region and how many of them have been scraped so far.
print(
    len(results_page_profile_dicts),
    'start scraping at:',
    len(list_of_scrapped_profiles),
    sep='\n',
)
type(results_page_profile_dicts)

10000
start scraping at:
9996


list

In [None]:
#CONTINUE LOOP FROM MAIN SCRAPING PROCESS, starting at the position reported by the previous cell

from urllib.error import HTTPError

#Resume right after the profiles already scraped instead of a hand-copied number.
#NOTE(review): every profile skipped below shifts this position by one on a
#later re-run of this cell; set resume_index by hand if an exact restart
#point is needed.
resume_index = len(list_of_scrapped_profiles)

for profile_dictionary in results_page_profile_dicts[resume_index:]:

    print('scraping this profile URL:')
    print(profile_dictionary['profile_url'])
    print('from this url results page:')
    print(profile_dictionary['results_page_url'])
    print('---------------')
    try:
        scrapped_profile = profile_extraction(profile_dictionary)
    except (HTTPError, UnboundLocalError) as e:
        #report the failure instead of skipping silently
        print('could not scrape profile, skipping: ' + repr(e))
        continue
    list_of_scrapped_profiles.append(scrapped_profile)


In [112]:
#Build the DataFrame for the current region from the scraped profile dicts,
#stamp it with the scrape time (UTC) and the region label, and publish it as a
#notebook-level variable named after the region.
temp_df = pd.DataFrame(list_of_scrapped_profiles).assign(
    date_scraped=datetime.now(pytz.utc),
    base_region=region_name,
)

globals()[region_name] = temp_df
print('df created for region of: ' + region_name)

df created for region of: madrid


In [3]:
# NOTE(review): inspection cell. The recorded output below is a NameError, i.e.
# this was run in a kernel where the scraping cells had not been executed.
# It also displays the whole list; consider removing this cell.
list_of_scrapped_profiles

NameError: name 'list_of_scrapped_profiles' is not defined

In [114]:
#WRITE OUT REGIONAL DATAFRAMES to CSV
#
#Each name below is a per-region DataFrame variable created by the scraping cells.
#Regions noted as saved in 'by_region_dfs.csv' (left out of this batch):
#   soria, segovia, huesca, cuenca, avila, merida, zamora,
#   ciudad_real, palencia, pontevedra, toledo, guadalajara, ceuta, melilla,
#   caceres, santiago_de_compostela, lugo, gerona, orense, jaen, cadiz, leon,
#   tarragona, lerida, salamanca, huelva, badajoz, logrono,
#   castellon_de_la_plana, santander, albacete, burgos, san_sebastian,
#   almeria, pamplona, tenerife, oviedo, granada, la_coruna, vitoria,
#   valladolid, cordoba, alicante, bilbao, las_palmas_de_gran_canaria,
#   palma, murcia

#this batch: malaga through madrid, stacked in this order
by_region_dfs_concat = pd.concat(
    [malaga, zaragoza, sevilla, valencia, barcelona, madrid]
)
by_region_dfs_concat.to_csv('by_region_dfs_MALAGA_to_MADRID.csv')