In [1]:
import requests

In [2]:
import logging
import requests
import json
import random
import os

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Install the default handler/formatter on the root logger, then create the
# notebook-level logger registered under the name 'fetch'.
logging.basicConfig()
logger = logging.getLogger(name='fetch')


def random_user_agent():
    """Build request headers carrying a randomly chosen browser User-Agent.

    Returns
    -------
    dict
        Single-entry mapping ``{'User-Agent': <string>}`` where the string is
        drawn uniformly (via ``random.choice``) from the fixed pool below.
    """
    # Chrome user agents
    chrome_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    ]

    # Internet Explorer user agents (MSIE / Trident tokens)
    explorer_agents = [
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
    ]

    # Same pool, same order as one flat list, so a seeded RNG picks the
    # same entry as before.
    chosen_agent = random.choice(chrome_agents + explorer_agents)

    return {'User-Agent': chosen_agent}




In [3]:
def get_page_html(
    link,
    retry_params=None,
    headers=None,
    timeout=None,
    session=None,
    proxies=None
    ):
    """Download a page over HTTP(S) with automatic retries.

    Parameters
    ----------
    link : str
        URL to request.
    retry_params : dict, optional
        Overrides for the retry defaults. Recognised keys are ``retries``
        (default 5; used for the total, read and connect budgets),
        ``backoff_factor`` (default 0.3) and ``status_forcelist``
        (default ``(500, 502, 504)``).
    headers : dict, optional
        Request headers. Defaults to a random User-Agent header produced by
        ``random_user_agent()``.
    timeout : float or tuple, optional
        Timeout forwarded to ``session.get``. Defaults to ``(5, 14)``, which
        requests interprets as (connect, read) seconds.
    session : requests.Session, optional
        Session used for the request; a new one is created when omitted.
        The retry adapter is mounted on this session for ``http://`` and
        ``https://``, so a session passed in by the caller is modified.
    proxies : dict, optional
        Proxy mapping forwarded to ``session.get``. Defaults to ``{}``.

    Returns
    -------
    dict
        ``{'status': <HTTP status code>, 'page': <response object>}``.
    """

    if retry_params is None:
        retry_params = {}

    # Caller-supplied values take precedence over the defaults.
    retry_params = {
        'retries': 5,
        'backoff_factor': 0.3,
        'status_forcelist': (500, 502, 504),
        **retry_params
    }

    if headers is None:
        headers = random_user_agent()

    if timeout is None:
        timeout = (5, 14)

    if session is None:
        session = requests.Session()

    # `proxies` used to be read here without being a parameter, which made
    # every call fail with UnboundLocalError.
    if proxies is None:
        proxies = {}

    retry = Retry(
        total=retry_params.get('retries'),
        read=retry_params.get('retries'),
        connect=retry_params.get('retries'),
        backoff_factor=retry_params.get('backoff_factor'),
        status_forcelist=retry_params.get('status_forcelist'),
    )

    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # Forward timeout and proxies: previously both were prepared but never
    # passed on, so requests ran without any timeout.
    page = session.get(
        link,
        headers=headers,
        timeout=timeout,
        proxies=proxies
    )

    return {'status': page.status_code, 'page': page}

In [None]:
class XingProfileHTMLExtractor():
    """Extract company information from the HTML of a Xing profile page.

    Parameters
    ----------
    page
        Response-like object whose ``content`` attribute holds the page HTML.
    xpath_criteria : optional
        Stored on the instance; not used by any method of this class.

    Notes
    -----
    NOTE(review): ``BeautifulSoup``, ``_standard_country_name`` and
    ``_standard_city_name`` are used below but are not defined or imported
    in the visible cells -- confirm they are available before running.
    """
    def __init__(self, page, xpath_criteria=None):
        self.page = page
        self.source = 'xing'
        self.xpath_criteria = xpath_criteria

    @staticmethod
    def _get_xing_company_fact(soup_facts):
        """Parse the Xing company fact table.

        Parameters
        ----------
        soup_facts
            BeautifulSoup element containing the facts as ``dt`` (label) and
            ``dd`` (value) tags.

        Returns
        --------
        dict
            Facts keyed by label. Known German/English labels are renamed to
            ``profile__info__employees``, ``profile__info__branch``,
            ``profile__info__product_and_services`` and
            ``profile__info__founded``; unknown labels are kept as found.
            ``profile__info__employees`` is always present: the cleaned value,
            or ``None`` when it is missing or empty.
        """
        labels = []
        values = []

        for tag in soup_facts.find_all(['dt', 'dd']):
            if tag.name == 'dt':
                labels.append(tag.text)
            elif tag.name == 'dd':
                values.append(tag.text)

        # Labels and values are paired purely by position.
        res = dict(zip(labels, values))

        rename = {
            'Unternehmensgröße': 'profile__info__employees',
            'Branche': 'profile__info__branch',
            'Produkte und Services': 'profile__info__product_and_services',
            'Gründungsjahr': 'profile__info__founded',
            'Employees': 'profile__info__employees',
            'Industry': 'profile__info__branch',
            'Products and services': 'profile__info__product_and_services',
            'Year of establishment': 'profile__info__founded'
            }

        for key, val in rename.items():
            # Membership test (not truthiness) so that facts with an empty
            # value are normalised to the standard key as well.
            if key in res:
                res[val] = res.pop(key)

        employees = res.get('profile__info__employees')
        if employees:
            # Drop the unit suffix in either language, e.g. "51-200 employees".
            res['profile__info__employees'] = employees.replace(
                ' Mitarbeiter', ''
            ).replace(' employees', '').strip()
        else:
            res['profile__info__employees'] = None

        return res

    @staticmethod
    def _get_xing_company_about(soup_about):
        """Extract the "about us" text from its soup element.

        Newlines, carriage returns and tabs are each replaced by a single
        space and the result is stripped. A falsy ``soup_about`` yields an
        empty string.

        Returns
        -------
        dict
            ``{'profile__info__about': <text>}``.
        """
        if soup_about:
            about_us = soup_about.text.replace(
                '\n',
                ' '
            ).replace(
                '\r',
                ' '
            ).replace(
                '\t',
                ' '
            ).strip()
        else:
            about_us = ''

        return {
            'profile__info__about': about_us
        }

    @staticmethod
    def _get_xing_company_contact(soup_contact):
        """Extract contact details from the contact-info soup element.

        Each field is taken from the first tag matching its ``itemprop``
        attribute; a field without a match is ``None``. A non-empty country
        is passed through ``_standard_country_name`` and a non-empty locality
        through ``_standard_city_name`` (with the country as
        ``country_code``).

        Returns
        -------
        dict
            Keys ``profile__contact__street_address``,
            ``profile__contact__postal_code``, ``profile__contact__locality``,
            ``profile__contact__country``, ``profile__contact__phone``,
            ``profile__contact__url`` and ``profile__contact__fax``.
        """

        def first_text(tag_name, itemprop):
            # Text of the first matching tag, or None when nothing matches.
            found = soup_contact.find_all(
                tag_name, attrs={'itemprop': itemprop}
            )
            return found[0].text if found else None

        url_links = soup_contact.find_all(
            'a', attrs={
                'itemprop': 'url',
                'data-tracking-event-name': 'PropOutboundLinkUrl'
                }
            )

        profile__contact__country = first_text('div', 'addressCountry')
        if profile__contact__country:
            profile__contact__country = _standard_country_name(
                profile__contact__country
                )

        profile__contact__locality = first_text('span', 'addressLocality')
        if profile__contact__locality:
            profile__contact__locality = _standard_city_name(
                profile__contact__locality,
                country_code=profile__contact__country
                )

        return {
            'profile__contact__street_address': first_text(
                'div', 'streetAddress'
            ),
            'profile__contact__postal_code': first_text('span', 'postalCode'),
            'profile__contact__locality': profile__contact__locality,
            'profile__contact__country': profile__contact__country,
            'profile__contact__phone': first_text('span', 'telephone'),
            'profile__contact__url': url_links[0].get('href')
                                            if url_links
                                            else None,
            'profile__contact__fax': first_text('span', 'faxNumber')
        }

    def get_company_profile(self):
        """Build the company profile from the HTML of the Xing page.

        Parses ``self.page.content`` and merges the facts, "about us" and
        contact sections; a section that is not found contributes nothing.

        Returns
        -------
        dict
            The merged profile. It is also stored on the instance as
            ``company_profile`` and, for backward compatibility, under the
            historical misspelling ``compnany_profile``.
        """
        soup = BeautifulSoup(self.page.content, features='html.parser')

        # Section that contains the facts of the company
        section_fact = soup.find_all(
            'section', attrs={
                'class': 'facts foundation-col-tablet-50 foundation-col-phone-100'
                }
            )
        # Divs that contain the "about us" text and the contact details
        div_about = soup.find_all('div', attrs={'id': 'about-us-content'})
        div_contact = soup.find_all('div', attrs={'id': 'contact-info'})

        company_profile_fact = {}
        company_profile_about = {}
        company_profile_contact = {}

        if section_fact:
            company_profile_fact = self._get_xing_company_fact(section_fact[0])

        if div_about:
            company_profile_about = self._get_xing_company_about(div_about[0])

        if div_contact:
            company_profile_contact = self._get_xing_company_contact(
                div_contact[0]
            )

        profile = {
            **company_profile_fact,
            **company_profile_about,
            **company_profile_contact
        }

        self.company_profile = profile
        # Misspelled legacy attribute kept so existing callers keep working.
        self.compnany_profile = profile

        return profile


In [4]:
import zipfile

In [5]:
# Unpack every member of the downloaded Indego trips archive into the
# relative directory "targetdir".
# NOTE(review): the archive path is absolute and machine-specific; consider a
# configurable data directory.
with zipfile.ZipFile(
        file="/Users/leima/Downloads/indego-trips-2019-q1.csv.zip",
        mode="r",
) as zip_ref:
    zip_ref.extractall(path="targetdir")

In [7]:
# Drop the trailing ".zip" extension from the archive path.
os.path.splitext("/Users/leima/Downloads/indego-trips-2019-q1.csv.zip")[0]

'/Users/leima/Downloads/indego-trips-2019-q1.csv'

In [11]:
# Demonstrates that `pass` is a no-op: the even-number branch does nothing,
# so execution falls through to print() and every value 0-9 is printed.
for i in range(10):
    if i%2==0:
        pass
    print(i)

0
1
2
3
4
5
6
7
8
9


In [14]:
import urllib

In [15]:
# Download the Indego 2019-Q1 trips archive to /tmp/zip.zip.
# NOTE(review): the recorded run of this cell ended with
# "HTTP Error 403: Forbidden"; presumably the host rejects the default urllib
# client -- confirm, and consider sending a browser User-Agent instead.
urllib.request.urlretrieve(
    url='https://u626n26h74f16ig1p3pt0f2g-wpengine.netdna-ssl.com/wp-content/uploads/2019/04/indego-trips-2019-q1.csv.zip',
    filename='/tmp/zip.zip',
)

HTTPError: HTTP Error 403: Forbidden