In [28]:
import requests
import pandas as pd
import numpy as np
import time
import re
import souper_v2
from tqdm import tqdm_notebook
from torrequest import TorRequest
from bs4 import BeautifulSoup
from retrying import retry

In [11]:
# Instantiate the project-local scraping helper from `souper_v2`.
# `extract_national_teams_ranking` further down reads this module-level `sp`
# and calls `sp.get_soup(url)` on it.
sp = souper_v2.souper()

In [4]:
@retry(wait_fixed=3000, stop_max_attempt_number=5)
def get_soup(URL):
    """Download ``URL`` through Tor and return the parsed page.

    A new Tor identity is requested before every attempt. Any exception
    raised inside the function (connection failure, timeout, HTTP error
    status) makes ``retrying`` wait 3 seconds and try again; after 5 failed
    attempts the last exception is re-raised instead of looping forever.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    Exception
        Whatever the final attempt raised (e.g. ``requests.HTTPError`` for a
        4xx/5xx answer, ``requests.Timeout`` when the server does not reply).
    """
    headers = {"User-Agent":'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.7'}
    with TorRequest() as tr:
        tr.reset_identity()
        # Without a timeout a stalled Tor circuit would block forever and the
        # retry decorator would never get a chance to run.
        r = tr.get(URL, headers=headers, timeout=30)
        # Turn HTTP error pages into exceptions so they are retried rather
        # than silently parsed as if they were the requested content.
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')

## Getting club teams ranking

In [24]:
def extract_ranking_from_result(soup):
    """Collect the ranking cell of every row on a result page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed result page; only its ``find_all`` method is used.

    Returns
    -------
    list
        Stripped text of each ``<div class="rank rankingRight">`` in document
        order. An element whose text cannot be read contributes ``np.nan`` so
        the list keeps one entry per element.
    """
    ranking = []
    for each in soup.find_all('div', attrs={'class': 'rank rankingRight'}):
        try:
            ranking.append(each.text.strip())
        except Exception:
            # Best effort: keep a placeholder for this row. `Exception` (not a
            # bare except) lets Ctrl-C / SystemExit still stop the scrape.
            ranking.append(np.nan)
    return ranking

In [25]:
def extract_name_from_result(soup):
    """Collect the team-name cell of every row on a result page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed result page; only its ``find_all`` method is used.

    Returns
    -------
    list
        Stripped text of each ``<div class="col-name lineheigt">`` in document
        order. An element whose text cannot be read contributes ``np.nan`` so
        the list keeps one entry per element.
    """
    name = []
    # 'lineheigt' is the class string this scraper has always queried; it is
    # kept verbatim because it has to match the page markup, not a dictionary.
    for each in soup.find_all('div', attrs={'class': 'col-name lineheigt'}):
        try:
            name.append(each.text.strip())
        except Exception:
            # Best effort: keep a placeholder for this row. `Exception` (not a
            # bare except) lets Ctrl-C / SystemExit still stop the scrape.
            name.append(np.nan)
    return name

In [26]:
def extract_country_from_result(soup):
    """Collect a three-letter country tag for every row on a result page.

    The country is read from the ``title`` attribute that closes an empty
    ``<i>`` tag inside each ``<div class="nat RankingRight">``; its first three
    characters are upper-cased (e.g. ``"Spain"`` -> ``"SPA"``). This is a
    prefix of the title, not an ISO country code.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed result page; only its ``find_all`` method is used.

    Returns
    -------
    list
        One entry per matching ``<div>`` in document order: the upper-cased
        three-character prefix, or ``np.nan`` when no title could be found.
    """
    country = []
    for each in soup.find_all('div', attrs={'class': 'nat RankingRight'}):
        try:
            # `[^"]+` keeps the capture inside one attribute value. The former
            # greedy `.+` could start at an earlier `title="` in the same
            # element and return the first letters of the wrong attribute.
            match = re.search(r'title="([^"]+)"></i>', str(each))
            code = match.group(1)[0:3].upper() if match else np.nan
        except Exception:
            # Best effort: keep a placeholder for this row. `Exception` (not a
            # bare except) lets Ctrl-C / SystemExit still stop the scrape.
            code = np.nan
        country.append(code)
    return country

In [29]:
# Three parallel columns of the club ranking table; row i of each list must
# describe the same club. Re-created here so re-running the cell starts clean.
results = {'Ranking': [],
           'Team': [],
           'Country': []}

# Walk the paginated ranking via the `index` query parameter in steps of 25
# (0, 25, ..., 175) — presumably 25 clubs per page; verify against the site.
for start in tqdm_notebook(range(0, 200, 25)):
    url = "https://www.clubworldranking.com/ranking-clubs?wd=18&yr=2019&index={}".format(start)
    # Time out instead of hanging on a stalled connection, and fail loudly on
    # an HTTP error rather than parsing an error page as ranking data.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    page_rankings = extract_ranking_from_result(soup)
    page_teams = extract_name_from_result(soup)
    page_countries = extract_country_from_result(soup)
    # The extractors run independently; if one finds fewer elements the
    # columns would shift against each other and silently mislabel clubs.
    if not (len(page_rankings) == len(page_teams) == len(page_countries)):
        raise ValueError(
            "Column length mismatch on {}: {} rankings, {} teams, {} countries".format(
                url, len(page_rankings), len(page_teams), len(page_countries)))

    results['Ranking'] += page_rankings
    results['Team'] += page_teams
    results['Country'] += page_countries

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [31]:
# Assemble the scraped 'Ranking', 'Team' and 'Country' lists into one table.
ranking_df = pd.DataFrame(results)

In [32]:
# Preview the first rows as a quick sanity check of the scrape.
ranking_df.head()

Unnamed: 0,Ranking,Team,Country
0,1,Barcelona,SPA
1,2,Real Madrid,SPA
2,3,River Plate,ARG
3,4,Manchester City,ENG
4,5,Atlético Madrid,SPA


In [33]:
# Persist the club ranking, then read the very same file back and preview it
# to confirm the CSV round trip. The destination is bound once so the write
# and the read cannot drift apart.
# NOTE(review): this is an absolute, machine-specific path.
clubs_ranking_csv = r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/final_clubs_teams_ranking.csv'

ranking_df.to_csv(clubs_ranking_csv, index=False)
loading_test_clubs = pd.read_csv(clubs_ranking_csv)

loading_test_clubs.head()

Unnamed: 0,Ranking,Team,Country
0,1,Barcelona,SPA
1,2,Real Madrid,SPA
2,3,River Plate,ARG
3,4,Manchester City,ENG
4,5,Atlético Madrid,SPA


## Getting national teams ranking

In [34]:
# Build the scraping helper used by `extract_national_teams_ranking`.
# The notebook imports `souper_v2` only; the bare name `souper` is never
# imported, so the previous `souper.souper()` raised NameError on a fresh
# kernel.
sp = souper_v2.souper()

In [12]:
def extract_national_teams_ranking(n_pages=7):
    """Scrape the national-team world ranking from transfermarkt.co.uk.

    Pages ``1..n_pages`` of the ranking list are fetched with the module-level
    ``sp`` helper (``sp.get_soup``). Every non-empty link carrying the class
    string ``vereinprofil_tooltip tooltipstered`` is taken as one team, and
    teams are numbered in the order in which they are encountered.

    Parameters
    ----------
    n_pages : int, default 7
        Number of result pages to walk, starting at page 1. The default
        reproduces the previously hard-coded ``range(1, 8)``.

    Returns
    -------
    dict
        ``{'Country': [...], 'Position': [...]}`` with two equally long lists.
        ``Position`` is the 1-based running count of collected entries. A link
        whose text cannot be read contributes ``np.nan`` to both lists.
    """
    ranking = {'Country': [], 'Position': []}

    for page in tqdm_notebook(range(1, n_pages + 1)):
        url = "https://www.transfermarkt.co.uk/statistik/weltrangliste?page={}".format(page)
        soup = sp.get_soup(url)

        for each in soup.find_all('a', attrs={'class': 'vereinprofil_tooltip tooltipstered'}):
            try:
                team = each.text.strip()
            except Exception:
                # Best effort: keep a placeholder row. `Exception` (not a bare
                # except) lets Ctrl-C / SystemExit still stop the scrape.
                ranking['Country'].append(np.nan)
                ranking['Position'].append(np.nan)
                continue

            # Links with no visible text are skipped entirely.
            if team:
                ranking['Country'].append(team)
                # Position is derived from how many entries exist so far, not
                # read from the page.
                ranking['Position'].append(len(ranking['Country']))

    return ranking

In [None]:
# Run the national-team scrape; the result is a dict holding the 'Country'
# and 'Position' lists.
test_nat = extract_national_teams_ranking()

In [17]:
# Build the national-team ranking table from the scraped dict.
ranking_nat_df = pd.DataFrame(test_nat)
# The following cells preview and export this frame under the name
# `test_nat_df`, which was never assigned; bind it to the same frame so the
# notebook survives Restart & Run All.
test_nat_df = ranking_nat_df

In [19]:
# Preview the first rows of the national-team ranking table.
test_nat_df.head()

Unnamed: 0,Country,Position
0,Belgium,1
1,France,2
2,Brazil,3
3,England,4
4,Croatia,5


In [21]:
# Write the national-team ranking to CSV without the index column.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory instead.
test_nat_df.to_csv(r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/final_nat_teams_ranking.csv',index=False)


In [22]:
# Read back the national-team CSV (same path as the export) — presumably a
# round-trip check of the saved file.
loading_test = pd.read_csv('/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/final_nat_teams_ranking.csv')

In [23]:
# Preview the first rows of the reloaded file.
loading_test.head()

Unnamed: 0,Country,Position
0,Belgium,1
1,France,2
2,Brazil,3
3,England,4
4,Croatia,5
