In [2]:
import pandas as pd
import numpy as np
import time
import re
import souper_v2
from tqdm import tqdm_notebook
from torrequest import TorRequest
from bs4 import BeautifulSoup
from retrying import retry

In [3]:
# Load the list of sampled players. The cell output further down shows the columns
# player, lastname, team, link_tm and link_ws.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
sampled_players = pd.read_csv('/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/final_list_of_sampled_players.csv')

In [4]:
# Scraper helper from the project-local souper_v2 module; its get_soup(URL) method is
# what extract_player_info_block calls to download pages.
# NOTE(review): souper_v2 is not shown in this notebook, so its behaviour is unknown here.
sp = souper_v2.souper()

In [5]:
@retry(wait_fixed=3000)
def get_soup(URL):
    """Fetch ``URL`` through Tor and return the page parsed by BeautifulSoup.

    A fresh ``TorRequest`` session is opened on every call and
    ``reset_identity()`` is invoked before the request is sent with a fixed
    Firefox ``User-Agent`` header. The response text is parsed with the
    ``'html.parser'`` backend.

    The ``@retry(wait_fixed=3000)`` decorator is configured with a fixed wait
    only (the ``retrying`` library expresses it in milliseconds); no stop
    condition is passed, so the decorator itself puts no limit on the number
    of attempts.

    Parameters
    ----------
    URL : str
        Address of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the response body.
    """
    browser_headers = {"User-Agent":'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.7'}
    with TorRequest() as tor_session:
        # Reset the Tor identity before issuing the request.
        tor_session.reset_identity()
        response = tor_session.get(URL, headers=browser_headers)
        parsed_page = BeautifulSoup(response.text, 'html.parser')
    return parsed_page

In [11]:
# Split the sampled players into batches so every scraping run stays small.
# The first six batches each take 179 rows out of a 180-row stride, which leaves
# the last row of every stride (positions 179, 359, ..., 1079) out of them.
batch_starts = range(0, 1080, 180)
sample_1, sample_2, sample_3, sample_4, sample_5, sample_6 = (
    sampled_players.iloc[start:start + 179] for start in batch_starts
)
# Everything from position 1080 to the end of the frame.
sample_7 = sampled_players.iloc[1080:]
# The six rows skipped by the strides above, gathered into their own batch.
sample_8 = sampled_players.iloc[[start + 179 for start in batch_starts]]

## WhoScored scraping

**Obtaining players' basic information**

In [7]:
# Preview the first rows; link_ws holds the path that is appended to the whoscored.com base URL.
sampled_players.head()

Unnamed: 0,player,lastname,team,link_tm,link_ws
0,Max Clark,Clark,Vitesse,/max-clark/profil/spieler/183291,/Players/313996/Show/Max-Clark
1,Sergio Postigo,Postigo,Levante,/sergio-postigo/profil/spieler/158791,/Players/109896/Show/Sergio-Postigo
2,Stanislav Iljutcenko,Iljutcenko,Duisburg,/stanislav-iljutcenko/profil/spieler/191292,/Players/132901/Show/Stanislav-Iljutcenko
3,Iván Marcone,Marcone,Boca Juniors,/ivan-marcone/profil/spieler/90451,/Players/125246/Show/Iván-Marcone
4,Florent Mollet,Mollet,Montpellier,/florent-mollet/profil/spieler/222859,/Players/114169/Show/Florent-Mollet


In [8]:
def _first_or_nan(matches, strip=False):
    """Return the first regex match (optionally stripped), or NaN if there is none."""
    if len(matches) == 0:
        return np.nan
    return matches[0].strip() if strip else matches[0]


def extract_player_info_block(players_df):
    """Scrape name, position, height and weight for each player from whoscored.com.

    For every row of ``players_df`` the page at
    ``'https://www.whoscored.com' + link_ws`` is downloaded with
    ``sp.get_soup`` (``sp`` is the module-level scraper object), all
    ``<dl class="player-info-block">`` elements are serialised to text, and
    regular expressions pull the individual fields out of that text.

    Parameters
    ----------
    players_df : pandas.DataFrame
        Must provide a ``link_ws`` column (path part of the player URL) and a
        ``team`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input player, with columns ``name``, ``position``,
        ``height``, ``weight`` and ``team``. Scraped fields are the raw
        strings captured by the regular expressions (``position`` is
        whitespace-stripped); a field that is not found is ``NaN``. ``team``
        is copied unchanged from the input.
    """
    links = players_df.link_ws
    teams = players_df.team

    player_info = {'name': [],
                   'position': [],
                   'height': [],
                   'weight': [],
                   'team': []}

    # Passing ``total`` lets the progress bar show completion instead of a bare counter
    # (zip() has no length of its own).
    for each_link, each_team in tqdm_notebook(zip(links, teams), total=len(players_df)):
        URL = 'https://www.whoscored.com' + each_link
        print(URL)
        soup = sp.get_soup(URL)

        # Serialise every info block into one string for the regex searches below.
        # No blanket try/except here: a failure should surface, not be silently dropped.
        text = "".join(
            str(each) for each in soup.find_all('dl', attrs={'class': 'player-info-block'})
        )

        # The patterns rely on the exact tag/newline layout of the serialised blocks.
        name = re.findall(r'((?<=Name:</dt>\n<dd>).+(?=</dd>\n</dl><dl class))', text)
        height = re.findall(r'((?<=Height:</dt>\n<dd>).+(?=cm</dd>))', text)
        position = re.findall(r'((?<=Positions:</dt>\n<dd>\n<ul>\n<li>).*(?= |</li>))', text)
        weight = re.findall(r'((?<=Weight:</dt>\n<dd>).+(?=kg))', text)

        # Append exactly one value per column so all lists stay aligned row by row.
        player_info['name'].append(_first_or_nan(name))
        player_info['position'].append(_first_or_nan(position, strip=True))
        player_info['height'].append(_first_or_nan(height))
        player_info['weight'].append(_first_or_nan(weight))
        player_info['team'].append(each_team)

    player_info_df = pd.DataFrame(player_info)

    return player_info_df

### Obtaining data by parts

In [None]:
# Scrape basic info for batch 1 (one page request per player in sample_1).
sample_data_1 = extract_player_info_block(sample_1)

In [None]:
# Scrape basic info for batch 2 (one page request per player in sample_2).
sample_data_2 = extract_player_info_block(sample_2)

In [None]:
# Scrape basic info for batch 3 (one page request per player in sample_3).
sample_data_3 = extract_player_info_block(sample_3)

In [None]:
# Scrape basic info for batch 4 (one page request per player in sample_4).
sample_data_4 = extract_player_info_block(sample_4)

In [None]:
# Scrape basic info for batch 5 (one page request per player in sample_5).
sample_data_5 = extract_player_info_block(sample_5)

In [34]:
# Wrap each scraped batch in a DataFrame under the df_N names used by the save step.
# (extract_player_info_block already returns a DataFrame, so this only re-wraps it.)
df_1, df_2, df_3, df_4, df_5 = (
    pd.DataFrame(scraped_batch)
    for scraped_batch in (sample_data_1, sample_data_2, sample_data_3,
                          sample_data_4, sample_data_5)
)

In [35]:
# Back up batches 1-5 to disk, one CSV per batch, without the index column.
# NOTE(review): these files are named "Basico_info_N" while batches 6-8 are saved as
# "Basic_info_N" - confirm which spelling downstream readers expect.
backup_paths = [
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basico_info_1',
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basico_info_2',
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basico_info_3',
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basico_info_4',
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basico_info_5',
]
for part_df, backup_path in zip((df_1, df_2, df_3, df_4, df_5), backup_paths):
    part_df.to_csv(backup_path, index=False)

In [None]:
# Scrape basic info for batch 6 (one page request per player in sample_6).
sample_data_6 = extract_player_info_block(sample_6)

In [None]:
# Scrape basic info for batch 7 (one page request per player in sample_7).
sample_data_7 = extract_player_info_block(sample_7)

In [None]:
# Scrape basic info for batch 8: the rows at positions 179, 359, ..., 1079 that the
# 179-wide slices of batches 1-6 leave out.
sample_data_8 = extract_player_info_block(sample_8)

In [14]:
# Wrap the remaining scraped batches in DataFrames under the df_N names used by the save step.
# (extract_player_info_block already returns a DataFrame, so this only re-wraps it.)
df_6, df_7, df_8 = (
    pd.DataFrame(scraped_batch)
    for scraped_batch in (sample_data_6, sample_data_7, sample_data_8)
)

In [15]:
# Back up batches 6-8 to disk, one CSV per batch, without the index column.
backup_paths = [
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basic_info_6',
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basic_info_7',
    r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Back up files/Parts of statistics/Basic_info_8',
]
for part_df, backup_path in zip((df_6, df_7, df_8), backup_paths):
    part_df.to_csv(backup_path, index=False)

### Cleaning and saving

In [16]:
# Load the previously saved Basic_info file (shape (895, 5) per the next cell's output).
# NOTE(review): presumably this holds batches 1-5 (5 x 179 rows = 895) - confirm how it was produced.
test_load = pd.read_csv('/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/Basic_info')

In [17]:
# Sanity check: row and column count of the file just loaded.
test_load.shape

(895, 5)

In [18]:
# Append the newly scraped batches (6-8) to the rows loaded from the Basic_info file.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# input keeps its own labels and the result carries duplicate index values, which
# makes label-based lookups (.loc) ambiguous.
partial_final_dataset = pd.concat([test_load, df_6, df_7, df_8], axis=0, ignore_index=True)

In [19]:
# Save the combined dataset without the index column.
# NOTE(review): this overwrites the same Basic_info file that test_load was read from,
# so re-running the load/concat/save cells appends batches 6-8 again and duplicates rows.
partial_final_dataset.to_csv(r'/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/Basic_info',index=False)


In [20]:
# Preview the combined dataset (name, position, height, weight, team).
partial_final_dataset.head()

Unnamed: 0,name,position,height,weight,team
0,Max Clark,Defender (Left),180,73,Vitesse
1,Sergio Postigo,Defender (Centre),184,78,Levante
2,Stanislav Iljutcenko,Forward,189,82,Duisburg
3,Iván Marcone,Defensive Midfielder (C),184,79,Boca Juniors
4,Florent Mollet,"Attacking Midfielder (Centre, Left)",174,67,Montpellier


In [21]:
# Re-read the file that was just written to verify the save; this rebinds test_load
# to the combined data.
test_load = pd.read_csv('/Users/gonzaloferreiro/Documents/GA_Materials/DS_Inmersive/DSI8-lessons/projects/project-capstone/Moving_forward/Final_datasets/Basic_info')


In [22]:
# Sanity check: shape of the reloaded file, i.e. of the combined dataset after the save.
test_load.shape

(1200, 5)