In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Silence every Python warning for the rest of the session. This also hides
# pandas warnings, so genuine problems may go unnoticed.
import warnings
warnings.filterwarnings('ignore')

# custom functions for pulling
# NOTE(review): the star import hides which names come from db_functions; the
# cells in this notebook call df_fetch_resourceid, df_fetch_pgp,
# df_fetch_newplayers and df_fetch_price, which are presumably defined there.
from db_functions import *

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Player Database

We start by loading our data and removing some variables we don't need. 

In [2]:
# Columns that are not needed downstream.
cols_drop = [
    'origin', 'player_extended_name', 'price_xbox', 'price_pc', 'specialties',
    'cb', 'lb', 'rb', 'rwb', 'lwb', 'cdm', 'cm', 'cam', 'lm', 'lw', 'lf', 'rm',
    'rw', 'rf', 'cf', 'st', 'price_ps4', 'gk_positoning', 'gk_kicking',
    'gk_speed', 'gk_handling', 'gk_reflexes', 'gk_diving', 'date_of_birth',
    'traits',
]

# Load the raw export, trim it to gold cards, and index it by a copy of
# player_ID so that player_ID itself stays available as a regular column.
df = (
    pd.read_csv('FIFA19 - Ultimate Team players.csv', low_memory=False)
    .drop(columns=cols_drop)
    .loc[lambda d: d.quality.isin(['Gold - Rare', 'Gold'])]
    .reset_index(drop=True)
    # resource_id starts out empty; it is populated by the scraping step.
    .assign(resource_id=np.nan, id=lambda d: d.player_ID)
    .set_index('id')
)

In [3]:
# Peek at the last two rows of the loaded table.
df.tail(n=2)

Unnamed: 0_level_0,player_ID,player_name,quality,revision,overall,club,league,nationality,position,age,height,weight,intl_rep,added_date,pace,pace_acceleration,pace_sprint_speed,dribbling,drib_agility,drib_balance,drib_reactions,drib_ball_control,drib_dribbling,drib_composure,shooting,shoot_positioning,shoot_finishing,shoot_shot_power,shoot_long_shots,shoot_volleys,shoot_penalties,passing,pass_vision,pass_crossing,pass_free_kick,pass_short,pass_long,pass_curve,defending,def_interceptions,def_heading,def_marking,def_stand_tackle,def_slid_tackle,physicality,phys_jumping,phys_stamina,phys_strength,phys_aggression,pref_foot,att_workrate,def_workrate,weak_foot,skill_moves,resource_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
17020,17020,Gelson Martins,Gold - Rare,SBC,82,Atlético Madrid,LaLiga Santander,Portugal,RM,23,173,72,3,2018-10-05,94.0,95,94,86.0,94,92,83,83,85,85,69.0,82,66,72,69,74,66,75.0,79,81,60,78,55,82,47.0,43,51,54,46,36,61.0,65,87,48,60,Right,High,Med,3,5,
17021,17021,Verdi,Gold - Rare,OTW,81,Napoli,Serie A TIM,Italy,RW,26,171,61,2,2018-10-05,87.0,88,86,85.0,92,90,78,86,84,78,73.0,76,67,81,77,74,68,81.0,82,82,84,82,76,84,48.0,36,42,55,48,60,55.0,51,75,48,48,Left,High,Med,5,4,


In [4]:
# df.shape is (rows, columns): one row per player, one column per feature.
n_players, n_features = df.shape
print('As of now, we have {} players in our database and no prices.'.format(n_players))
print('Our data consists of {} features.'.format(n_features))

As of now, we have 2920 players in our database and no prices.
Our data consists of 55 features.


# Data Mining

We wish to add all the gold players (including special cards) along with daily prices. To do so, we need to do the following:
1. Scrape futbin and collect the resource id for every player listed above, and add that to our dataframe.
2. Build a script that adds all the new players that are not in our dataframe (collects all relevant data including the resource id). 
3. Collect the prices on all the players in our database using the resource id. Construct multiple observations for each player, one for each price-point. 


#### Step 1

To scrape the resource id, we use helper functions from `db_functions` (imported at the top of the notebook) that rely on requests and Beautiful Soup to parse the HTML of each player's page on futbin.

In [6]:
# Scrape each player's futbin resource id (helper imported from db_functions).
# NOTE(review): presumably returns `df` with resource_id filled in — confirm
# against the helper's implementation.
df_p = df_fetch_resourceid(df)

Turns out, we also need the PGP data for each player (number of games, average goals and average assists), so we parse that too. 

In [7]:
# Collapse to one row per player_ID (keeping the last non-null value of every
# column) and add empty placeholders for the PGP statistics.
df_p = (
    df_p
    .groupby('player_ID')
    .last()
    .assign(num_games=np.nan, avg_goals=np.nan, avg_assists=np.nan)
)

In [8]:
# Parse the PGP data (num_games, avg_goals, avg_assists) for every player.
# The input and result must both be `df_p`: it is the frame every later cell
# works on, whereas `df_players` is never defined in this notebook and would
# raise a NameError on a fresh kernel.
df_p = df_fetch_pgp(df_p)

#### Step 2

We have collected the resource_id for the players in our dataframe. To proceed, we're going to write some functions to collect data on players that are not in our dataframe and add them. 

In [9]:
# Add the players that are not yet in the table.
# NOTE(review): 17503 is presumably the highest player id to scrape up to —
# confirm against df_fetch_newplayers in db_functions.
df_p = df_fetch_newplayers(17503, df_p)

Before we move on to step 3, we should filter our constructed dataframe so that we only keep the players we are actually interested in.
We remove:
- Goalkeepers
- Silver players
- Bronze players

There are various players that are not available to be purchased, e.g. Flashback SBC players, but those will be dealt with later.

In [12]:
# Keep gold cards only (the rare label appears as both 'Gold - Rare' and
# 'gold rare'), label cards without a revision as 'Normal', and drop goalkeepers.
df_p = (
    df_p
    .loc[lambda d: d.quality.isin(['Gold - Rare', 'Gold', 'gold rare'])]
    .assign(revision=lambda d: d.revision.fillna('Normal'))
    .loc[lambda d: d.position != 'GK']
)

#### Player Database is Ready

We save the player database before proceeding to getting the prices for each player.

In [13]:
# Save the player table to the same location the "Dataframe Update" section
# reads it back from, so a top-to-bottom run loads what was just written.
# The `data/` directory must already exist.
df_p.to_csv('data/player_database.csv')

We now fetch the prices for each player in our database. This process will probably take a long time, depending on the size of the player database. 

In [14]:
# Collect the prices for every player in df_p (helper from db_functions).
# NOTE(review): presumably returns one row per player and price date — confirm
# against the helper's implementation.
df_prices = df_fetch_price(df_p)

In [15]:
# Save the price table under data/, matching the path used by the
# "Dataframe Update" section. The `data/` directory must already exist.
df_prices.to_csv('data/prices_database.csv')

# Dataframe Update

To update our dataframe when new players are added to the game, we do the following:
- Read in our player dataframe
- Scrape the data for all the new players
- Fetch their prices

In [10]:
# Reload the saved player table, indexed by player_ID and with added_date
# parsed as a datetime column.
df_p = pd.read_csv(
    'data/player_database.csv',
    index_col='player_ID',
    parse_dates=['added_date'],
)

# Add the players released since the table was last saved.
df_p = df_fetch_newplayers(20310, df_p)

In [11]:
# Peek at the two most recently added rows of the player table.
df_p.tail(n=2)

Unnamed: 0_level_0,player_name,quality,revision,overall,club,league,nationality,position,age,height,weight,intl_rep,added_date,pace,pace_acceleration,pace_sprint_speed,dribbling,drib_agility,drib_balance,drib_reactions,drib_ball_control,drib_dribbling,drib_composure,shooting,shoot_positioning,shoot_finishing,shoot_shot_power,shoot_long_shots,shoot_volleys,shoot_penalties,passing,pass_vision,pass_crossing,pass_free_kick,pass_short,pass_long,pass_curve,defending,def_interceptions,def_heading,def_marking,def_stand_tackle,def_slid_tackle,physicality,phys_jumping,phys_stamina,phys_strength,phys_aggression,pref_foot,att_workrate,def_workrate,weak_foot,skill_moves,resource_id,num_games,avg_goals,avg_assists
player_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
20309,Alexander Isak,gold rare,IF,77,Willem II,Eredivisie,Sweden,ST,19,190,74,1,2019-04-10,85.0,87,83,77.0,85,66,73,79,77,68,77.0,84,81,74,67,87,62,64.0,72,62,43,72,49,49,31.0,27,74,31,22,18,66.0,76,75,67,46,Right,Med,Med,3,3,67342595,-,-,-
20310,Robert Lewandowski,gold rare,TIF,94,FC Bayern München,Bundesliga,Poland,ST,30,184,80,4,2019-04-10,85.0,83,85,92.0,84,84,97,96,92,93,93.0,95,95,91,87,93,91,84.0,86,69,96,92,72,86,45.0,42,93,37,46,20,87.0,89,82,89,84,Right,High,Med,4,4,151183489,-,-,-


In [7]:
# Keep gold cards only (the rare label appears as both 'Gold - Rare' and
# 'gold rare'), label cards without a revision as 'Normal', and drop goalkeepers.
df_p = (
    df_p
    .loc[lambda d: d.quality.isin(['Gold - Rare', 'Gold', 'gold rare'])]
    .assign(revision=lambda d: d.revision.fillna('Normal'))
    .loc[lambda d: d.position != 'GK']
)

# Overwrite the saved player table with the filtered, updated version.
df_p.to_csv('data/player_database.csv')

In [12]:
# Collect the prices for every player in the updated table. Slow: the logged
# run below took roughly 100 seconds per 200 players.
df_prices = df_fetch_price(df_p)

Completed 1 players. Time elapsed: 0 seconds. Approx. 0 seconds left.
Completed 200 players. Time elapsed: 100 seconds. Approx. 1834 seconds left.
Completed 400 players. Time elapsed: 202 seconds. Approx. 1742 seconds left.
Completed 600 players. Time elapsed: 303 seconds. Approx. 1641 seconds left.
Completed 800 players. Time elapsed: 404 seconds. Approx. 1525 seconds left.
Completed 1000 players. Time elapsed: 508 seconds. Approx. 1482 seconds left.
Completed 1200 players. Time elapsed: 612 seconds. Approx. 1365 seconds left.
Completed 1400 players. Time elapsed: 714 seconds. Approx. 1250 seconds left.
Completed 1600 players. Time elapsed: 815 seconds. Approx. 1125 seconds left.
Completed 1800 players. Time elapsed: 917 seconds. Approx. 1046 seconds left.
Completed 2000 players. Time elapsed: 1022 seconds. Approx. 962 seconds left.
Completed 2200 players. Time elapsed: 1125 seconds. Approx. 842 seconds left.
Completed 2400 players. Time elapsed: 1227 seconds. Approx. 740 seconds left

In [14]:
# Peek at the last two rows of the price table.
df_prices.tail(n=2)

Unnamed: 0,player_name,quality,revision,overall,club,league,nationality,position,age,height,weight,intl_rep,added_date,pace,pace_acceleration,pace_sprint_speed,dribbling,drib_agility,drib_balance,drib_reactions,drib_ball_control,drib_dribbling,drib_composure,shooting,shoot_positioning,shoot_finishing,shoot_shot_power,shoot_long_shots,shoot_volleys,shoot_penalties,passing,pass_vision,pass_crossing,pass_free_kick,pass_short,pass_long,pass_curve,defending,def_interceptions,def_heading,def_marking,def_stand_tackle,def_slid_tackle,physicality,phys_jumping,phys_stamina,phys_strength,phys_aggression,pref_foot,att_workrate,def_workrate,weak_foot,skill_moves,resource_id,num_games,avg_goals,avg_assists,date,price
631539,Carlos Vela,gold rare,,88,Los Angeles Football Club,Major League Soccer,Mexico,RW,30,177,77,3,2019-04-09,88.0,86,89,89.0,84,84,84,90,91,86,89.0,91,91,82,90,92,81,88.0,92,90,81,92,72,90,39.0,45,85,37,26,17,74.0,77,76,76,64,Left,High,Low,2,4,117609928,12,0.25,0.08,2019-04-08,0
631540,Karim Benzema,gold rare,CL SBC,90,Real Madrid,LaLiga Santander,France,ST,31,185,81,4,2019-04-09,84.0,84,84,90.0,84,67,92,94,90,87,89.0,95,87,91,84,87,92,86.0,95,84,82,92,62,89,35.0,24,85,51,16,13,86.0,80,82,91,74,Right,Med,Low,4,4,134382881,14,0.93,0.57,2019-04-08,0


In [15]:
# Persist the refreshed price table (overwrites any existing file at this path).
df_prices.to_csv('data/prices_database.csv')