In [1]:
import pandas as pd
import numpy as np


# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

# other
from datetime import datetime

# Time Series

In [2]:
df = pd.read_csv('data/prices_database.csv', index_col='Unnamed: 0', parse_dates=['added_date', 'date'])
df.head(2)

Unnamed: 0,player_name,quality,revision,overall,club,league,nationality,position,age,height,weight,intl_rep,added_date,pace,pace_acceleration,pace_sprint_speed,dribbling,drib_agility,drib_balance,drib_reactions,drib_ball_control,drib_dribbling,drib_composure,shooting,shoot_positioning,shoot_finishing,shoot_shot_power,shoot_long_shots,shoot_volleys,shoot_penalties,passing,pass_vision,pass_crossing,pass_free_kick,pass_short,pass_long,pass_curve,defending,def_interceptions,def_heading,def_marking,def_stand_tackle,def_slid_tackle,physicality,phys_jumping,phys_stamina,phys_strength,phys_aggression,pref_foot,att_workrate,def_workrate,weak_foot,skill_moves,resource_id,num_games,avg_goals,avg_assists,date,price
0,Pelé,Gold - Rare,Icon,98,Icons,Icons,Brazil,CAM,77,173,70,0,2018-09-19,95.0,95,95,96.0,94,93,98,97,96,98,96.0,97,98,94,94,95,93,93.0,97,90,89,96,88,89,60.0,67,94,55,53,49,76.0,88,86,76,59,Right,High,Med,4,5,237067,12719,0.69,0.41,2018-09-20,0
1,Pelé,Gold - Rare,Icon,98,Icons,Icons,Brazil,CAM,77,173,70,0,2018-09-19,95.0,95,95,96.0,94,93,98,97,96,98,96.0,97,98,94,94,95,93,93.0,97,90,89,96,88,89,60.0,67,94,55,53,49,76.0,88,86,76,59,Right,High,Med,4,5,237067,12719,0.69,0.41,2018-09-21,0


### Data Processing

We need to construct the following features:
- available: Binary variable indicating the availability of the card on that given day
- substitute: Binary variable indicating whether a new card for that player is available during that day. (SBC + Packs)
- days: # of days available
- promo: binary indicating if there's a promotion on that day. 
- weekday: 0-6 indicating the day of the week
- month: 1-12
- source: Whether the card was an SBC, Weekly OBJ or in packs.

**Days**:

In [3]:
df['days'] = (df['date'] - df['added_date']).dt.days

Quick to note that some players have prices for the day before they were officially released, implying there's a mistake. Further investigation reveals that these should be offset to the next day instead. 

In [4]:
offset_resources = df[df.days == -1].resource_id.unique()       # get resource id of users who match this 

df['date'] = np.where(df.resource_id.isin(offset_resources),    # remap the date variable
                      df.date +pd.DateOffset(days=1), df.date)
df['days'] = (df.date - df.added_date).dt.days                  # reassign the days variable

df = df[df.days >= 0]                                           # remove observations that have days < 0

**Weekday**:

In [5]:
df['weekday'] = df.date.dt.weekday                              # get the day of the week

**Month**:

In [7]:
df['month'] = df.date.dt.month

**Promo**:

Promotional periods are the following:
- TOTS: 10/5-21/6
- Icon Release: 5/4-15/4
- Fut Bday: 22/3-30/3
- Carniball: 8/3-16/3
- Rating Refresh: 15/2-24/2
- Headliners: 1/2-8/2
- Future Stars: 18/1-25/1
- TOTY: 7/1-14/1
- Futmas: 14/12-24/12
- TOTGS: 7/12-14/12
- Black Friday: 23/11-26/11
- RTTF: 9/11-16/11
- Halloween: 19/10-26/10

In [6]:
promos = [[datetime(2019, 5, 10), datetime(2019, 6, 21)],       # TOTS
          [datetime(2019, 4, 5), datetime(2019, 4, 15)],        # icon release
          [datetime(2019, 3, 22), datetime(2019, 3, 30)],       # fut bday
          [datetime(2019, 3, 8), datetime(2019, 3, 16)],        # carniball
          [datetime(2019, 2, 15), datetime(2019, 2, 24)],       # rating refresh
          [datetime(2019, 2, 1), datetime(2019, 2, 8)],         # headliners
          [datetime(2019, 1, 18), datetime(2019, 1, 25)],       # ffs
          [datetime(2019, 1, 7), datetime(2019, 1, 14)],        # TOTY
          [datetime(2018, 12, 14), datetime(2018, 12, 24)],     # futmas
          [datetime(2018, 12, 7), datetime(2018, 12, 14)],      # totgs
          [datetime(2018, 11, 23), datetime(2018, 11, 26)],     # black friday
          [datetime(2018, 11, 9), datetime(2018, 11, 16)],      # rttf
          [datetime(2018, 10, 19), datetime(2018, 10, 26)]]     # halloween

def promo_assignment(ds):
    date = pd.to_datetime(ds)
    promo = 0
    for p in promos:
        if (p[0] <= date) & (p[1] >= date):
            promo = 1
            break
    return promo

df['promo'] = df['date'].apply(promo_assignment)                # apply the promo assignment

**Card Source**:

In [25]:
df.revision.fillna('MLS POTM', inplace=True)

In [69]:
df['source'] = 'packs'
df['source'] = np.where(df.revision.str.contains('Normal') | df.revision.str.contains('Icon') | df.revision.str.contains('Refresh'),
                        'base', df.source)                                                                     # Base cards
df['source'] = np.where(df.revision.str.contains('SBC') | df.revision.str.contains('POTM'), 'sbc', df.source)  # SBC
df['source'] = np.where(df.revision.str.contains('Loan'), 'loan', df.source)                                   # Loans
df['source'] = np.where(df.revision.str.contains('Ob'), 'objective', df.source)                                # Objectives

df = df[df.source != 'loan']                                                                                   # remove loan cards

**Availability**:
- Pack cards: will be labeled as available for a week after being added to the database. This isn't always accurate but an overall approximation.
- SBC: Availability will also be set for a week - again, an approximation. 
- Objectives: Weekly availability
- Winter Upgrades + Base Cards: Available throughout except periods where they have special cards in packs. Winter upgrades replace base cards. 

In [70]:
df['available'] = np.nan

# special cards in packs, sbcs and objectives
df['available'] = np.where(((df.source == 'packs') | (df.source == 'sbc') | (df.source == 'objective')) & (df.date <= df.added_date + pd.DateOffset(7)),
                           1, df.available)

In [92]:
def in_packs(row):
    if row['source'] == 'base':
        p_name = row['player_name']
        age = row['age']
        height = row['height']
        weight = row['weight']
        intl_rep = row['intl_rep']
        date = row['date']
        if df[(df.player_name == p_name) & (df.age == age) 
               & (df.height == height) & (df.weight == weight) 
               & (df.intl_rep == intl_rep) & (df.date == date) 
               & (df.source == 'packs') & (df.available == 1)].shape[0] > 0:
            return 0
        else:
            return 1
    else:
        return row['available']

In [94]:
df.apply(in_packs, axis=1)

KeyboardInterrupt: 

Might be better to create a cross product / leftjoin of basecards and packscards and then mark availability accordingly. 

**Substitute**:

In [8]:
df.revision.unique()

array(['Icon', 'Normal', 'IF', 'OTW', 'SBC', 'Europa Base',
       'Swap Deal Reward SBC', 'PL POTM', 'Flashback SBC', 'Halloween',
       'Bundes POTM', 'Halloween SBC', 'SIF', 'UCL LIVE', 'UEL LIVE',
       'Award Winner', 'UEL LIVE SBC', 'TIF', 'TOTGS', 'Europa TOTGS',
       'CL TOTT SBC', 'Hero', 'FUTmas SBC', 'FUTmas', 'TOTY',
       'TOTY Nominee Loan', 'TOTY Nominee SBC', 'TOTY Nominee', 'CL',
       'FUT Future Stars', 'SBC - Future Stars', 'Weekly Obj- FFS',
       'Weekly Obj- FS', 'MOTM', 'Headliners SBC', 'Headliners',
       'Headliners- Wk. Obj.', 'Premium SBC', 'Winter Refresh',
       'Europa MOTM', 'CL MOTM', 'CL SBC', 'Europa SBC', 'Carniball',
       'Carniball-SBC', 'Carniball-Wkly Obj', 'Carniball SBC', 'FIF',
       'FUT Birthday SBC', 'FUT Birthday', 'FUT Birthday Wkly Ob',
       'Award Winner Wkly Ob', nan], dtype=object)

In [9]:
df[df.revision.isnull()]

Unnamed: 0,player_name,quality,revision,overall,club,league,nationality,position,age,height,weight,intl_rep,added_date,pace,pace_acceleration,pace_sprint_speed,dribbling,drib_agility,drib_balance,drib_reactions,drib_ball_control,drib_dribbling,drib_composure,shooting,shoot_positioning,shoot_finishing,shoot_shot_power,shoot_long_shots,shoot_volleys,shoot_penalties,passing,pass_vision,pass_crossing,pass_free_kick,pass_short,pass_long,pass_curve,defending,def_interceptions,def_heading,def_marking,def_stand_tackle,def_slid_tackle,physicality,phys_jumping,phys_stamina,phys_strength,phys_aggression,pref_foot,att_workrate,def_workrate,weak_foot,skill_moves,resource_id,num_games,avg_goals,avg_assists,date,price,days,weekday,promo,month
631539,Carlos Vela,gold rare,,88,Los Angeles Football Club,Major League Soccer,Mexico,RW,30,177,77,3,2019-04-09,88.0,86,89,89.0,84,84,84,90,91,86,89.0,91,91,82,90,92,81,88.0,92,90,81,92,72,90,39.0,45,85,37,26,17,74.0,77,76,76,64,Left,High,Low,2,4,117609928,12,0.25,0.08,2019-04-09,0,0,1,1,4


In [10]:
df.date.max()

Timestamp('2019-04-09 00:00:00')

In [12]:
sub.head()

Unnamed: 0,player_name,age,height,weight,intl_rep,resource_id,added_date
0,Aaron Mooy,28,173,72,1,50526606,[2018-11-28T00:00:00.000000000]
1,Aarón Martín,21,180,72,1,236295,[2018-09-19T00:00:00.000000000]
2,Abate,31,180,73,2,168312,[2018-09-19T00:00:00.000000000]
3,Abderrazak Hamdallah,28,179,81,1,50545409,[2018-12-19T00:00:00.000000000]
4,Abdoulaye Doucouré,26,182,76,2,50539783,[2019-01-15T00:00:00.000000000]


In [80]:
df.groupby(['player_name', 'age', 'height', 'weight', 'intl_rep']).added_date.unique().reset_index().head()

Unnamed: 0,player_name,age,height,weight,intl_rep,added_date
0,Aaron Mooy,28,173,72,1,[2018-11-28T00:00:00.000000000]
1,Aaron Ramsey,28,183,76,3,[2019-04-06T00:00:00.000000000]
2,Aarón Martín,21,180,72,1,[2018-09-19T00:00:00.000000000]
3,Abate,31,180,73,2,[2018-09-19T00:00:00.000000000]
4,Abderrazak Hamdallah,28,179,81,1,"[2018-12-19T00:00:00.000000000, 2019-01-30T00:..."


In [11]:
sub = df.groupby(['player_name', 'age', 'height', 'weight', 'intl_rep', 'resource_id']).added_date.unique().reset_index()

In [13]:
max_cards = df.groupby(['player_name', 'age', 'height', 'weight', 'intl_rep']).resource_id.nunique().max()

In [14]:
max_cards

6

In [None]:
df.groupby(['player_name', 'age', 'height', 'weight', 'intl_rep']).resource_id.nunique()[df.groupby(['player_name', 'age', 'height', 'weight', 'intl_rep']).resource_id.nunique() == 6]

In [None]:
sub.added_date.shift(-2)