# Problem

- data does not contain year
- needs cleaning

### Solution

- Create function that does:
    1. create a column for given year
    2. create a full_year column that appends date and year column
    3. convert full_year column to datetime
    4. split result column with two colums given score of each team
    5. organize dataframe
    6. create winner column based on score of teams
    7. set and sort index to full_date column 

-  dataframes will be loaded one at a time given year

In [1]:
import pandas as pd
import numpy as np
import glob, os
import warnings
warnings.filterwarnings('ignore')

In [2]:
ls data/MLS/HOU/plo/

[0m[01;32mHOU_11_plo.csv[0m*  [01;32mHOU_12_plo.csv[0m*  [01;32mHOU_13_plo.csv[0m*  [01;32mHOU_17_plo.csv[0m*


In [3]:
plo_11 = 'data/MLS/HOU/plo/HOU_11_plo.csv'
plo_12 = 'data/MLS/HOU/plo/HOU_12_plo.csv'
plo_13 = 'data/MLS/HOU/plo/HOU_13_plo.csv'
plo_17 = 'data/MLS/HOU/plo/HOU_17_plo.csv'

## create dataframes per year

In [4]:
col = ['day','date','home_team','result','away_team']

df11 = pd.read_csv(plo_11, names=col)
df12 = pd.read_csv(plo_12, names=col)
df13 = pd.read_csv(plo_13, names=col)
df17 = pd.read_csv(plo_17, names=col)

## Create function
- joins dfs

In [5]:
df11['full_date'] = df11.date + '/2011'

df12['full_date'] = df12.date + '/2012'

df13['full_date'] = df13.date + '/2013'

df17['full_date'] = df17.date + '/2017'

In [6]:
df11['full_date'] =  pd.to_datetime(df11['full_date'])  # convert full_date to datetime
df12['full_date'] =  pd.to_datetime(df12['full_date'])  # convert full_date to datetime
df13['full_date'] =  pd.to_datetime(df13['full_date'])  # convert full_date to datetime
df17['full_date'] =  pd.to_datetime(df17['full_date'])  # convert full_date to datetime

## combine df
2010 - 2017

In [7]:
frames = [df11,df12,df13,df17]
dynamo_plo = pd.concat(frames)

In [8]:
dynamo_plo.head()

Unnamed: 0,day,date,home_team,result,away_team,full_date
0,Sun,10/30,Philadelphia Union,1-2,Houston Dynamo,2011-10-30
1,Thu,11/3,Houston Dynamo,1-0,Philadelphia Union,2011-11-03
2,Sun,11/6,Sporting Kansas City,0-2,Houston Dynamo,2011-11-06
3,Sun,11/20,Los Angeles Galaxy,1-0,Houston Dynamo,2011-11-20
0,Wed,10/31,Chicago Fire,1-2,Houston Dynamo,2012-10-31


In [9]:
dynamo_plo.apply(lambda x: sum(x.isnull()))

day          0
date         0
home_team    0
result       0
away_team    0
full_date    0
dtype: int64

In [10]:
dynamo_plo['home_score']= dynamo_plo['result'].str.split('-').apply(lambda x: x[0])  # split score vals
dynamo_plo['away_score']= dynamo_plo['result'].str.split('-').apply(lambda x: x[1])  # split score vals

## create a season col

In [11]:
dynamo_plo['season'] = 'plo'

In [12]:
dynamo_plo.head()

Unnamed: 0,day,date,home_team,result,away_team,full_date,home_score,away_score,season
0,Sun,10/30,Philadelphia Union,1-2,Houston Dynamo,2011-10-30,1,2,plo
1,Thu,11/3,Houston Dynamo,1-0,Philadelphia Union,2011-11-03,1,0,plo
2,Sun,11/6,Sporting Kansas City,0-2,Houston Dynamo,2011-11-06,0,2,plo
3,Sun,11/20,Los Angeles Galaxy,1-0,Houston Dynamo,2011-11-20,1,0,plo
0,Wed,10/31,Chicago Fire,1-2,Houston Dynamo,2012-10-31,1,2,plo


In [13]:
print(list(dynamo_plo.columns))

['day', 'date', 'home_team', 'result', 'away_team', 'full_date', 'home_score', 'away_score', 'season']


In [14]:
dynamo_plo =dynamo_plo[[ 'full_date','home_team','home_score','away_team', 'away_score', 'season']]

In [15]:
dynamo_plo = dynamo_plo.set_index('full_date').sort_index(ascending=True)  # set full_date as index

In [16]:
dynamo_plo.head()

Unnamed: 0_level_0,home_team,home_score,away_team,away_score,season
full_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-10-30,Philadelphia Union,1,Houston Dynamo,2,plo
2011-11-03,Houston Dynamo,1,Philadelphia Union,0,plo
2011-11-06,Sporting Kansas City,0,Houston Dynamo,2,plo
2011-11-20,Los Angeles Galaxy,1,Houston Dynamo,0,plo
2012-10-31,Chicago Fire,1,Houston Dynamo,2,plo


In [21]:
len(dynamo_plo.home_team.unique())

16

In [22]:
dynamo_plo.home_team.unique()

array([' Philadelphia Union  ', 'Houston Dynamo ',
       'Sporting Kansas City ', ' Los Angeles Galaxy   ', 'Chicago Fire ',
       'Houston Dynamo   ', 'Sporting Kansas City  ', 'Houston Dynamo  ',
       'Washington D.C. United    ', ' Los Angeles Galaxy          ',
       ' Houston Dynamo   ', 'Houston Dynamo       ',
       ' New York Red Bull', 'Sporting Kansas City     ',
       'Portland Tumbers', 'Seattle'], dtype=object)

In [26]:
dynamo_plo.home_team = dynamo_plo.home_team.str.strip()
len(dynamo_plo.home_team.unique())

9

In [27]:
dynamo_plo.home_team.unique()

array(['Philadelphia Union', 'Houston Dynamo', 'Sporting Kansas City',
       'Los Angeles Galaxy', 'Chicago Fire', 'Washington D.C. United',
       'New York Red Bull', 'Portland Tumbers', 'Seattle'], dtype=object)

In [28]:
len(dynamo_plo.away_team.unique())

11

In [29]:
dynamo_plo.away_team.unique()

array(['Houston Dynamo', 'Philadelphia Union', ' Houston Dynamo',
       'Sporting Kansas City', ' Washington D.C. United',
       'Montreal Impact', 'New York Red Bulls', 'Kansas City',
       'Portland Tumbers', 'Houston Dynamo ', 'Seattle'], dtype=object)

In [30]:
dynamo_plo.away_team = dynamo_plo.away_team.str.strip()
len(dynamo_plo.away_team.unique())

9

In [31]:
dynamo_plo.away_team.unique()

array(['Houston Dynamo', 'Philadelphia Union', 'Sporting Kansas City',
       'Washington D.C. United', 'Montreal Impact', 'New York Red Bulls',
       'Kansas City', 'Portland Tumbers', 'Seattle'], dtype=object)

## Save

In [32]:
dynamo_plo.to_csv('data/clean/dynamo_plo.csv')

In [37]:
ls data/clean

[0m[01;32mastros_clean.csv[0m*      [01;32mastros_clean_reg.csv[0m*  [01;32mdynamo_reg.csv[0m*  [01;32mUH_clean.csv[0m*
[01;32mastros_clean_plo.csv[0m*  [01;32mdynamo_plo.csv[0m*        [01;32mRice_clean.csv[0m*


## Join plo and reg season

In [38]:
path ='data/clean/dynamo_reg.csv'

In [39]:
dynamo_reg = pd.read_csv(path,index_col='full_date')
dynamo_reg.head()

Unnamed: 0_level_0,home_team,home_score,away_team,away_score,winner,season
full_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-03-27,FC Dallas,1,HOU,1,HOU,reg
2010-04-01,Houston Dynamo,2,Real Salt Lake,1,HOU,reg
2010-04-10,Houston Dynamo,0,LA Galaxy,2,LA Galaxy,reg
2010-04-17,Houston Dynamo,3,Chivas USA,0,HOU,reg
2010-04-24,Chicago,2,HOU,0,Chicago,reg


In [42]:
frames = [dynamo_regdynamo_plo]

dynamo = pd.concat(frames)

NameError: name 'dynamo_regdynamo_plo' is not defined

In [50]:

dynamo.to_csv('data/clean/dynamo_clean.csv')

In [51]:
dynamo.head()

Unnamed: 0_level_0,away_score,away_team,home_score,home_team,season,winner
full_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-10-30 00:00:00,2,Houston Dynamo,1,Philadelphia Union,plo,
2011-11-03 00:00:00,0,Philadelphia Union,1,Houston Dynamo,plo,
2011-11-06 00:00:00,2,Houston Dynamo,0,Sporting Kansas City,plo,
2011-11-20 00:00:00,0,Houston Dynamo,1,Los Angeles Galaxy,plo,
2012-10-31 00:00:00,2,Houston Dynamo,1,Chicago Fire,plo,


In [53]:
ls data/clean

[0m[01;32mastros_clean.csv[0m*      [01;32mastros_clean_reg.csv[0m*  [01;32mdynamo_plo.csv[0m*  [01;32mRice_clean.csv[0m*
[01;32mastros_clean_plo.csv[0m*  [01;32mdynamo_clean.csv[0m*      [01;32mdynamo_reg.csv[0m*  [01;32mUH_clean.csv[0m*
