In [3]:
import pandas as pd
import numpy as np

class FeatureFileGenerator(object):
    def __init__(self):
        np.random.seed(0) # seed so we always get same answer
    
    def new_file(self, filename):
        t_df = pd.read_csv('data/TourneyCompactResults.csv')
        t_df = self.randomize_teams(t_df)
        t_df['TourneyGame'] = np.full(t_df.shape[0], 1)
                
        s_df = pd.read_csv('data/RegularSeasonCompactResults.csv')
        s_df = self.randomize_teams(s_df)
        s_df['TourneyGame'] = np.full(s_df.shape[0], 0)
        
        df = pd.concat([t_df, s_df], ignore_index=True)
        df.to_csv(filename, encoding='ascii')
        
    def randomize_teams(self, df):
        df = df[df['Season'] > 2002]
        
        Ngames = df.shape[0]
        df['team0Win'] = np.random.randint(2,size=Ngames)
        
        for feature in [self.team0, self.team1]:
            df[feature.__name__] = df.apply(feature, axis=1)
        
        return df

    def team0(self, game):
        if game['team0Win'] == 1:
            return game['Wteam']
        else:
            return game['Lteam']

    def team1(self, game):
        if game['team0Win'] == 1:
            return game['Lteam']
        else:
            return game['Wteam']

In [4]:
import ipyparallel as ipp
c = ipp.Client()
view = c.direct_view()

In [5]:
%%px --local

import numpy as np
import pandas as pd

class FeatureAdder(object):
    def __init__(self, df):
        self.df = df
        
    def feature01(self, feature):
        def mod(game):
            f0 = team_features_df.loc[(team_features_df['Team_Id'] == game['team0']) & in_season[game['Season']], feature.__name__].values[0]
            f1 = team_features_df.loc[(team_features_df['Team_Id'] == game['team1']) & in_season[game['Season']], feature.__name__].values[0]
            return pd.Series({feature.__name__+'0':f0, feature.__name__+'1':f1})
        return mod
     
    def massey(self):
        self.massey_init()
        self.df = pd.concat([self.df, self.df.apply(self.massey_game, axis=1)], axis=1)

    def massey_init(self):
        self.massey_df = pd.read_csv('data/massey_ordinals_2003-2015.csv')
        
        # Create dictionaries for boolean masks

        self.massey_seasons = self.massey_df['season'].unique()
        self.in_massey_season = dict(zip(self.massey_seasons, [self.massey_df['season'] == i for i in self.massey_seasons]))

        self.massey_day_nums = self.massey_df['rating_day_num'].unique()
        self.is_massey_day_num = dict(zip(self.massey_day_nums, [self.massey_df['rating_day_num'] == i for i in self.massey_day_nums]))

        self.massey_teams = self.massey_df['team'].unique()
        self.is_massey_team = dict(zip(self.massey_teams, [self.massey_df['team'] == i for i in self.massey_teams]))

        self.massey_systems = self.massey_df['sys_name'].unique()
        self.is_massey_system = dict(zip(self.massey_systems, [self.massey_df['sys_name'] == i for i in self.massey_systems]))

    def massey_game(self, game): # add here any features specific to a game, i.e., how do teams interact       
        try:
            daynum = game['Daynum']
        except:
            daynum = 134 # submission file games don't have daynum, but are all after day 133 (last day before tourney)

        latest_day = np.max(self.massey_day_nums[self.massey_day_nums < daynum]) # only take ordinals from dates that happened before the game, and take latest one of those
        ordinals0 = [self.getOrdinal(game['Season'], self.massey_systems[i], latest_day, game['team0']) for i in range(len(self.massey_systems))]
        ordinals1 = [self.getOrdinal(game['Season'], self.massey_systems[i], latest_day, game['team1']) for i in range(len(self.massey_systems))]
        f0 = pd.Series(ordinals0, index=[self.massey_systems[i]+'0' for i in range(len(self.massey_systems))])   
        f1 = pd.Series(ordinals1, index=[self.massey_systems[i]+'1' for i in range(len(self.massey_systems))])   
        return pd.concat([f0,f1])
    
    def getOrdinal(self, season, system, day, team):
        try:
            ordinal = np.float64(self.massey_df.loc[self.in_massey_season[season] & self.is_massey_system[system] & self.is_massey_day_num[day] & self.is_massey_team[team], 'orank'].values[0])
        except IndexError:
            ordinal = np.nan
        return ordinal

def add_features(df):
    dfa = FeatureAdder(df)
    dfa.massey()
    return dfa.df

In [93]:
f = FeatureFileGenerator()
f.new_file('data/fulldata.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
df = pd.read_csv('data/fulldata.csv', index_col=0)

In [15]:
df.shape

(66719, 12)

In [19]:
testdf = np.array_split(df,30000)[-1]

In [20]:
testdf.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,team0Win,team0,team1,TourneyGame
66717,2015,132,1433,71,1173,65,N,0,1,1433,1173,0.0
66718,2015,132,1458,80,1277,69,N,1,0,1277,1458,0.0


In [21]:
Ncores=2
dfsplit = np.array_split(testdf,Ncores)
        
import os
view.map_sync(os.chdir, [os.getcwd()]*Ncores)
c[:].apply_sync(os.getcwd)

['/Users/dtamayo/Documents/NCAAbasketball',
 '/Users/dtamayo/Documents/NCAAbasketball']

In [22]:
dfs = view.map_sync(add_features, dfsplit)

In [14]:
dfnew = pd.concat([df for df in dfs])
dfnew.tail()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,team0Win,team0,...,STF1,WMR1,PPR1,MPI1,STS1,UPS1,SPR1,MvG1,TRK1,BWE1
2,2003,136,1113,84,1272,71,N,0,1,1113,...,,,,,,,,,,
3,2003,136,1141,79,1166,73,N,0,0,1166,...,,,,,,,,,,
4,2003,136,1143,76,1301,74,N,1,1,1143,...,,,,,,,,,,
5,2003,136,1163,58,1140,53,N,0,1,1163,...,,,,,,,,,,
6,2003,136,1181,67,1161,57,N,0,1,1181,...,,,,,,,,,,


In [101]:
dfnew = pd.concat([df for df in dfs])
dfnew.to_csv('data/test.csv', encoding='ascii')