In [156]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import neighbors
from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

# Globally silences pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides a warning rather than fixing its cause; taking an explicit
# .copy() of any slice that is later assigned to is the safer alternative.
pd.options.mode.chained_assignment = None 

In [157]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [158]:
def parse_df(df):
    """Return a copy of ``df`` with a ``distance_from_center`` column added.

    The new column is the Euclidean norm of (``XCoord``, ``YCoord``), i.e. the
    distance of each point from the origin.

    The input frame is not modified: a new DataFrame is returned, so the
    calling cell can be re-run safely and previously displayed frames do not
    silently change.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``XCoord`` and ``YCoord`` columns.

    Returns
    -------
    pandas.DataFrame
        All original columns plus ``distance_from_center`` (appended last, or
        overwritten in place if it already exists).

    Raises
    ------
    KeyError
        If ``XCoord`` or ``YCoord`` is missing.
    """
    distance = (df['XCoord']**2 + df['YCoord']**2)**(.5)

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(distance_from_center=distance)

In [159]:
# Add the engineered distance feature to both data sets.
train, test = map(parse_df, (train, test))

In [160]:
# Preview the first rows to confirm the distance_from_center column is present.
train.head()

Unnamed: 0,ID,XCoord,YCoord,Competitor,distance_from_center
0,111,0.943354,-0.184819,Bob,0.961288
1,92,0.121557,-0.927656,Bob,0.935586
2,20,0.818485,0.375437,Bob,0.900483
3,75,0.867462,-0.159851,Sue,0.882067
4,10,-0.817321,-0.373419,Kate,0.898585


In [161]:
# Assign every training row to one of 5 folds (ids 1..5) of near-equal size.
# The ids are shuffled with a fixed seed so the fold split, and every number
# derived from it further down, is the same on each Restart & Run All.
# Row positions are used instead of index labels so the split does not depend
# on the frame having a default 0..n-1 index.
fold_rng = np.random.RandomState(0)
train['FoldId'] = fold_rng.permutation(np.arange(len(train)) % 5 + 1)

In [162]:
def make_meta(df):
    """Return a copy of ``df`` extended with empty ``M1`` and ``M2`` columns.

    Both new columns are filled with NaN; the input frame itself is left
    unchanged.
    """
    meta = df.copy()
    for placeholder in ('M1', 'M2'):
        meta[placeholder] = np.nan
    return meta

In [163]:
# Working copies of both data sets, each with empty M1 / M2 placeholder columns.
train_meta, test_meta = (make_meta(frame) for frame in (train, test))

In [164]:
# Encode competitor names as integer class codes: the code is the vocabulary
# column of the token that CountVectorizer matched in each name.
v = CountVectorizer()


def _token_codes(onehot):
    """Collapse a sparse token-count matrix to one integer code per row.

    Returns a flat 1-D integer array (the sparse ``argmax`` result can be an
    (n, 1) ``np.matrix``, which is flattened here before it is stored in a
    column). Rows that matched no vocabulary token get -1 instead of being
    silently reported as class 0, which is what ``argmax`` returns for an
    all-zero row.
    """
    codes = np.asarray(onehot.argmax(axis=1)).ravel()
    hits = np.asarray(onehot.sum(axis=1)).ravel()
    return np.where(hits > 0, codes, -1)


# Read the names from the untouched source frames (train / test) rather than
# from the column being overwritten, so this cell can be re-run safely.
train_meta['Competitor'] = _token_codes(v.fit_transform(train['Competitor']))
test_meta['Competitor'] = _token_codes(v.transform(test['Competitor']))

In [165]:
# Preview: Competitor should now hold integer codes, and M1 / M2 are still empty.
train_meta.head()

Unnamed: 0,ID,XCoord,YCoord,Competitor,distance_from_center,FoldId,M1,M2
0,111,0.943354,-0.184819,0,0.961288,2,,
1,92,0.121557,-0.927656,0,0.935586,1,,
2,20,0.818485,0.375437,0,0.900483,4,,
3,75,0.867462,-0.159851,3,0.882067,3,,
4,10,-0.817321,-0.373419,1,0.898585,2,,


In [166]:
# Base learners whose predictions become the meta-features M1 and M2.
# Fixed seeds keep the out-of-fold predictions reproducible between runs.
rf_clf = ensemble.RandomForestClassifier(random_state=0)
gb_clf = ensemble.GradientBoostingClassifier(random_state=0)
# NOTE(review): nn_clf is created here but never fitted or used in this cell.
nn_clf = neighbors.KNeighborsClassifier(n_neighbors=1)

base_features = ['XCoord', 'YCoord', 'distance_from_center']
folds_with_predictions = []

# Out-of-fold prediction: for each fold, fit both base learners on the other
# folds and predict the held-out one, so every training row receives M1 / M2
# values from models that never saw that row.
# Fold ids come from the data instead of a hard-coded range(1, 6).
for i in sorted(train_meta['FoldId'].unique()):
    is_holdout = train_meta['FoldId'] == i
    train_fold = train_meta.loc[~is_holdout]
    # Explicit copy: the prediction columns are written into this frame below,
    # and writing into a slice of train_meta is what triggers pandas'
    # chained-assignment warning.
    test_fold = train_meta.loc[is_holdout].copy()

    X_train = train_fold[base_features]
    y_train = train_fold['Competitor']
    X_test = test_fold[base_features]
    # Not used inside this loop; retained so the notebook namespace is unchanged.
    y_test = test_fold['Competitor']

    rf_clf.fit(X_train, y_train)
    gb_clf.fit(X_train, y_train)

    M1 = rf_clf.predict(X_test)
    M2 = gb_clf.predict(X_test)

    test_fold['M1'] = M1
    test_fold['M2'] = M2
    folds_with_predictions.append(test_fold)

In [167]:
# Stitch the per-fold frames back into a single training frame (rows end up
# grouped by fold, in the order the folds were processed).
stacked_folds = pd.concat(folds_with_predictions)

# Fail loudly if the folds do not cover the training set exactly once.
# Skipping the reassignment silently would leave M1 / M2 as NaN and the
# meta-learner would then be trained without any stacked features.
if stacked_folds.shape != train_meta.shape:
    raise ValueError(
        'Out-of-fold frames have shape {} but train_meta has shape {}; '
        'check that every row belongs to exactly one processed fold.'.format(
            stacked_folds.shape, train_meta.shape))

train_meta = stacked_folds

In [168]:
# Preview after stacking: M1 / M2 are filled in, and rows are now ordered by fold
# because the per-fold frames were concatenated in fold order.
train_meta.head()

Unnamed: 0,ID,XCoord,YCoord,Competitor,distance_from_center,FoldId,M1,M2
1,92,0.121557,-0.927656,0,0.935586,1,0,0
6,42,0.573533,-0.207807,0,0.61002,1,3,1
7,177,0.757089,0.254411,3,0.798692,1,3,3
30,110,-0.243287,0.597556,1,0.645183,1,1,1
31,132,0.522435,0.663025,3,0.844121,1,3,3


In [169]:
# Second-stage (meta) model: logistic regression with scikit-learn's default settings.
final_clf = linear_model.LogisticRegression()

In [170]:
# Inputs for the meta-learner: the raw spatial features followed by the
# one-hot encoded base-model predictions (M1_* then M2_* columns).
raw_features = train_meta[['XCoord', 'YCoord', 'distance_from_center']]
prediction_dummies = [pd.get_dummies(train_meta[col], prefix=col)
                      for col in ('M1', 'M2')]
X_train = pd.concat([raw_features] + prediction_dummies, axis=1)
y_train = train_meta['Competitor']

# 3-fold cross-validated accuracy of the meta-learner on the stacked features.
cv_acc = model_selection.cross_val_score(
    final_clf, X_train, y_train, cv=3, scoring='accuracy')

print(cv_acc.mean())

0.8063725490196078
