In [54]:
import numpy as np
import pandas as pd

In [55]:
files = []
users_i = [0 for i in range(29)]
for i in range(29):
    tempdf = pd.read_csv('data/User{}.csv'.format(i+1), delimiter = ',')
    tempdf['User'] = i
    files.append(tempdf)
    users_i[i] = len(tempdf)
df = pd.concat(files, ignore_index = True)

print(users_i)

[100, 100, 91, 100, 35, 100, 100, 100, 100, 100, 31, 100, 100, 100, 39, 70, 72, 100, 38, 64, 79, 100, 37, 38, 100, 33, 100, 54, 46]


In [56]:
df.dtypes

Game Completed Date            object
Team                           object
Outcome                        object
Task Completed                 object
All Tasks Completed            object
Murdered                       object
Imposter Kills                 object
Game Length                    object
Ejected                        object
Sabotages Fixed               float64
Time to complete all tasks     object
Rank Change                    object
Region/Game Code               object
User                            int64
dtype: object

In [57]:
df[['Task Completed', 'Imposter Kills']] = df[['Task Completed', 'Imposter Kills']].apply(pd.to_numeric, errors = 'coerce')


In [58]:
df['Region'] = df['Region/Game Code'].str.extract(r'^(.*?)/')
df['Region'].value_counts()


NA         1436
Europe      791
Name: Region, dtype: int64

In [59]:
df['Game Length'] = df['Game Length'].apply(pd.to_timedelta)
df['Game Length'] = df['Game Length'] / np.timedelta64(1, 's')

In [60]:
df['Time to complete all tasks'] = df['Time to complete all tasks'].apply(pd.to_timedelta, errors = 'coerce')
df['Time to complete all tasks'] = df['Time to complete all tasks'] / np.timedelta64(1, 's')

In [61]:
crewmate = df[['Team',
               'Outcome',
               'Task Completed',
               'All Tasks Completed',
               'Murdered',
               'Game Length',
               'Ejected',
              'Sabotages Fixed',
              'Region',
              'User']][df['Team'] == 'Crewmate']
crewmate.head()

Unnamed: 0,Team,Outcome,Task Completed,All Tasks Completed,Murdered,Game Length,Ejected,Sabotages Fixed,Region,User
0,Crewmate,Win,3.0,No,Yes,424.0,No,2.0,,0
1,Crewmate,Loss,7.0,Yes,No,981.0,No,1.0,,0
2,Crewmate,Win,3.0,No,No,693.0,No,0.0,,0
4,Crewmate,Loss,4.0,No,No,310.0,No,0.0,Europe,0
5,Crewmate,Loss,7.0,Yes,Yes,982.0,No,0.0,Europe,0


In [62]:
crewmate['Outcome'] = crewmate['Outcome'].replace(['Loss', 'Win'],[0, 1])
crewmate['All Tasks Completed'] = crewmate['All Tasks Completed'].replace(['No', 'Yes'],[0, 1])
crewmate['Murdered'] = crewmate['Murdered'].replace(['No', 'Yes'],[0, 1])
crewmate['Ejected'] = crewmate['Ejected'].replace(['No', 'Yes'],[0, 1])
crewmate['Region'] = crewmate['Region'].replace(['NA ', 'Europe '],[0, 1])
crewmate.head()
crewmate.shape

(1761, 10)

In [63]:
crewmates_i = list(crewmate["User"].value_counts(sort = False))

In [64]:
def user_train_valid_split(users_i,valid_ratio,possible=False):
    n = sum(users_i)
    target = int(np.floor(valid_ratio*n))
    while not possible:
        possible, validation_indices = user_split(users_i,target)
        target -= 1
    training_indices = [i for i in range(len(users_i)) if i not in validation_indices]
    return training_indices, validation_indices

def user_split(users_i,target):  
    # Reference: https://levelup.gitconnected.com/dynamic-programming-subset-sum-c386126621cd
    n = len(users_i)
    solution = [[False for j in range(int(target+1))] for i in range(n+1)]
    # base cases
    for i in range(n):
        solution[i][0] = True
    # other cases
    for i in range(1,n+1):
        for j in range(1,target+1):
            solution[i][j] = solution[i-1][j]
            if(solution[i][j] == False and j >= users_i[i-1]):
                solution[i][j] = solution[i][j] or solution[i-1][j-users_i[i-1]]
    # check if the subset sum is possible
    possible = solution[len(users_i)][target]
    subset = []
    if not possible: return possible, subset
    # return the subset solution if one exists
    y = len(users_i)
    x = target
    while x != 0:
        if solution[y-1][x] == False:
            subset.append(y-1)
            x -= users_i[y-1]
        else: 
            y -= 1
    return possible, subset

In [65]:
test_ratio = .25
training_indices, test_indices = user_train_valid_split(crewmates_i,valid_ratio)

#check
train_sum = sum([crewmates_i[i] for i in training_indices])
test_sum = sum([crewmates_i[i] for i in test_indices])
print("expected valid ratio: {}, actual: {}".format(valid_ratio, valid_sum/(train_sum+valid_sum)))

train_data = crewmate.loc[crewmate['User'].isin(training_indices)]
test_data = crewmate.loc[crewmate['User'].isin(test_indices)]
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

expected valid ratio: 0.25, actual: 0.29621736814065


In [66]:
train_data

Unnamed: 0,Team,Outcome,Task Completed,All Tasks Completed,Murdered,Game Length,Ejected,Sabotages Fixed,Region,User
0,Crewmate,1,3.0,0,1,424.0,0,2.0,0,0
1,Crewmate,0,7.0,1,0,981.0,0,1.0,0,0
2,Crewmate,1,3.0,0,0,693.0,0,0.0,0,0
3,Crewmate,0,4.0,0,0,310.0,0,0.0,1,0
4,Crewmate,0,7.0,1,1,982.0,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...
1316,Crewmate,0,5.0,0,0,660.0,1,2.0,1,28
1317,Crewmate,1,7.0,1,0,851.0,0,1.0,0,28
1318,Crewmate,1,7.0,1,0,1185.0,0,1.0,1,28
1319,Crewmate,1,7.0,1,0,618.0,0,0.0,1,28


In [67]:
test_data

Unnamed: 0,Team,Outcome,Task Completed,All Tasks Completed,Murdered,Game Length,Ejected,Sabotages Fixed,Region,User
0,Crewmate,1,4.0,0,0,102.0,0,0.0,0,3
1,Crewmate,1,4.0,0,0,114.0,0,0.0,0,3
2,Crewmate,1,4.0,0,0,97.0,0,0.0,0,3
3,Crewmate,0,4.0,0,1,161.0,0,0.0,0,3
4,Crewmate,1,4.0,0,0,134.0,0,1.0,0,3
...,...,...,...,...,...,...,...,...,...,...
435,Crewmate,0,8.0,1,1,1262.0,0,1.0,1,9
436,Crewmate,1,4.0,0,0,370.0,0,0.0,1,9
437,Crewmate,1,8.0,1,1,1264.0,0,0.0,1,9
438,Crewmate,1,8.0,1,0,754.0,0,0.0,1,9
