# Capstone Team Optimization 

**Automatic selection of Capstone teams based on preferences.**

for an optimization function we have to create a cost function    
we can add benfits and try to maximize    
or we can say that a cost of zero is a perfect team score and aim for minimization    
need to manipulate teams    

load data 

generate teams

talk about the costs of the teams and what that is

for example:
     - an optimal team size is 3 or 4, the abs val/sq diff can be added to the cost   
     - every team needs to have a coordinator (someone who has agreed) - add one if everyone says no, add 0.5 if everyone says 'if i have to', add 0 if everyone says yes
     - for the domain preferences - currently not ordered - take the set f the domains and the projects that people say they are interested, use jaccard score to add cost of differences
     - do the same distance scoring with experience - make R and Python as close together as possible
     - way of changing teams is swapping one person - start at a random state with 3 or 4 people in every team and the run swaps
     
     
### Expected fields

name
email
github username
linkedin url
completion plan
OS pref
languages
python level
R level
CLI
databases
role
coordinator
proj level
domains
dataset types

## Fields and Fixtures 

In [1]:
import os
import csv
import random

from itertools import chain
from collections import defaultdict, Counter

COHORT = 7      # Set to change the cohort to analyze. 
TEAM_SIZE = 4   # Optimal number of members per team

FIXTURES = os.path.join(os.getcwd(),"fixtures")

FIELDS = {'name' : 'Name',
          'email': 'Email',
          'github': 'Github Username',
          'linkedin': 'LinkedIn URL',
          'complete': 'I plan to complete the Capstone project this semester.',
          'os': 'What is your preferred operating system?',
          'language': 'What programming languages are you familiar with?',
          'python': 'What is your level of Python proficiency?',
          'r': 'What is your level of R proficiency?',
          'cli': 'What is your proficiency with the command line?',
          'dbs': 'What databases have you used before?',
          'role': 'Which of these roles would you like your primary contribution on the team to be?',
          'coord': 'Would you be willing to be a team coordinator?',
          'project': 'At what level do you feel your overall project should be at?',
          'domains': 'What domains are you interested in?',
          'datasets': 'What types of projects/data sets are you interested in?'
    }

PROG_ROLE  = 'Programmer - focused on the technical implementation'
STATS_ROLE = 'Statistician - focused on modeling and analysis'
DOM_ROLE   = 'Domain Expert - focused on finding novel data products for specific data sets'  

## Data Loading and Parsing

In [2]:
def getCohortPath(cohort=COHORT):
    """
    Returns the path to the Cohort file in the fixtures directory.
    """
    return os.path.join(FIXTURES,"cohort{}-preferences.csv".format(cohort))


def loadData(cohort=COHORT):
    """
    Loads and parses survey data. 
    """
    with open(getCohortPath(cohort), 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            row[FIELDS['complete']] = parseBool(row[FIELDS['complete']])
            row[FIELDS['language']] = parseMulti(row[FIELDS['language']])
            row[FIELDS['python']] = parseInt(row[FIELDS['python']])
            row[FIELDS['r']] = parseInt(row[FIELDS['r']])
            row[FIELDS['cli']] = parseInt(row[FIELDS['cli']])
            row[FIELDS['dbs']] = parseMulti(row[FIELDS['dbs']])
            row[FIELDS['coord']] = parseBool(row[FIELDS['coord']])
            row[FIELDS['project']] = parseInt(row[FIELDS['project']])
            row[FIELDS['domains']] = parseMulti(row[FIELDS['domains']])
            row[FIELDS['datasets']] = parseMulti(row[FIELDS['datasets']])
            yield dict([(field, row[FIELDS[field]]) for field in FIELDS])
                   
def parseBool(s):
    """
    Helper function for parsing yes/no/maybe. 
    """
    try:
        return {'yes': True,
                'no': False,
                'not sure': None,
                'if i have to': None
        }[s.lower()]
    except KeyError: 
        return None

def parseMulti(s):
    """
    Helper function for parsing survey lists (checkboxes). 
    """
    return filter(lambda i: i != '', [i.strip() for i in s.split(',')])

def parseInt(s):
    """
    Helper function for parsing integer fields. 
    """
    try:
        return int(s)
    except ValueError:
        return None

## Teams Collection 

A collection of teams and computation of team cost. 

In [3]:
class Cohort(object):
    
    def __init__(self, cohort=COHORT):
        self.teams = defaultdict(list)
        
        # Assign students to ordered teams.
        students = list(loadData(cohort))
        n_teams  = (len(students) / TEAM_SIZE)
        
        for idx, student in enumerate(students):
            self.teams[(idx + 1) / n_teams].append(student)
    
    def swap(self, source=None, target=None, sidx=None, tidx=None, transfer=False):
        """
        Swaps two students between two teams. If None values are passed,
        then the values are randomly selected. If transfer is true, then simply
        transfer the source to the target, don't swap. 
        """
        if source is None:
            source = random.choice(self.teams.keys())
        
        if target is None:
            target = random.choice(self.teams.keys())
        
        if sidx is None and len(self.teams[source]) > 1:
            sidx = random.randint(0, len(self.teams[source])-1)

        if tidx is None and len(self.teams[target]) > 1:
            tidx = random.randint(0, len(self.teams[target])-1)
        
        if sidx is not None:
            alpha = self.teams[source].pop(sidx)
            self.teams[target].append(alpha)
        
        if not transfer and tidx is not None:
            bravo = self.teams[target].pop(tidx)
            self.teams[source].append(bravo)
    
    def cost(self):
        """
        Computes the cost of the current team make up. 
        """
        cost = 0 # Perfect teams would have a cost of zero. 
        
        # Loop over each team to compute the costs.
        for team, prefs in self.teams.iteritems():
        
            # First add square difference in team size to optimal team size. 
            cost += (len(prefs) - TEAM_SIZE) ** 2
            
            # Add cost of multiple operating systems (1 OS is zero cost)
            cost += (len(set([pref['os'] for pref in prefs])) - 1)
            
            # Add cost of missing roles 
            cost += 3 - len(set([pref['role'] for pref in prefs]))
            
            # Add cost of domain mis-alignment 
            domains = Counter(chain(*[domain for domain in [pref['domains'] for pref in prefs]]))
            domains = domains.most_common(1)
            if domains:
                _, count = domains[0]
                cost += len(prefs) - count 
            else:
                cost += 99
            
            # Add cost of dataset mis-alignment
            datasets = Counter(chain(*[dataset for dataset in [pref['datasets'] for pref in prefs]]))
            datasets = datasets.most_common(1)
            if datasets:
                _, count = datasets[0]
                cost += len(prefs) - count 
            else:
                cost += 99
        
        return cost
    
    def select_coordinator(self, teamno):
        """
        From the people who selected yes to being coordinator, choose random.
        """
        # Filter out people who didn't say yes to coordinator role.
        coords = filter(
            lambda p: p['coord'] in (True, None), 
            self.teams[teamno]
        )

        return random.choice(coords)['name']


    def mean_level(self, teamno, field):
        """
        Compute the mean level of the given numeric field.
        """
        levels = [
            float(pref[field]) if pref[field] else 0.0
            for pref in self.teams[teamno]
        ]

        return sum(levels) / len(levels)
    

    def print_team(self, teamno):
        # Create output structure
        output = []

        # Create Title Header
        title = "Team {} Selection Report".format(teamno)
        output.append(title)
        output.append("-"*len(title))
        output.append("")

        # Print out averages
        output.append(
            "  * Coordinator: {}".format(self.select_coordinator(teamno))
        )
        output.append("")
        output.append(
            "  * Mean Python Level: {}".format(
                self.mean_level(teamno, 'python')
            )
        )
        output.append(
            "  * Mean R Level: {}".format(
                self.mean_level(teamno, 'r')
            )
        )
        output.append(
            "  * Mean CLI Level: {}".format(
                self.mean_level(teamno, 'cli')
            )
        )
        output.append(
            "  * Mean Project Level: {}".format(
                self.mean_level(teamno, 'project')
            )
        )
        output.append("")

        # Print out member names
        output.append("  - Members:")
        output.extend([
            "    + {} ({})".format(pref['name'], pref['email']) 
            for pref in self.teams[teamno]
        ])
        output.append("")

        # Print out domain preferences
        domains = Counter(chain(*[domain for domain in [pref['domains'] for pref in self.teams[teamno]]]))
        output.append("  - Domains:")
        output.extend([
            "    + {}: {}".format(*prefs) 
            for prefs in domains.most_common()
        ])
        output.append("")

        # Print out project preferences
        datasets = Counter(chain(*[dataset for dataset in [pref['datasets'] for pref in self.teams[teamno]]]))
        output.append("  - Project Types:")
        output.extend([
            "    + {}: {}".format(*prefs) 
            for prefs in datasets.most_common()
        ])
        output.append("")

        # Return report string
        return "\n".join(output)

In [4]:
cohort = Cohort()
print cohort.cost()

31


## Optimization

In [5]:
# Random Search Method
cohort = Cohort()

for _ in xrange(5000):
    # 100k searches 
    num_swaps = random.randint(10, 100)
    prob_xfer = 0.25 
    ncohort = Cohort()
    
    for _ in xrange(num_swaps):
        xfer = True if random.random() <= prob_xfer else False 
        ncohort.swap(transfer=xfer)
        if ncohort.cost() < cohort.cost():
            cohort = ncohort
        

# Loop over each team to compute the costs.
print cohort.cost()
for team in cohort.teams:
    print cohort.print_team(team)
    print
    print

20
Team 0 Selection Report
-----------------------

  * Coordinator: Khalil Ezzine

  * Mean Python Level: 2.5
  * Mean R Level: 4.0
  * Mean CLI Level: 4.75
  * Mean Project Level: 3.75

  - Members:
    + Joshua Wei (joshua.wei@yale.edu)
    + Khan Kashif (kashif_khan@bah.com)
    + Sushanta K Paul (sushantabd@gmail.com)
    + Khalil Ezzine (ezzine.khalil@gmail.com)

  - Domains:
    + Health Care/Medicine: 4
    + Government/Social Data: 3
    + Finance/Banking: 3
    + Energy: 2
    + Retail/Industry: 1
    + Security: 1

  - Project Types:
    + Clustering or Classification: 4
    + Text Analysis/Natural Language Processing: 4
    + Regression Analysis: 3
    + Statistical Modeling for Forecasting: 3
    + Time Series Analysis: 2
    + Visualization/Visual Analytics: 2
    + Network Analysis: 1



Team 1 Selection Report
-----------------------

  * Coordinator: Sarah Khederian

  * Mean Python Level: 2.5
  * Mean R Level: 1.83333333333
  * Mean CLI Level: 2.5
  * Mean Project Lev