# References

#### Normal distribution for continuous variables
$$
P(x) = \frac{1}{\sigma \sqrt{2\pi}} \exp\left( -\frac{\left( x - \mu \right)^2}{2\sigma^2} \right)
$$

$$
\mu: \ \ mean;\ \ \sigma^2:\ \ variance
$$

#### Conditional probability (Bayes' theorem)

$$
P(X \mid C) = \frac{P(C \mid X)P(X)}{P(C)}
$$

#### Bayes' classifier for selecting a class

$$
P(C_{i}) \prod_{k}^{} P(X_{i}^k \mid C_i)
$$

# 1. Classifier Definition

In [None]:
# Classifier logic

import numpy as np
import pandas as pd

class Classifier:
    """Two-class Gaussian naive Bayes scorer for the ``Match Winner`` label.

    The training rows are split by ``Match Winner`` (0 and 1). For each class
    and each feature column a (mean, variance) pair is estimated, and
    ``classify`` multiplies the per-feature normal densities of a sample
    under each class.

    NOTE: no class prior is multiplied in, so the returned scores are
    unnormalised likelihoods; they are only meaningful relative to each other.
    """

    # Substitute used when a feature has exactly zero variance within a class.
    # Without it the density divides by zero and yields NaN, which silently
    # skews every later comparison. Non-zero variances are never altered.
    _MIN_VARIANCE = 1e-9

    def __init__(self, training):
        """Estimate the per-class coefficients from ``training``.

        ``training`` is a pandas DataFrame containing a ``Match Winner``
        column with both values 0 and 1. The last column is dropped as the
        label, so ``Match Winner`` is expected to be the last column; every
        remaining column is treated as a numeric feature.
        """
        # We are going to build two classifiers to help us classify winning team.
        #   1st classifier will calculate team 1's winning probability
        #   2nd classifier will calculate team 2's winning probability

        # We split training data by Match Winner column because we would like to
        # infer the classifiers' coefficients (mean and variance) from related training data
        #   1st classifier is fed with statistics of matches in which team 1 won
        #   2nd classifier is fed with statistics of matches in which team 2 won
        grouped = training.groupby('Match Winner')
        fstTraining, sndTraining = grouped.get_group(0), grouped.get_group(1)

        # remove the label (last) column from both training data sets
        fstTraining = fstTraining[fstTraining.columns[:-1]]
        sndTraining = sndTraining[sndTraining.columns[:-1]]

        # one (mean, variance) tuple per feature column, in column order
        self.fstClassifierCofs = [self.__normalDFCofs(fstTraining[col]) for col in fstTraining.columns]
        self.sndClassifierCofs = [self.__normalDFCofs(sndTraining[col]) for col in sndTraining.columns]

    def classify(self, x):
        """Return ``{0: score, 1: score}`` for one sample ``x``.

        ``x`` is a sequence of feature values in the same order as the
        training feature columns. Each score is the product of the
        per-feature normal densities under the corresponding class.
        """
        fstPr, sndPr = 1, 1

        for idx, val in enumerate(x):
            fstMean, fstVariance = self.fstClassifierCofs[idx]
            sndMean, sndVariance = self.sndClassifierCofs[idx]

            fstPr *= self.__normalDF(val, fstMean, fstVariance)
            sndPr *= self.__normalDF(val, sndMean, sndVariance)

        return {0: fstPr, 1: sndPr}

    def __normalDFCofs(self, bag):
        """Return the ``(mean, variance)`` tuple of the given data set."""
        return (self.__mean(bag), self.__variance(bag))

    def __mean(self, bag):
        """Return the arithmetic mean of the given data set."""
        return sum(bag) / float(len(bag))

    def __variance(self, bag):
        """Return the sample variance (n - 1 denominator) of the given data set.

        At least two observations are required; a single observation makes
        the denominator zero.
        """
        mean = self.__mean(bag)
        return sum(pow(item - mean, 2) for item in bag) / float(len(bag) - 1)

    def __normalDF(self, x, mean, variance):
        """Return the normal density of ``x`` for the given mean and variance.

        A variance of exactly zero (feature constant within the class) is
        replaced by ``_MIN_VARIANCE`` so the result is a finite number
        instead of NaN or a division error.
        """
        if variance == 0:
            variance = self._MIN_VARIANCE
        return np.exp(-pow(x - mean, 2) / (2 * variance)) / np.sqrt(2 * np.pi * variance)



# 2. Helper Methods

In [None]:
# Utility functions
import numpy as np

def infer(score):
    """Return the predicted winner (0 or 1) for a pair of class scores.

    ``score`` is indexed with 0 and 1. Class 0 is chosen when its score is
    the maximum of the two (ties included); otherwise class 1 is chosen.
    """
    fst_score, snd_score = score[0], score[1]
    fst_is_top = np.maximum(fst_score, snd_score) == fst_score
    return 0 if fst_is_top else 1

def hit(classifier, validating):
    """Compare predicted winners against the recorded ones, row by row.

    ``validating`` is a DataFrame whose last column holds the actual winner
    and whose other columns are the features passed to
    ``classifier.classify``. Returns the element-wise result of comparing
    the list of predictions with the array of actual winners.
    """
    feature_cols = validating.columns[:-1]
    winner_col = validating.columns[-1]

    samples = validating[feature_cols].values
    actual = validating[winner_col].values

    predicted = [infer(classifier.classify(sample)) for sample in samples]

    return (predicted == actual)


def accuracy(classifier, validating):
    """Return the percentage of correctly classified rows, rounded to 2 decimals.

    Relies on ``hit`` for the per-row outcome and counts the ``True`` entries.
    """
    outcomes = hit(classifier, validating).tolist()
    correct = outcomes.count(True)
    total = float(len(outcomes))

    percentage = 100 * correct / total
    return round(percentage, 2)



# 3. Data Preparation

In [None]:
# load US Open matches from local file
usdf = pd.read_csv('usopen.csv')

# load Australian Open matches from local file
ausdf = pd.read_csv('ausopen.csv')

# combine both tournaments' statistics into one frame with a fresh 0..n-1 index
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
df = pd.concat([ausdf, usdf], ignore_index=True)

# split the combined matches by recorded winner (0 or 1)
grouped = df.groupby('Match Winner')
fst, snd = grouped.get_group(0), grouped.get_group(1)

# NOTE: validation runs on the full data set, which includes the training rows below
validating = df

# 10 team 1 winning matches and 10 team 2 winning matches
training = pd.concat([fst.head(10), snd.head(10)])

# 4. Calculate accuracy of a classifier

In [None]:
# Fit the per-class (mean, variance) coefficients on the training rows.
classifier = Classifier(training)

# Percentage of validation rows whose winner is predicted correctly, rounded
# to 2 decimals. Left as a bare expression so its value is the cell's result.
accuracy(classifier, validating)

# 5. Display array of hit and missed classifications

In [None]:
# Refit the classifier so this cell does not depend on the previous one.
classifier = Classifier(training)

# Element-wise comparison of predicted and actual winners for every
# validation row. Left as a bare expression so its value is the cell's result.
hit(classifier, validating)