In [1]:
# The goal is to take out-of-the-box models and apply them to different datasets. This project is awesome for 3 main reasons:

# First, you’ll build intuition for model-to-problem fit. Which models are robust to missing data? Which models handle categorical features well? Yes, you can dig through textbooks to find the answers, but you’ll learn better by seeing it in action.

# Second, this project will teach you the invaluable skill of prototyping models quickly. In the real world, it’s often difficult to know which model will perform best without simply trying them.

# Finally, this exercise helps you master the workflow of model building. For example, you’ll get to practice…

# Importing data
# Cleaning data
# Splitting it into train/test or cross-validation sets
# Pre-processing
# Transformations
# Feature engineering
# Because you’ll use out-of-the-box models, you’ll have the chance to focus on honing these critical steps.

#Questions to ask - predict market value or popularity based on factors such as teams 

In [2]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

with open('epldata_final.csv', 'r') as csvfile:
         reader = csv.reader(csvfile, delimiter=' ', quotechar='|')


players = pd.read_csv('epldata_final.csv')
players = [players['age'], players['age']]

# print(housing.describe())



In [16]:
import pandas as pd
import numpy as np
data_path = 'epldata_final.csv'

footballers = pd.read_csv(data_path)
# visualise data
footballers.head()
# Below is Visual of age on market value
# footballers[:24*10].plot(y='market_value', x='age')

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3.0,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2.0,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2.0,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,RW,1,20.0,2393,7.5,1.50%,122,1.0,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2.0,France,0,4,1,1,0


In [21]:
# CLEAN DATA
# Why do we use dummy data?
dummy_fields = ['club', 'position', 'nationality', 'name', 'fpl_sel']
for each in dummy_fields:
    dummies = pd.get_dummies(footballers[each], prefix=each, drop_first=False)
    footballers = pd.concat([footballers, dummies], axis=1)

# dropping fields of no use to us to prep data    
fields_to_drop = ['club', 'position', 'nationality', 'name', 'fpl_sel']
data = rides.drop(fields_to_drop, axis=1)
data.head()

Unnamed: 0,age,position_cat,market_value,page_views,fpl_value,fpl_points,region,new_foreign,age_cat,club_id,big_club,new_signing
0,28,1,65.0,4329,12.0,264,3.0,0,4,1,1,0
1,28,1,50.0,4395,9.5,167,2.0,0,4,1,1,0
2,35,4,7.0,1529,5.5,134,2.0,0,6,1,1,0
3,28,1,20.0,2393,7.5,122,1.0,0,4,1,1,0
4,31,3,22.0,912,6.0,121,2.0,0,4,1,1,0


In [23]:
# Scale values - why do we do this???
quant_features = ['market_value', 'position_cat', 'age', 'page_views', 'fpl_points', 'big_club']
# Store scalings in a dictionary so we can convert back later
scaled_features = {}
for each in quant_features:
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std

In [24]:
# Importing data - This is done with the csv reader above
# The jobs now are to: 
# Cleaning data - checking for null values and values that are highly correlated to delete

# Splitting it into train/test or cross-validation sets - Done see below

# Pre-processing - To make sure the data being used is compatible
# Transformations 
# Feature engineering

In [37]:
# Data - what has happened to the data as in negative values etc
import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


train_features, test_features = split_train_test(data, 0.2)
print(len(train_set), "train +", len(test_set), "test")



369 train + 92 test


In [38]:
print(train_set)

          age  position_cat  market_value  page_views  fpl_value  fpl_points  \
75   0.049276     -1.179971     -0.531274   -0.508450        6.0    1.199038   
17   1.311300     -0.180032      0.080601    0.192340        7.0   -0.363644   
254 -1.717556     -1.179971      0.325351    0.963960        7.0   -0.100059   
115 -0.203128      0.819906     -0.164149   -0.257325        5.5    0.766005   
207  0.806490     -1.179971     -0.408899   -0.313130        5.0   -0.627229   
111 -1.212747     -0.180032      0.406935    0.265316        5.0   -1.079089   
392 -0.455533     -1.179971      1.141185   -0.098493        6.5   -0.551919   
346 -0.960342     -1.179971      0.325351    0.082875        6.0   -0.589574   
227  0.806490     -0.180032     -0.327316    0.251365        4.5   -0.194197   
80   0.049276     -1.179971     -0.694441   -0.511669        5.5    0.445938   
251 -1.717556     -1.179971      1.549101    3.745656       10.5    0.182353   
196  0.554086     -1.179971      1.14118

In [39]:
print(test_set)

          age  position_cat  market_value  page_views  fpl_value  fpl_points  \
265  0.554086     -1.179971      1.549101    1.627188        7.0    0.841315   
412 -1.465152     -1.179971     -0.572066   -0.310984        5.5   -0.551919   
320 -0.203128     -0.180032     -0.082566   -0.603963        4.5   -0.363644   
332  0.301681      0.819906     -0.327316   -0.675867        5.0    0.860143   
333  0.806490     -1.179971     -0.653649   -0.259471        5.0    0.860143   
242 -1.212747     -1.179971      2.772852    1.406112        8.0    1.726208   
410 -0.203128      0.819906     -0.449691   -0.629720        4.5   -0.420127   
202 -1.465152     -1.179971     -0.164149   -0.183275        5.5    0.012906   
327 -1.465152      0.819906     -0.878003   -0.762795        4.5   -1.079089   
119  1.058895     -0.180032      0.325351   -0.330301        5.5    0.634213   
306  0.554086      1.819845      0.325351   -0.231568        5.0    1.443795   
460  0.049276     -1.179971     -0.08256

In [40]:
class NeuralNetwork(object):
    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_input_to_hidden = np.random.normal(0.0, self.input_nodes**-0.5, 
                                       (self.input_nodes, self.hidden_nodes))

        self.weights_hidden_to_output = np.random.normal(0.0, self.hidden_nodes**-0.5, 
                                       (self.hidden_nodes, self.output_nodes))
        self.lr = learning_rate
        self.activation_function = lambda x : 1/(1 + np.exp(-x))
    
    def train(self, features, targets):
        ''' Train the network on batch of features and targets. 
        
            Arguments
            ---------
            
            features: 2D array, each row is one data record, each column is a feature
            targets: 1D array of target values
        
        '''
        n_records = features.shape[0]
        delta_weights_i_h = np.zeros(self.weights_input_to_hidden.shape)
        delta_weights_h_o = np.zeros(self.weights_hidden_to_output.shape)
        for X, y in zip(features, targets):
            ### Forward pass ###
            hidden_inputs = np.dot(X, self.weights_input_to_hidden)
            hidden_outputs = self.activation_function(hidden_inputs)
            final_inputs = np.dot(hidden_outputs, self.weights_hidden_to_output)
            # since the last layer just passes on its value, we don't have to apply the sigmoid here.
            final_outputs = final_inputs
            
            ### Backward pass ###
            error = y - final_outputs
            # The derivative of the activation function y=x is 1
            output_error_term = error * 1.0
            hidden_error = np.dot(self.weights_hidden_to_output, error) 
            # Backpropagated error terms
            hidden_error_term = hidden_error * hidden_outputs * (1 - hidden_outputs)
            # Weight step (input to hidden)
            delta_weights_i_h += hidden_error_term * X[:,None]
            # Weight step (hidden to output)
            delta_weights_h_o += output_error_term * hidden_outputs[:,None]
        # Weights update
        self.weights_hidden_to_output += self.lr*delta_weights_h_o/n_records
        self.weights_input_to_hidden += self.lr*delta_weights_i_h/n_records
        
    def run(self, features):
        ''' Run a forward pass through the network with input features 
        
            Arguments
            ---------
            features: 1D array of feature values
        '''
        # Forward pass
        hidden_inputs =  np.dot(features, self.weights_input_to_hidden)
        hidden_outputs = self.activation_function(hidden_inputs)
        final_inputs = np.dot(hidden_outputs, self.weights_hidden_to_output)
        final_outputs = final_inputs 
        return final_outputs

In [41]:
def MSE(y, Y):
    return np.mean((y-Y)**2)

In [42]:
import sys

### Set the hyperparameters here ###
iterations = 3000
learning_rate = 1.1
hidden_nodes = 15
output_nodes = 1

N_i = train_features.shape[1]
network = NeuralNetwork(N_i, hidden_nodes, output_nodes, learning_rate)

losses = {'train':[], 'validation':[]}
for ii in range(iterations):
    # Go through a random batch of 128 records from the training data set
    batch = np.random.choice(train_features.index, size=128)
    X, y = train_features.ix[batch].values, train_targets.ix[batch]['cnt']
                             
    network.train(X, y)
    
    # Printing out the training progress
    train_loss = MSE(network.run(train_features).T, train_targets['cnt'].values)
    val_loss = MSE(network.run(val_features).T, val_targets['cnt'].values)
    sys.stdout.write("\rProgress: {:2.1f}".format(100 * ii/float(iterations)) \
                     + "% ... Training loss: " + str(train_loss)[:5] \
                     + " ... Validation loss: " + str(val_loss)[:5])
    sys.stdout.flush()
    
    losses['train'].append(train_loss)
    losses['validation'].append(val_loss)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()


NameError: name 'train_targets' is not defined

In [19]:
import numpy as np
# This imports a csv from numpy
csv = np.genfromtxt ('epldata_final.csv', delimiter=",")
# This imports a column from the csv and takes from the 2nd element in the column
second = csv[1:,4]
third = csv[1:,2]
# This concatonates the columns so it stacks together
test_2D = np.column_stack((second, third))

print(test_2D)

[[ 1. 28.]
 [ 1. 28.]
 [ 4. 35.]
 [ 1. 28.]
 [ 3. 31.]
 [ 3. 22.]
 [ 1. 30.]
 [ 3. 31.]
 [ 3. 25.]
 [ 1. 21.]
 [ 2. 24.]
 [ 2. 23.]
 [ 2. 25.]
 [ 2. 26.]
 [ 2. 26.]
 [ 3. 26.]
 [ 3. 27.]
 [ 2. 32.]
 [ 1. 26.]
 [ 3. 21.]
 [ 2. 25.]
 [ 1. 28.]
 [ 4. 24.]
 [ 4. 28.]
 [ 3. 25.]
 [ 3. 32.]
 [ 3. 24.]
 [ 1. 26.]
 [ 1. 25.]
 [ 1. 34.]
 [ 3. 30.]
 [ 4. 37.]
 [ 3. 26.]
 [ 1. 27.]
 [ 3. 26.]
 [ 1. 23.]
 [ 3. 32.]
 [ 1. 24.]
 [ 2. 27.]
 [ 1. 30.]
 [ 1. 25.]
 [ 3. 22.]
 [ 2. 27.]
 [ 2. 30.]
 [ 1. 21.]
 [ 2. 20.]
 [ 1. 21.]
 [ 4. 32.]
 [ 1. 29.]
 [ 3. 24.]
 [ 3. 23.]
 [ 4. 30.]
 [ 4. 32.]
 [ 4. 25.]
 [ 3. 25.]
 [ 3. 25.]
 [ 3. 31.]
 [ 1. 36.]
 [ 3. 29.]
 [ 3. 33.]
 [ 3. 24.]
 [ 3. 30.]
 [ 1. 25.]
 [ 2. 27.]
 [ 2. 29.]
 [ 2. 34.]
 [ 2. 23.]
 [ 1. 25.]
 [ 2. 26.]
 [ 1. 27.]
 [ 2. 26.]
 [ 1. 33.]
 [ 1. 30.]
 [ 1. 28.]
 [ 4. 31.]
 [ 1. 27.]
 [ 1. 26.]
 [ 3. 27.]
 [ 3. 31.]
 [ 3. 28.]
 [ 1. 27.]
 [ 2. 25.]
 [ 2. 28.]
 [ 2. 29.]
 [ 2. 33.]
 [ 2. 28.]
 [ 1. 26.]
 [ 2. 25.]
 [ 3. 24.]
 [ 2. 27.]
 [ 3. 26.]

In [34]:

import unittest

inputs = np.array([[0.5, -0.2, 0.1]])
targets = np.array([[0.4]])
test_w_i_h = np.array([[0.1, -0.2],
                       [0.4, 0.5],
                       [-0.3, 0.2]])
test_w_h_o = np.array([[0.3],
                       [-0.1]])

class TestMethods(unittest.TestCase):
    
    ##########
    # Unit tests for data loading
    ##########

    
    ##########
    # Unit tests for network functionality
    ##########

    def test_activation(self):
        network = NeuralNetwork(3, 2, 1, 0.5)
        # Test that the activation function is a sigmoid
        self.assertTrue(np.all(network.activation_function(0.5) == 1/(1+np.exp(-0.5))))

    def test_train(self):
        # Test that weights are updated correctly on training
        network = NeuralNetwork(3, 2, 1, 0.5)
        network.weights_input_to_hidden = test_w_i_h.copy()
        network.weights_hidden_to_output = test_w_h_o.copy()
        
        network.train(inputs, targets)
        self.assertTrue(np.allclose(network.weights_hidden_to_output, 
                                    np.array([[ 0.37275328], 
                                              [-0.03172939]])))
        self.assertTrue(np.allclose(network.weights_input_to_hidden,
                                    np.array([[ 0.10562014, -0.20185996], 
                                              [0.39775194, 0.50074398], 
                                              [-0.29887597, 0.19962801]])))

    def test_run(self):
        # Test correctness of run method
        network = NeuralNetwork(3, 2, 1, 0.5)
        network.weights_input_to_hidden = test_w_i_h.copy()
        network.weights_hidden_to_output = test_w_h_o.copy()

        self.assertTrue(np.allclose(network.run(inputs), 0.09998924))

suite = unittest.TestLoader().loadTestsFromModule(TestMethods())
unittest.TextTestRunner().run(suite)

...
----------------------------------------------------------------------
Ran 3 tests in 0.004s

OK


<unittest.runner.TextTestResult run=3 errors=0 failures=0>

NameError: name 'train_features' is not defined

In [34]:

scaler = preprocessing.StandardScaler().fit(test_2D)
# Applying transformer to training dataPython

X_train_scaled = scaler.transform(test_2D)
 
print(X_train_scaled.mean(axis=0))

[1.10781473e-16 2.92908623e-17]


In [37]:
from sklearn.cluster import KMeans
import numpy as np
# X = np.array([[1, 2], [1, 4], [1, 0],
#               [4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=4, random_state=0).fit(test_2D)
kmeans.labels_

# kmeans.predict([[4, 23], [21, 21]])

kmeans.cluster_centers_

array([[ 2.77777778, 33.05555556],
       [ 1.95698925, 21.2688172 ],
       [ 2.09836066, 25.72677596],
       [ 2.11504425, 29.12389381]])

In [None]:
# Fot the above clustering algorithm you should sort the features as below
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn#step-5

# To do:
# numpy arrays figure out and use to create 2d array :/
# remove feature with letter resave features in dataset