In [2]:
!pip install --upgrade --user pixiedust
import pixiedust
# pixiedust.enableSparkJobProgressMonitor()
# pixiedust.enableJobMonitor()

Requirement already up-to-date: pixiedust in c:\users\steph\appdata\roaming\python\python36\site-packages
Requirement already up-to-date: mpld3 in c:\users\steph\anaconda3\lib\site-packages (from pixiedust)
Requirement already up-to-date: lxml in c:\users\steph\anaconda3\lib\site-packages (from pixiedust)
Requirement already up-to-date: geojson in c:\users\steph\anaconda3\lib\site-packages (from pixiedust)
Pixiedust database opened successfully


In [3]:
from math import sqrt, log, exp
import csv
import itertools

from datetime import datetime

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import *

# For categorical variables
from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

conf = (SparkConf()
 .setMaster("local")
 .setAppName("Cleaner")
 .set("spark.executor.memory", "1g"))

In [4]:
def lengthInspector(rdd):
    """Checks rdd for empty rows.
    
    Key inputs:
    rdd --- a Spark rdd.
    """
    lenCounter = rdd.map(lambda x: (len(x), 1))\
                 .reduceByKey(lambda x, y: x + y)
    lenList = lenCounter.collect()
    return lenList

In [5]:
def buildMainKey(line):
    mainKey = "Farm " + line[1].strip() + " House " + line[2].strip() + " Flock " + line[3].strip()
    newline = [mainKey] + line
    return newline

In [6]:
def Do_Machine_Learning(trainingSet, testSet, predictor_cols, dependent_variable, regressor = LinearRegression(),
                         paramGrid = [], evalMetric = "rmse", seed = None):
    """
    Return (<object> model, <float> error_estimate, <dataframe> result)
    
    trainingSet: dataframe, used to train model
    testSet: dataframe, used to feed the model and get the result(and error estimate)
    xCols: list of strings, names of columns which used as inputs
    yValues: string, name of column that contains dependent values 
    regressor: Regression object, by default = LinearRegression()
    paramGrid: list built byParamGridBuilder, by default = empty list
    evalMetric: string, name of matrix used for evaluation, by default = "rmse"
    seed: int or None, seed for random number generator, if == None will use random numbers
    
    !!! seed is useless at that time !!!
    """
    # push estimator into pipeline
    vec = VectorAssembler(inputCols = predictor_cols, outputCol = "features")
    regPipeline = Pipeline()
    regPipeline.setStages([vec, regressor])   
    # build evaluator
    regEval = RegressionEvaluator(predictionCol = "Predicted_"+dependent_variable, labelCol = dependent_variable, 
                                  metricName = evalMetric)
    # combine estimator and evaluator to a cross validator
    crossval = CrossValidator(estimator = regPipeline, evaluator = regEval, numFolds = 3)
    # set parameters grid
    crossval.setEstimatorParamMaps(paramGrid)
    # trainning
    regModel = crossval.fit(trainingSet).bestModel
    # predicting
    predictions = regModel.transform(testSet)
    # get evaluating result
    evaluation = regEval.evaluate(predictions)
    
    return regModel, evaluation, predictions

In [7]:
# Convert the csv file to a tab delimited file -> makes life easier
with open('FF_broilers_v2.csv', 'r') as fin:
    with open('FF_broilers_v2_tab.txt', 'w') as fout:
        reader = csv.DictReader(fin)
        writer = csv.DictWriter(fout, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(reader)

In [10]:
# Read in the data in tab-delimited form
rdd = sc.textFile('FF_broilers_v2_tab.txt')

# Initialise the header for creating the dataframe
header = []

data = rdd.map(lambda x: x.split('\t'))
for col in data.take(1)[0]:
    header.append(col.strip()
                     .replace(' ', '')
                     .replace('#', '')
                     .replace('(', '')
                     .replace(')', '')
                     .replace('.', '')
                     .replace('/', 'Per')
                     .replace('%', 'Percentage'))
    
lenList = lengthInspector(data)
print("Distribution of length of lines in this file: ")
print(lenList)

data_clean = data.filter(lambda x: len(x) >= 38)

Distribution of length of lines in this file: 
[(38, 3678), (1, 3679)]


In [11]:
# Seperate the header from the rdd
rdd_header = data_clean.take(1)[0]
rdd_rows = data_clean.filter(lambda line: line != rdd_header)

broilersDF = sqlContext.createDataFrame(rdd_rows, header)

In [12]:
# Create a new dataframe of only the relevant columns
desired_cols = ['CustomerCode', 'Flock', 'House', 'GeneticLineCode', 'BirdsPresent', 'Mortality', 'BodyWeightg', 'DailyGaing', 
               'WheatPerBird', 'FeedIntakePerBirdg', 'WaterIntakePerBirdml']

df_new = broilersDF.select([c for c in broilersDF.columns if c in desired_cols])

In [13]:
# Remove any white space to the left or right of any entries
for col in df_new.schema.names:
    df_new = df_new.withColumn(col, ltrim(df_new[col]))
    df_new = df_new.withColumn(col, rtrim(df_new[col]))

In [14]:
# Create function to remove all spaces from a column's entries
spaceDeleteUDF = udf(lambda s: s.replace(" ", ""), StringType())


# Designate columns that are incorrectly encoded as strings
string_cols_incorrect = ['BirdsPresent', 'Mortality', 'BodyWeightg', 'DailyGaing', 'WheatPerBird',
                         'FeedIntakePerBirdg', 'WaterIntakePerBirdml']

# First clean the columns of spaces and then convert to Doubles
for col in string_cols_incorrect:
    df_new = df_new.withColumn(col, spaceDeleteUDF(col))
    df_new = df_new.withColumn(col, df_new[col].cast(DoubleType()))

In [15]:
categorical_columns = ['GeneticLineCode', 'CustomerCode', 'House']

##=== build stages ======
stringindexer_stages = [StringIndexer(inputCol=c, outputCol='stringindexed_' + c) for c in categorical_columns]
onehotencoder_stages = [OneHotEncoder(inputCol='stringindexed_' + c, outputCol='onehotencoded_' + c) 
                        for c in categorical_columns]
all_stages = stringindexer_stages + onehotencoder_stages

## build pipeline model
pipeline = Pipeline(stages=all_stages)

## fit pipeline model
pipeline_mode = pipeline.fit(df_new)

## transform data
df_coded = pipeline_mode.transform(df_new)

## remove uncoded columns
# selected_columns = ['onehotencoded_' + c for c in categorical_columns] + ['x4', 'y1', 'y2']
cols = df_coded.schema.names
cols_to_keep = [col for col in cols if col not in categorical_columns]
df_coded = df_coded.select(cols_to_keep)

In [16]:
# Drop all rows containing NA
df_coded = df_coded.na.drop()

In [17]:
## Normalise feed, water and wheat per bird wrt bird body weight
# Note that this also reduces the comlexity of physical dimensions (reverting to ratios of SI)
df_new = df_coded.withColumn("Feed_%_body_weight", df_coded['FeedIntakePerBirdg']/df_coded['BodyWeightg'])
df_new = df_new.withColumn("Water_%_body_weight", df_coded['WaterIntakePerBirdml']/df_coded['BodyWeightg'])
df_new = df_new.withColumn("Wheat_%_body_weight", df_coded['WheatPerBird']/df_coded['BodyWeightg'])

In [18]:
# Create test and training data
train, test = df_new.randomSplit([8.0, 2.0], 940309160050)
train = train.cache()
test = test.cache()

In [19]:
list(itertools.permutations([1, 2, 3]))

dependent_variable = 'Mortality'
predictors = [variable for variable in df_new.schema.names if variable != dependent_variable]
print(predictors)

# Want to have permutations such that Feed and Feed % body weight do not occur in the same list - true for all variaitons
# Also want stringindexed_ and onehotencoded_ variables together always.

['Flock', 'BirdsPresent', 'BodyWeightg', 'DailyGaing', 'WheatPerBird', 'FeedIntakePerBirdg', 'WaterIntakePerBirdml', 'stringindexed_GeneticLineCode', 'stringindexed_CustomerCode', 'stringindexed_House', 'onehotencoded_GeneticLineCode', 'onehotencoded_CustomerCode', 'onehotencoded_House', 'Feed_%_body_weight', 'Water_%_body_weight', 'Wheat_%_body_weight']


In [None]:
# build regressor
lr = LinearRegression()
lr.setPredictionCol("Predicted Mortality")\
   .setLabelCol("Death")

# build parameter grid
regParam = [x / 100.0 for x in range(1, 10)]
pg = (ParamGridBuilder()
             .addGrid(lr.regParam, regParam)
             .build())

import itertools
dependent_variable = 'Mortality'
predictors = [variable for variable in df_new.schema.names if variable != dependent_variable]

predictors = list(itertools.permutations([1, 2, 3]))

# run ML
model1, result1, predictionDF1 = Do_Machine_Learning(trainDF, testDF, ["Age", "ln"], "Death", lr, pg)
model2, result2, predictionDF2 = Do_Machine_Learning(trainDF, testDF, ["Age", "square"], "Death", lr, pg)
model3, result3, predictionDF3 = Do_Machine_Learning(trainDF, testDF, ["Age", "exp"], "Death", lr, pg)


# print attributions of model
"""
print("attributes of the model are: {}".format(dir(model)))
print("method list: {}".format([method for method in dir(model) if callable(getattr(model, method))]))
print(model.stages)
"""

# Print coefficients and intercept
weights1 = model1.stages[1].coefficients
ic1 = model1.stages[1].intercept
weights2 = model2.stages[1].coefficients
ic2 = model2.stages[1].intercept
weights3 = model3.stages[1].coefficients
ic3 = model3.stages[1].intercept
print(weights1, weights2, weights3)
print(ic1, ic2, ic3)
#print(list(zip(["Age"], weights1)))
#print(model.stages[1].intercept)

# print error and result
print("Mean Squared Error: {0:2.2f}, {1:2.2f}, {2:2.2f}\n".format(result1, result2, result3))
predictionDF1.show()
predictionDF2.show()
predictionDF3.show()

# print the model
# print(model.stages)