#### Data Exploration: Initialize Functions

In [2]:
#%sh
# The Search for Categorical Correlation
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://github.com/shakedzy/dython
# http://shakedzy.xyz/dython/
# DATABRICKS pip install dython

In [8]:
import numpy as np
import pandas as pd
import random

from pyspark import SparkContext, SparkFiles, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, BooleanType

from pyspark.sql import functions as F
from pyspark.sql import types as T

from itertools import chain
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
# https://github.com/shakedzy/dython
from dython.nominal import associations


In [9]:
# Reuse the active SparkContext when one exists, otherwise create one.
sc = SparkContext.getOrCreate()
# Despite its name, `spark` is a SQLContext wrapping `sc`, not a SparkSession.
# The helpers below rely on its `createDataFrame` and `read` members.
# NOTE(review): SQLContext is reported as deprecated in newer Spark releases in
# favour of SparkSession - confirm before upgrading the runtime.
spark = SQLContext(sc)


In [11]:
# Create Todo list
def init_todo():
  """Reset the module-level ``todoList`` to an empty Spark DataFrame.

  The frame has two nullable columns: ``todo`` (string) and ``finished``
  (boolean). Any todo list that existed before the call is replaced.
  """
  global todoList
  todoFields = [
    StructField('todo', StringType(), True),
    StructField('finished', BooleanType(), True),
  ]
  todoSchema = StructType(todoFields)
  # On Databricks the empty RDD is obtained via spark.sparkContext.emptyRDD()
  # instead of the notebook-level `sc` handle used here.
  noRows = sc.emptyRDD()
  todoList = spark.createDataFrame(noRows, todoSchema)

def add_todo(desc):
  """Append a new, unfinished item to the module-level ``todoList``.

  Parameters:
    desc -- text of the todo item; stored in the ``todo`` column.

  The row is created with ``finished`` set to False. ``todoList`` is rebound
  to the union of the previous list and the new row.
  """
  global todoList
  # Build the row with the list's own schema so column names and types match
  # todoList exactly, instead of relying on schema inference from one tuple
  # (which yields `_1`/`_2` columns that only line up by position).
  newRow = spark.createDataFrame([(desc, False)], todoList.schema)
  todoList = todoList.union(newRow)
  
def list_todo():
  """Show the current contents of the module-level ``todoList``.

  Rendering is delegated to the notebook's ``display`` helper.
  """
  # Only reads the module-level name, so no `global` declaration is needed.
  display(todoList)
  
def finish_todo(desc):
  """Mark todo items whose text equals ``desc`` as finished.

  Parameters:
    desc -- value compared against the ``todo`` column.

  Rows that do not match keep their existing ``finished`` value.
  ``todoList`` is rebound to the updated DataFrame.
  """
  global todoList
  isTarget = F.col("todo") == desc
  # Matching rows become True; all other rows carry their flag over unchanged.
  finishedFlag = F.when(isTarget, True).otherwise(F.col("finished"))
  todoList = todoList.withColumn("finished", finishedFlag)

# Create the (empty) global todo list so add_todo/list_todo/finish_todo can be used.
init_todo()


In [14]:
import os
from pyspark import SparkFiles

def import_by_url(url):
  """Fetch a CSV file by URL and load it as a Spark DataFrame.

  Parameters:
    url -- location of the CSV file; its final path component is used as the
           name under which Spark stores the downloaded copy.

  Returns the DataFrame read with a header row and inferred column types.
  """
  # Register the remote file with the SparkContext, then look up where the
  # local copy was placed.
  sc.addFile(url)
  localCopy = SparkFiles.get(os.path.basename(url))
  csvReader = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true')
  return csvReader.load("file://" + localCopy)


def filter_default(dfIn, f1, f2):
  """Drop rows whose ``f1`` or ``f2`` value is one of the placeholder dates.

  Parameters:
    dfIn -- Spark DataFrame to filter
    f1   -- name of the first date column
    f2   -- name of the second date column

  Returns a DataFrame keeping only rows where neither column equals
  "2999-01-01 00:00:00" or "1900-01-01 00:00:00".
  """
  defaultDates = ["2999-01-01 00:00:00", "1900-01-01 00:00:00"]
  firstIsReal = ~F.col(f1).isin(defaultDates)
  secondIsReal = ~F.col(f2).isin(defaultDates)
  return dfIn.filter(firstIsReal & secondIsReal)


def date_stats(dfIn, f1, f2):
  """Return the difference ``f1 - f2`` expressed in days, hours and minutes.

  Parameters:
    dfIn -- Spark DataFrame containing both date columns
    f1   -- name of the column subtracted from (the later date, if the
            result should be positive)
    f2   -- name of the column being subtracted

  Rows holding a placeholder date in either column are removed first (see
  filter_default). The result has exactly three columns, in this order:
  "days", "hours" and "minues".

  NOTE: "minues" (sic) is the historical spelling of the minutes column; it
  is kept so existing code that selects it by name keeps working.
  """
  dfOut = filter_default(dfIn, f1, f2)

  # Both columns are cast to long and subtracted once; the divisors below
  # treat that difference as a number of seconds.
  # NOTE(review): assumes f1/f2 are timestamp columns so the cast yields
  # epoch seconds - confirm against the input schema.
  seconds = F.col(f1).cast("long") - F.col(f2).cast("long")

  # A single projection replaces the earlier chain of withColumn/select
  # steps, which recomputed the same difference three times and carried f1
  # and f2 through intermediate frames (ambiguous when f1 == f2).
  return dfOut.select(
    (seconds / 86400.).alias("days"),
    (seconds / 3600.).alias("hours"),
    (seconds / 60.).alias("minues"),
  )


def annotate_plot(ax):
  """Write each bar's height (rounded to 2 decimals) as a label on the bar.

  Parameters:
    ax -- axes-like object exposing ``patches`` and ``annotate``; every patch
          must provide ``get_x``, ``get_width`` and ``get_height``.

  Labels are white, bold, centred on the bar and shifted 10 points below the
  bar's top edge. Nothing is returned.
  """
  labelStyle = dict(
    ha='center',
    va='center',
    color='white',
    fontweight='bold',
    xytext=(0, -10),
    textcoords='offset points',
  )
  for bar in ax.patches:
    barHeight = bar.get_height()
    # Anchor at the horizontal middle of the bar, at its top edge.
    anchor = (bar.get_x() + bar.get_width() / 2., barHeight)
    ax.annotate(round(barHeight, 2), anchor, **labelStyle)
      
      

def perfect_cor(df, groupCol):
    """ Given a dataframe and a column to group by, return the list of
        columns that hold exactly one distinct value inside every group,
        i.e. columns fully determined by the grouping column.

        Parameters:
          df       -- Spark dataframe to inspect
          groupCol -- name of the column to group by (a list/tuple of names
                      is also accepted and handed to groupBy unchanged)

        Returns:
          List of column names, in df.columns order. The grouping column
          itself is never reported. When df has no rows every non-grouping
          column is returned, because no group contradicts it.

        Nulls are counted as one value by mapping them to the text "Empty",
        so a real string "Empty" is not distinguished from null.
    """

    # Names of the grouping column(s). They are left out of the aggregation:
    # aggregating them under their own name would give the result two columns
    # with the same name, and looking that name up in a Row returns the group
    # key instead of the distinct count.
    groupKeys = groupCol if isinstance(groupCol, (list, tuple)) else [groupCol]
    groupNames = {key for key in groupKeys if isinstance(key, str)}
    valueCols = [c for c in df.columns if c not in groupNames]

    # agg() needs at least one expression.
    if not valueCols:
        return []

    def distinct_count(c):
        # Cast to string so every column type can share the "Empty" marker.
        asText = F.when(F.col(c).isNull(), "Empty").otherwise(F.col(c).cast("string"))
        return F.countDistinct(asText).alias(c)

    # One row per group, one distinct-value count per non-grouping column.
    dfCounts = df.groupBy(groupCol).agg(*[distinct_count(c) for c in valueCols])
    groupRows = dfCounts.collect()

    # Keep only the columns whose count is 1 in every group.
    return [c for c in valueCols if all(row[c] == 1 for row in groupRows)]

def id_to_name(df, idVar, newVar, newIdList):
  """Add a column that relabels each distinct id with an entry of ``newIdList``.

  Parameters:
    df        -- Spark DataFrame containing the id column
    idVar     -- name of the existing id column
    newVar    -- name of the column to add
    newIdList -- replacement labels; the i-th distinct id (in ascending
                 ``idVar`` order) is paired with ``newIdList[i]``

  Returns ``df`` with ``newVar`` appended. Raises IndexError when
  ``newIdList`` has fewer entries than there are distinct ids.
  """
  # Distinct ids are collected to the driver in sorted order so the pairing
  # with newIdList is positional.
  distinctIdRows = df.select(idVar).distinct().orderBy(idVar).collect()
  labelById = {row[idVar]: newIdList[pos] for pos, row in enumerate(distinctIdRows)}

  # create_map expects a flat key, value, key, value, ... sequence of literals.
  flatLiterals = [F.lit(item) for item in chain.from_iterable(labelById.items())]
  lookup = F.create_map(flatLiterals)

  # Look up each row's id in the map to obtain its label.
  return df.withColumn(newVar, lookup[df[idVar]])


#### Data Exploration: Initialize Data

In [None]:
# Load the assessment items CSV from the project's GitHub repository into a Spark DataFrame.
dfRaw = import_by_url('https://github.com/dlhinkley/c772-capstone-project/raw/master/data/assessment_items.csv')

In [None]:
# Load the descriptions CSV from the same repository into a Spark DataFrame.
# NOTE(review): presumably this describes the fields of assessment_items.csv - confirm.
dfDesc = import_by_url('https://github.com/dlhinkley/c772-capstone-project/raw/master/data/descriptions.csv')