In [1]:
# Temp for local development
import ssl
import os
# os.environ['PYSPARK_PYTHON'] = '/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6'
import tinydb as tinydb
# WARNING: swaps ssl's default HTTPS context factory for the unverified one, so
# code that relies on that default no longer validates server certificates.
# Intended for local development only (see the note at the top of this cell).
ssl._create_default_https_context = ssl._create_unverified_context

In [7]:
%%bash
# The Search for Categorical Correlation
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://github.com/shakedzy/dython
# http://shakedzy.xyz/dython/
# Install dython only when `pip3 list` does not already mention it.
# grep echoes the matching line, which is why the package/version shows up in the cell output.
if ! pip3 list | grep dython; then
    pip3 install dython
fi

dython              0.6.1


In [8]:
%%bash
# Install tinydb only when `pip3 list` does not already mention it.
# grep echoes the matching line, which is why the package/version shows up in the cell output.
if ! pip3 list | grep tinydb; then
    pip3 install tinydb
fi

tinydb              4.2.0


In [None]:
import numpy as np
import pandas as pd
import seaborn as sn

from pyspark import SparkContext, SparkFiles, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, BooleanType

from pyspark.sql import functions as F
from pyspark.sql import types as T

from itertools import chain
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
# https://github.com/shakedzy/dython
from dython.nominal import associations

In [None]:
# Shared Spark handles used by the helper functions in the cells below:
#   sc    - the SparkContext (import_by_url uses it to register files)
#   spark - an SQLContext built on sc (used for reading CSV / parquet)
sc = SparkContext.getOrCreate()
spark = SQLContext(sc)

In [None]:
import os
import json
# Directory holding the cached parquet / JSON artifacts written by the helpers
# below. It defaults to the original local-development location; set the
# C772_DATA_DIR environment variable to relocate it without editing the notebook.
dataDir = os.environ.get(
    'C772_DATA_DIR',
    '/Users/duane.hinkley/PycharmProjects/c772-capstone-project/juniper/.data/',
)
# The helpers build paths by plain concatenation (dataDir + name + ext), so make
# sure the value always ends with exactly one path separator.
dataDir = os.path.join(dataDir, '')

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without the check-then-create race of a separate exists() test.
os.makedirs(dataDir, exist_ok=True)

def init_df_raw():
    """Download the assessment-items CSV and build the filtered working dataframe.

    Steps:
      1. Import the raw CSV by URL and persist it as 'dfRaw'.
      2. Keep only rows whose learner_attempt_status is 'fully scored'.
      3. Cast every interval (date) variable to a timestamp.
      4. Null out any date more than 30 months away from the current timestamp
         (this is how the 2999 / 1900 placeholder dates get removed).
      5. Persist the result as 'dfFlt'.

    Note: step 4 compares against F.current_timestamp(), so which values are
    nulled depends on when the function is run.

    Returns:
        The filtered Spark dataframe.
    """
    raw = import_by_url('https://github.com/dlhinkley/c772-capstone-project/raw/master/data/assessment_items.csv')

    # Cache the untouched import for later reuse
    save_df(raw, 'dfRaw')

    # Restrict to fully scored attempts
    scored = raw.filter(F.col('learner_attempt_status') == 'fully scored')

    date_fields = get_var_types()['intervalVars']

    # String -> timestamp for every date field
    for name in date_fields:
        scored = scored.withColumn(name, F.col(name).cast(T.TimestampType()))

    # Dates further than 30 months from "now" (past or future) become null
    for name in date_fields:
        months_from_now = F.abs(F.months_between(F.col(name), F.current_timestamp()))
        scored = scored.withColumn(
            name,
            F.when(months_from_now > 30, None).otherwise(F.col(name)),
        )

    # Cache the cleaned frame for later reuse
    save_df(scored, 'dfFlt')

    return scored


def init_df_desc():
    """Download the variable-descriptions CSV, persist it as 'dfDesc' and return it."""
    descriptions = import_by_url(
        'https://github.com/dlhinkley/c772-capstone-project/raw/master/data/descriptions.csv'
    )
    save_df(descriptions, 'dfDesc')
    return descriptions


def save_df(df, name):
    """Write `df` as a single-partition parquet dataset at dataDir + name + '.parquet',
    overwriting whatever is already there."""
    target = dataDir + name + ".parquet"
    single_part = df.repartition(1)
    single_part.write.mode('overwrite').parquet(target)

def save_dict(data, name):
    """Serialize `data` as JSON into dataDir + name + '.json'."""
    target = dataDir + name + ".json"
    with open(target, "w") as handle:
        json.dump(data, handle)

def load_dict(name):
    """Read dataDir + name + '.json' and return the decoded JSON content."""
    source = dataDir + name + ".json"
    with open(source) as handle:
        return json.load(handle)

def load_df(name):
    """Load the parquet dataset stored at dataDir + name + '.parquet' as a Spark dataframe."""
    source = dataDir + name + ".parquet"
    return spark.read.parquet(source)

In [None]:
def get_non_string_vars():
    """Return identifier, continuous, interval and binary variable names, concatenated in that order.

    NOTE(review): reads a module-level `group` object that is not defined in the
    visible cells -- confirm it exists before this is called.
    """
    combined = group.identifierVars
    combined = combined + group.continousVars
    combined = combined + group.intervalVars
    combined = combined + group.binaryVars
    return combined

def get_all_vars():
    """Return the nominal variable names followed by every non-string variable name."""
    nominal = group.nominalVars
    return nominal + get_non_string_vars()

In [None]:
from tinydb import TinyDB, Query
# Create Todo list
def init_todo():
    """Bind the module-level `td` to a TinyDB stored at dataDir + 'todo.json'."""
    global td
    todo_path = dataDir + 'todo.json'
    td = TinyDB(todo_path)

def add_todo(desc):
    """Store an unfinished todo with text `desc`, unless an entry with that text already exists."""
    already_listed = td.contains(Query().todo == desc)
    if already_listed:
        return
    td.insert({'todo': desc, 'finished': False})

def list_todo():
    """Print every entry in the todo store, one per line."""
    for entry in td:
        print(entry)

def finish_todo(desc):
    """Set 'finished' to True on every todo whose text equals `desc`."""
    matches_desc = Query().todo == desc
    td.update({'finished': True}, matches_desc)

def delete_todo(desc):
    """Remove every todo whose text equals `desc`."""
    matches_desc = Query().todo == desc
    td.remove(matches_desc)


# Open the todo store as soon as this cell runs so the helpers above find `td` set.
init_todo()

In [None]:
import os
from pyspark import SparkFiles
from datetime import datetime

def import_by_url(url):
    """Fetch the CSV file at `url` through Spark and return it as a dataframe.

    The URL is registered with the SparkContext, then read back from the local
    copy (looked up by the URL's base name) with a header row and schema
    inference enabled.
    """
    sc.addFile(url)
    local_copy = SparkFiles.get(os.path.basename(url))
    reader = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true')
    return reader.load("file://" + local_copy)


def filter_default(dfIn, f1, f2):
    """Drop the rows of `dfIn` where column `f1` or column `f2` holds a placeholder date.

    The placeholder dates are 2999-01-01 and 1900-01-01 (midnight).
    """
    defaultDates = ["2999-01-01 00:00:00", "1900-01-01 00:00:00"]
    kept = dfIn
    # A row survives only if it passes the check for both columns
    for column in (f1, f2):
        kept = kept.filter(~F.col(column).isin(defaultDates))
    return kept


def date_stats(dfIn, f1, f2):
    """Return a dataframe with the difference f1 - f2 expressed in days, hours and minutes.

    Rows holding a placeholder date in either column are removed first (see
    filter_default). The result has exactly three columns: "days", "hours" and
    "minues" (the minutes column; the spelling is kept so existing callers that
    select it keep working).
    """
    cleaned = filter_default(dfIn, f1, f2)

    # Difference in whole seconds between the two columns
    seconds_apart = F.col(f1).cast("long") - F.col(f2).cast("long")

    with_minutes = cleaned.withColumn("minues", seconds_apart / 60.).select(f1, f2, "minues")
    with_hours = with_minutes.withColumn("hours", seconds_apart / 3600.).select(f1, f2, "hours", "minues")
    with_days = with_hours.withColumn("days", seconds_apart / 86400.)

    return with_days.select("days", "hours", "minues")


def annotate_plot(ax):
    """Label every patch (bar) on `ax` with its height rounded to two decimals.

    Each label is anchored at the top centre of its patch and shifted 10 points
    down, drawn in bold white text.
    """
    label_style = dict(
        ha='center',
        va='center',
        color='white',
        fontweight='bold',
        xytext=(0, -10),
        textcoords='offset points',
    )
    for bar in ax.patches:
        height = bar.get_height()
        top_centre = (bar.get_x() + bar.get_width() / 2., height)
        ax.annotate(round(height, 2), top_centre, **label_style)


def date_boxplot(pdDf, title, ax = False):
    """Draw a boxplot of the columns of `pdDf` with the y axis labelled as dates.

    Args:
        pdDf: frame whose values are treated as epoch seconds (they are passed to
            datetime.fromtimestamp when the tick labels are built).
        title: title placed on the axes.
        ax: optional axes to draw on; when falsy the axes returned by
            pdDf.boxplot are used instead.

    The y axis gets 12 evenly spaced ticks between the overall minimum and
    maximum, each labelled as YYYY-MM-DD.
    """
    if not ax:
        ax = pdDf.boxplot(rot=270, figsize=[10, 10])
    else:
        pdDf.boxplot(rot=270, figsize=[10, 10], ax=ax)

    highest = pdDf.max().max()
    lowest = pdDf.min().min()
    tick_positions = np.linspace(start=lowest, stop=highest, num=12)

    tick_labels = []
    for stamp in tick_positions:
        tick_labels.append(datetime.fromtimestamp(stamp).strftime('%Y-%m-%d'))

    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels=tick_labels)
    ax.set_title(title)


def single_val(df):
    """Return the names of the columns of `df` that hold exactly one distinct value.

    Distinctness is judged by `unique()` on each column; names are returned in
    column order.
    """
    return [name for name in df.columns if df[name].unique().size == 1]

def display_sv_cols(df, cols):
    """Select `cols` from `df`, convert to pandas and return the first row transposed
    (one output row per selected column)."""
    selected = df.select(cols).toPandas()
    first_row = selected.head(1)
    return first_row.T

def id_to_name(df, idVar, newVar, newIdList):
    """Add column `newVar` to `df`, translating each value of `idVar` to a replacement.

    The distinct values of `idVar` are sorted, and the i-th value is mapped to
    newIdList[i]; `newIdList` therefore needs at least as many entries as there
    are distinct ids.

    Returns:
        `df` with the extra `newVar` column.
    """
    # Distinct original ids, in sorted order
    distinct_rows = df.select(idVar).distinct().orderBy(idVar).collect()

    # original id -> replacement, paired by position
    lookup = {row[idVar]: newIdList[pos] for pos, row in enumerate(distinct_rows)}

    # create_map expects a flat key, value, key, value, ... sequence of literals
    flat_pairs = [F.lit(part) for part in chain(*lookup.items())]
    translate = F.create_map(flat_pairs)

    return df.withColumn(newVar, translate[df[idVar]])


In [None]:
# Return elements in whitelist
def whitelist(l, whitelist):
    """Return the elements of `l` that also appear in `whitelist`, keeping `l`'s order.

    Args:
        l: list of values (the callers pass lists of field names).
        whitelist: collection of allowed values. When it is False, None or
            empty, no filtering is applied.

    Returns:
        `l` itself when no whitelist is given, otherwise a new list containing
        only the allowed elements.

    The previous implementation called the non-existent `np.isIn` on a plain
    list, so any non-empty whitelist raised instead of filtering.
    """
    # Explicit checks instead of `if whitelist:` so array-like inputs, whose
    # truth value is ambiguous, do not raise.
    if whitelist is None or whitelist is False or len(whitelist) == 0:
        return l

    allowed = set(whitelist)
    return [item for item in l if item in allowed]


# Return a dictionary of variable-name lists, one list per category.
# If dfColumns is provided, return only the names that are also in dfColumns.
def get_var_cats(dfColumns = False):
    """Group the field names listed in the saved 'dfDesc' table by their category.

    Args:
        dfColumns: optional collection of column names; it is handed to
            whitelist() together with each list of field names.

    Returns:
        dict mapping 'orgVars', 'sectionVars', 'learnerVars', 'assessmentVars',
        'assignmentVars', 'itemVars', 'assignmentAttemptVars' and
        'itemAttemptVars' to the list of field names in that category.
    """
    descriptions = load_df('dfDesc').toPandas()

    # result key -> value of the 'category' column it collects
    category_labels = {
        'orgVars': 'Organization',
        'sectionVars': 'Section',
        'learnerVars': 'Learner',
        'assessmentVars': 'Assessment',
        'assignmentVars': 'Assignment',
        'itemVars': 'Item',
        'assignmentAttemptVars': 'Assignment Attempt',
        'itemAttemptVars': 'Item Attempt',
    }

    grouped = dict()
    for key, label in category_labels.items():
        fields = descriptions.loc[descriptions['category'] == label].field.tolist()
        grouped[key] = whitelist(fields, dfColumns)

    return grouped

# Return a dictionary of variable-name lists, one list per variable type.
# If dfColumns is provided, return only the names that are also in dfColumns.
def get_var_types(dfColumns = False):
    """Group the field names listed in the saved 'dfDesc' table by their variable type.

    Args:
        dfColumns: optional collection of column names; it is handed to
            whitelist() together with each list of field names.

    Returns:
        dict mapping 'identifierVars', 'nominalVars', 'continousVars',
        'intervalVars' and 'binaryVars' to the list of field names of that type.
    """
    descriptions = load_df('dfDesc').toPandas()

    # result key -> value of the 'type' column it collects
    type_labels = {
        'identifierVars': 'Categorical Identifier',
        'nominalVars': 'Categorical Nominal',
        'continousVars': 'Numeric Continuous',
        'intervalVars': 'Categorical Interval',
        'binaryVars': 'Categorical Binary',
    }

    grouped = dict()
    for key, label in type_labels.items():
        fields = descriptions.loc[descriptions['type'] == label].field.tolist()
        grouped[key] = whitelist(fields, dfColumns)

    return grouped



# Given a dataframe of datetime fields, draw a heatmap of the rounded mean
# difference between every pair of columns (nothing is returned).
def date_diff_map(df, title, scale = 'D'):
    """Plot a heatmap of the mean difference between every pair of columns in `df`.

    Args:
        df: pandas dataframe whose columns are subtracted from one another.
        title: title placed on the heatmap.
        scale: unit code handed to np.timedelta64(1, scale); each difference is
            divided by that unit before averaging. Defaults to 'D'.

    Cell (row, col) shows round(mean(df[row] - df[col])) in the chosen unit; the
    diagonal stays 0. Nothing is returned.
    """
    names = df.columns
    count = len(names)
    means = pd.DataFrame(np.zeros(shape=(count, count)), columns=names, index=names)

    for row_name in names:
        for col_name in names:
            if row_name == col_name:
                continue
            gap = (df[row_name] - df[col_name]) / np.timedelta64(1, scale)
            means.at[row_name, col_name] = round(gap.mean())

    heat = sn.heatmap(means, annot=True, fmt=".0f")
    heat.set_title(title)