In [None]:
##########################################################################################
### Purpose: To demonstrate the different ways of viewing data
###             and data summaries in R
##########################################################################################

### 1.0.1 - load the packages
library(dplyr)
library(data.table)

### 1.0.2 - initialize file paths for working directories 

### 1.0.4 - import the data
# Download the zipped workshop data set, extract the single CSV it contains
# into the working directory, and read it into `ctr_data` with fread().
temp <- tempfile(fileext = ".zip")
# Request a binary transfer explicitly rather than relying on R inferring it
# from the ".zip" URL suffix; a text-mode transfer would corrupt the archive
# on Windows.
download.file(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_6490/datasets/ILTCI%20PM%20workshop%20CTR%20data.zip",
  destfile = temp,
  mode = "wb"
)
# unzip() returns the paths it extracted; an empty result means the expected
# member was not found in the archive.
csv_path <- unzip(
  temp,
  files = "ILTCI PM workshop CTR data.csv",
  overwrite = TRUE
)
if (length(csv_path) != 1L) {
  stop(
    "Could not extract 'ILTCI PM workshop CTR data.csv' from the download.",
    call. = FALSE
  )
}
ctr_data <- fread(csv_path, sep = ",", header = TRUE)
# The downloaded archive is no longer needed once the CSV has been read.
unlink(temp)


### 1.0.5 - view the data to see what we're working with
# Each call below is a top-level expression, so its result is displayed
# directly when the script (or notebook cell) runs.
# Column names of the imported table.
names(ctr_data)
# Compact structure listing: one line per column with its type and the
# first few values.
str(ctr_data)
# dplyr's transposed preview of the columns (similar in spirit to str()).
glimpse(ctr_data)
# Class vector of the object that fread() returned.
class(ctr_data)
# First 10 rows of the data.
head(ctr_data, 10)

- View the data types and column names


In [None]:
# 1.0.6 - view basic data summaries to see if there are any problems with the data
# Per-column summary of the whole table.
summary(ctr_data)
# One-way frequency counts for individual columns.
table(ctr_data$Cov_Type_Bucket)
table(ctr_data$GroupIndicator)
table(ctr_data$Gender)
# Two-way cross-tabulation: Cov_Type_Bucket (rows) by GroupIndicator (columns).
table(ctr_data$Cov_Type_Bucket, ctr_data$GroupIndicator)
# Three-way cross-tabulation: the two-way table above, repeated for each
# value of Gender.
table(ctr_data$Cov_Type_Bucket, ctr_data$GroupIndicator, ctr_data$Gender)
# One-way frequency counts for Diagnosis_Category.
table(ctr_data$Diagnosis_Category)

# 1.0.7 - dig into one of the fields, the tax-qualified status; status of unknown is frequent
# Count each TQ_Status value, then display the counts as shares of the total.
tq_table <- table(ctr_data$TQ_Status)
prop.table(tq_table)

# 1.0.8 - graph some of the fields for some basic data integrity and sanity checks

# Boxplots of two columns (ClaimDuration, Exposure) to look at their spread.
boxplot(
  ctr_data$ClaimDuration,
  main = "Boxplot of Data by Claim Duration",
  ylab = "Claim Duration"
)
boxplot(
  ctr_data$Exposure,
  main = "Boxplot of Data by Exposure",
  ylab = "Exposure"
)

# Bar charts of record counts per ClaimType value and per TQ_Status value.
situs_counts <- table(ctr_data$ClaimType)
barplot(
  situs_counts,
  main = "Bar Chart of Situs Counts",
  ylab = "Counts"
)
tq_counts <- table(ctr_data$TQ_Status)
barplot(
  tq_counts,
  main = "Bar Chart of TQ Status Counts",
  ylab = "Counts"
)


In [None]:
# 1.0.9 - perform a basic summary of hazard rates, and view the results for reasonability
# For every ClaimDuration value, total the exposure and the terminations and
# take their ratio (terminations per unit of exposure) as the term rate.
term_stats <- ctr_data %>%
  select(ClaimDuration, Exposure, Terminations) %>%
  group_by(ClaimDuration) %>%
  summarise(
    SumExposure = sum(Exposure),
    SumTerminations = sum(Terminations),
    term_rates = SumTerminations / SumExposure
  )

# Open the summary in the data viewer for a visual reasonability check.
View(term_stats)

# 1.0.10 - plot the hazard rate summary, and view it for reasonability
# Work on (at most) the first 120 rows of term_stats. with() scopes the column
# names to this one expression, so nothing is attach()ed to the search path and
# no same-named objects can be masked. head() also avoids the NA padding that
# `[1:120]` produces when the summary has fewer than 120 rows.
with(head(term_stats, 120), {
  plot(
    x = ClaimDuration,
    y = term_rates,
    xlab = "Claim Duration",
    ylab = "Term Rate",
    type = "l",
    main = "Hazard rates by claim duration",
    col = "blue"
  )

  # 1.0.11 - add in a smoothed line to remove some of the noise
  # Pass the durations as x explicitly so the smoothed curve lines up with the
  # raw curve even when ClaimDuration does not run 1, 2, 3, ...
  lines(
    x = ClaimDuration,
    y = as.numeric(smooth(term_rates)),
    col = "red"
  )
})

# 1.0.12 - credibility view - do you have enough termination data to perform an analysis?
# Total the terminations in every Gender x ClaimType x IncurredAgeBucket cell.
cred_stats <- ctr_data %>%
  select(Gender, ClaimType, IncurredAgeBucket, Terminations) %>%
  group_by(Gender, ClaimType, IncurredAgeBucket) %>%
  summarise(SumTerminations = sum(Terminations)) %>%
  # summarise() only peels off the last grouping variable; drop the remaining
  # Gender/ClaimType grouping so later steps operate on a plain table.
  ungroup()

# 1.0.13 - view the results for a judgment call on credibility
View(cred_stats)
