## EDA for Ward x Complaint Type Time Series Analysis
Objectives: 
- Find out which Ward x Complaint Type is suitable for modeling using TSM
- Find obvious patterns in the data. In particular, determine which complaint types occur predominantly during which time periods.
- Construct data files to be used in TSM

In [None]:
library(forecast)
library(xts)
library(lubridate)
library(dplyr)

In [None]:
# Location of the raw complaints CSV export (absolute, machine-specific path).
RAW_PATH <- "/home/samarth/workspaces/datakind-workspace/cocUptoJuly2016.csv"

In [None]:
# Cell values that should be read as missing.
nas <- c(NA, "NA", "", " ", "NULL")
# Load the raw complaints file, keeping text columns as character vectors.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
df <- read.csv(RAW_PATH, stringsAsFactors = FALSE,
               na.strings = nas)


In [None]:
# Parse both date columns as month/day/year and give every complaint a unit
# weight so that counts can be obtained by summing NumComplaints.
df$Complaint.Date <- as.Date(df$Complaint.Date, format = "%m/%d/%Y")
df$Resolution.Date <- as.Date(df$Resolution.Date, format = "%m/%d/%Y")
df$NumComplaints <- 1

# Keep complaints filed from 2012-01-01 up to, but not including, 2016-01-01.
# which() also drops rows whose complaint date is missing or failed to parse;
# a bare logical subset would turn every NA comparison into an all-NA row,
# which later breaks xts() and the Ward / Complaint.Type comparisons.
# NOTE(review): this discards all of 2016 although the input file name
# suggests data up to July 2016 -- confirm the intended upper bound.
df <- df[which(df$Complaint.Date >= as.Date("01/01/2012", format = "%m/%d/%Y") &
               df$Complaint.Date < as.Date("01/01/2016", format = "%m/%d/%Y")), ]

In [None]:
# Preview the first six rows of the cleaned data.
head(df, n = 6L)

In [None]:
# Count complaints per ward, order the wards from most to fewest complaints,
# and plot the resulting distribution.
wardComplaints <- local({
    counts <- table(df$Ward)
    counts[order(-counts)]
})
plot(wardComplaints)

In [None]:
# Each ward's complaint count as a percentage of the number of rows in df.
wardComplaints / nrow(df) * 100

In [None]:
# Plot a monthly overview of the complaints recorded for one ward.
#
# Produces, in order: the ward's total complaints per month; a 3x2 grid with a
# monthly series for each of its most frequent complaint types (at most six);
# and the complaint counts per calendar month, which are also printed in
# descending order.
#
# Args:
#   df:   data frame with the columns Ward, Complaint.Type, Complaint.Date and
#         NumComplaints.
#   ward: value matched against df$Ward to select the ward.
#
# Returns:
#   Invisibly, the month abbreviations of the Month x Year count table (that
#   table is built but not displayed), or NULL when the ward has no dated rows.
plot.ward.details <- function(df, ward) {
    # which() drops rows where the comparison is NA; a bare logical subset
    # would keep them as all-NA rows. Rows without a date are excluded as well
    # because they cannot be placed on the time index.
    df <- df[which(df$Ward == ward & !is.na(df$Complaint.Date)), ]
    if (nrow(df) == 0) {
        warning("No dated complaints found for ward ", ward, call. = FALSE)
        return(invisible(NULL))
    }

    series <- xts(df$NumComplaints, df$Complaint.Date)
    series <- apply.monthly(series, FUN = sum)
    plot(series, main = "Overall Complaint Distribution")

    # Plot the most frequent complaint types: six at most, fewer when the ward
    # has fewer distinct types (a fixed 1:6 would index past the end).
    complaintCounts <- table(df$Complaint.Type)
    complaintCounts <- complaintCounts[order(-complaintCounts)]
    topTypes <- names(complaintCounts)[seq_len(min(6, length(complaintCounts)))]
    opar <- par(mfrow = c(3, 2))
    tryCatch(
        for (complaintType in topTypes) {
            sub <- df[which(df$Complaint.Type == complaintType), ]
            series <- xts(sub$NumComplaints, sub$Complaint.Date)
            series <- apply.monthly(series, FUN = sum)
            plot(series, main = paste0("Complaint Distribution for ", complaintType))
        },
        # Restore the layout before the calendar-month plot below, and also
        # when one of the plots above fails.
        finally = par(opar)
    )

    df$Month <- month(df$Complaint.Date)
    df$Year <- year(df$Complaint.Date)

    # Complaints per calendar month: plotted in month order, printed sorted
    # by descending count.
    monthly <- table(Month = df$Month)
    plot(monthly)
    monthly <- as.data.frame(monthly[order(-monthly)])
    print(monthly)

    yearly <- as.data.frame(table(Month = df$Month, Year = df$Year))
    # as.data.frame() stores Month as a factor. Indexing month.abb with the
    # factor would use its internal codes, which differ from the month numbers
    # whenever some month has no complaints; convert via the labels instead.
    yearly$Month <- month.abb[as.integer(as.character(yearly$Month))]
    invisible(yearly$Month)
}
# Show the overview plots for ward N188.
plot.ward.details(df, ward = "N188")

In [None]:
# Distinct wards and complaint types present in the filtered data.
wards <- unique(df[["Ward"]])
complaintTypes <- unique(df[["Complaint.Type"]])

In [None]:
# Every Ward x Complaint Type combination, one row per pair, with both columns
# kept as character. FALSE is spelled out because `F` can be reassigned.
wardXComplaint <- expand.grid(Ward = wards, complaintType = complaintTypes,
                              stringsAsFactors = FALSE)

In [None]:
# Build the monthly complaint-count series for one ward and complaint type.
#
# Args:
#   df:            data frame with the columns Ward, Complaint.Type,
#                  Complaint.Date and NumComplaints.
#   ward:          value matched against df$Ward.
#   complaintType: value matched against df$Complaint.Type.
#
# Returns:
#   The result of apply.monthly(..., FUN = sum) on the matching rows, or NULL
#   when no dated row matches (including when ward or complaintType is NA).
constructSeries <- function(df, ward, complaintType) {
    # which() keeps only rows where the whole condition is TRUE. With a bare
    # logical subset, NA in Ward, Complaint.Type or the arguments would yield
    # all-NA rows, bypass the empty check below and make xts() fail on an NA
    # index. Rows without a complaint date are excluded for the same reason.
    matching <- which(
        df$Ward == ward &
            df$Complaint.Type == complaintType &
            !is.na(df$Complaint.Date)
    )
    if (length(matching) == 0) {
        return(NULL)
    }
    df <- df[matching, ]
    series <- xts(df$NumComplaints, df$Complaint.Date)
    apply.monthly(series, FUN = sum)
}

In [None]:
# Build one monthly series per Ward x Complaint Type pair (NULL where
# constructSeries() finds no complaints for the pair).
# Map() walks the two columns directly and always returns a list. apply()
# would first coerce the data frame to a matrix, and would collapse the
# results into a plain matrix whenever all series happen to have the same
# length. It also avoids 1:nrow(), which misbehaves for zero rows.
series <- Map(
    function(ward, complaintType) constructSeries(df, ward, complaintType),
    wardXComplaint$Ward,
    wardXComplaint$complaintType
)
# Label the elements by row number of wardXComplaint, as apply() over rows did.
names(series) <- rownames(wardXComplaint)

In [None]:
# Inspect the first six constructed series.
head(series, n = 6L)