In [1]:
library("dplyr")


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



In [2]:
# Directory in which the data files are stored
fpath <- "specdata/"

In [3]:
## Read the data of one monitor file into a data frame
df <- read.csv("specdata/002.csv")
df$ID <- NULL ## Remove the ID column. The file name (002) indicates the ID as well
head(df)

Unnamed: 0,Date,sulfate,nitrate
1,2001-01-01,,
2,2001-01-02,,
3,2001-01-03,,
4,2001-01-04,,
5,2001-01-05,,
6,2001-01-06,,


In [4]:
# Size of the df
dim(df)

In [5]:
# A describe of the df
summary(df)

         Date         sulfate          nitrate     
 2001-01-01:   1   Min.   : 0.000   Min.   :0.000  
 2001-01-02:   1   1st Qu.: 2.380   1st Qu.:0.465  
 2001-01-03:   1   Median : 3.700   Median :0.696  
 2001-01-04:   1   Mean   : 4.461   Mean   :0.945  
 2001-01-05:   1   3rd Qu.: 5.510   3rd Qu.:1.120  
 2001-01-06:   1   Max.   :27.900   Max.   :6.440  
 (Other)   :3646   NA's   :2611     NA's   :2601   

In [6]:
## Calculate the number of missing values in each column.
apply(is.na(df), 2, sum)

In [7]:
# Parse the first two entries of the Date column and take their difference
x1 <- as.Date(df[["Date"]][1])
x2 <- as.Date(df[["Date"]][2])

x2 - x1

Time difference of 1 days

In [8]:
# Row slice
df["Date"][1:2, 1]

In [9]:
## Concatenate folders
file.path("hei", "pa")

In [10]:
# Two small integer data frames with the same column names
df1 <- data.frame(a = seq_len(3), b = seq_len(3) + 1L)
df2 <- data.frame(a = c(4L, 5L), b = seq_len(2))

print(df1)
print(df2)

  a b
1 1 2
2 2 3
3 3 4
  a b
1 4 1
2 5 2


In [11]:
colmean <- function(col) {
    # Arithmetic mean of `col`; missing values are dropped before averaging
    mean(col, na.rm = TRUE)
}

dftest <- data.frame(a=1:4, b=0:3)
vtest <- c(1, 2, 3, 4)


In [12]:
## List all csv files in the data folder. The dot is escaped so that only
## names ending in a literal ".csv" are matched.
csvfiles <- list.files(path = "specdata/", pattern = "\\.csv$")
## Only the first two files are sampled; head() copes with fewer than two
sample_files <- head(csvfiles, 2)
## Store each file's sulfate mean in this (preallocated) vector
means <- numeric(length(sample_files))
for (k in seq_along(sample_files)) {
    # Read file to dataframe
    df <- read.csv(file.path("specdata", sample_files[k]))
    # Mean of the sulfate column, ignoring missing values
    means[k] <- mean(df[["sulfate"]], na.rm = TRUE)
}
print(means)
print(mean(means))

[1] 3.880701 4.460811
[1] 4.170756


### Part 1

In [13]:
pollutantmean <- function(directory, pollutant, id = 1:332, sep = ",") {
    ## Mean of one pollutant, pooled over all selected monitor files.
    ##
    ## 'directory' is a character vector of length 1 indicating the
    ## location of the CSV files.
    ##
    ## 'pollutant' is a character vector of length 1 indicating the name
    ## of the column for which the mean is calculated.
    ##
    ## 'id' is a numeric vector specifying which CSV files should be
    ## parsed, as positions in the sorted listing of CSV files found in
    ## 'directory'.
    ##
    ## 'sep' is the field separator passed on to read.csv().
    ##
    ## Returns a single number: the sum of all non-missing values divided
    ## by their count, taken over every selected file (NaN when there are
    ## no non-missing values). Stops with an error when 'id' selects a
    ## file that does not exist or when a file lacks the 'pollutant'
    ## column.

    ## The dot is escaped so that only names ending in a literal ".csv"
    ## match; an unescaped dot would also accept names such as "000csv"
    ## and shift every position in 'id'.
    csvfiles <- list.files(path = directory, pattern = "\\.csv$")

    ## Out-of-range positions index to NA; fail early with a clear message
    selected <- csvfiles[id]
    if (anyNA(selected)) {
        stop("'id' selects files that do not exist in '", directory, "'",
             call. = FALSE)
    }

    numpoints <- 0
    totalsum <- 0

    ## Accumulate sum and count per file so the result is the mean over
    ## all observations, not a mean of per-file means
    for (csvfile in selected) {
        data <- read.csv(file.path(directory, csvfile), sep = sep)

        if (!pollutant %in% names(data)) {
            stop("column '", pollutant, "' not found in '", csvfile, "'",
                 call. = FALSE)
        }

        ## Keep only the observed (non-missing) values
        pollutant_data <- data[[pollutant]]
        pollutant_data <- pollutant_data[!is.na(pollutant_data)]

        totalsum <- totalsum + sum(pollutant_data)
        numpoints <- numpoints + length(pollutant_data)
    }

    totalsum / numpoints
}
    
message("Dumping pollutantmean() funtion to file")
dump("pollutantmean", file.path("dumps", "pollutantmean.R"))

Dumping pollutantmean() funtion to file


### Testing the functions
Using my own generated test data

In [14]:
pollutantmean("test_specdata", pollutant = "sulfate", id = 1:2, sep = ";") # Correct answer is 5.088

### Running on the correct data

In [15]:
print(pollutantmean("specdata/", pollutant = "nitrate", 23))

[1] 1.280833


### Assignment questions

In [16]:
pollutantmean("specdata", "sulfate", 1:10)

In [17]:
pollutantmean("specdata", "nitrate", 70:72)

In [18]:
pollutantmean("specdata", "sulfate", 34)

In [19]:
pollutantmean("specdata", "nitrate")

### Part 2

In [20]:
complete <- function(directory, id = 1:332, sep = ",") {
    ## Count the complete observations in each selected monitor file.
    ##
    ## 'directory' is the folder where the csv files are stored.
    ##
    ## 'id' is an integer vector used to choose which csv files should be
    ## read, as positions in the sorted listing of CSV files found in
    ## 'directory'.
    ##
    ## 'sep' is the field separator passed on to read.csv().
    ##
    ## Returns a data frame with one row per element of 'id' (in the
    ## order given) and two columns: 'id' and 'nobs', the number of rows
    ## of that file without a missing value in any column. Stops with an
    ## error when 'id' selects a file that does not exist.

    ## The dot is escaped so that only names ending in a literal ".csv"
    ## match
    csvfiles <- list.files(directory, pattern = "\\.csv$")

    ## Nothing requested: return the empty, correctly typed result
    if (length(id) == 0) {
        return(data.frame(id = integer(), nobs = integer()))
    }

    ## Out-of-range positions index to NA; fail early with a clear message
    if (anyNA(csvfiles[id])) {
        stop("'id' selects files that do not exist in '", directory, "'",
             call. = FALSE)
    }

    ## One count per requested file, collected in a single pass instead
    ## of growing a data frame row by row
    nobs <- vapply(id, function(i) {
        df <- read.csv(file = file.path(directory, csvfiles[i]), sep = sep)
        sum(complete.cases(df))
    }, integer(1))

    data.frame(id = unname(id), nobs = nobs)
}
message("Dumping complete() function to .R file")
dump("complete", file = "dumps/complete.R")

Dumping complete() function to .R file


### Testing

In [21]:
complete(directory = "test_specdata", id = 1:2, sep = ";")

Unnamed: 0,id,nobs
1,1,2
2,2,3


In [22]:
complete(directory = "specdata/", id = c(2, 4, 8, 10, 12))

Unnamed: 0,id,nobs
1,2,1041
2,4,474
3,8,192
4,10,148
5,12,96


### Assignment questions

In [23]:
cc <- complete("specdata", c(6, 10, 20, 34, 100, 200, 310))
print(cc$nobs)

[1] 228 148 124 165 104 460 232


In [24]:
cc <- complete("specdata", 54)
print(cc$nobs)

[1] 219


In [25]:
set.seed(42)
cc <- complete("specdata", 332:1)
use <- sample(332, 10)
print(cc[use, "nobs"])

 [1] 711 135  74 445 178  73  49   0 687 237


### Part 3

In [26]:
corr <- function(directory, threshold = 0, sep = ",") {
    ## Correlation between sulfate and nitrate for each monitor file.
    ##
    ## 'directory' is the folder containing the csv files.
    ##
    ## 'threshold' is a numeric vector of length 1 indicating the number
    ## of complete cases a file must at least have in order for its
    ## correlation to be computed.
    ##
    ## 'sep' is the field separator passed on to read.csv().
    ##
    ## Returns a numeric vector with one correlation per qualifying file,
    ## in the order the files are listed; a numeric vector of length 0
    ## when no file qualifies.

    ## Full path to every csv file. The pattern keeps other files that
    ## may live in the folder from being parsed as data.
    csvfiles <- list.files(path = directory, pattern = "\\.csv$",
                           full.names = TRUE)

    per_file <- lapply(csvfiles, function(csvfile) {
        # Read the file
        df <- read.csv(csvfile, sep = sep)
        # Remove all rows containing incomplete values (NA or NaN)
        df <- df[complete.cases(df), ]

        if (nrow(df) >= threshold) {
            cor(x = df[["sulfate"]], y = df[["nitrate"]])
        } else {
            # Files below the threshold contribute nothing
            NULL
        }
    })

    ## unlist() drops the NULL entries; as.numeric() turns an empty
    ## result into numeric(0) instead of NULL
    as.numeric(unlist(per_file))
}

### Testing

In [27]:
cr <- corr("specdata", threshold = 150)
head(cr)

In [28]:
summary(cr)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.21060 -0.05147  0.09333  0.12400  0.26840  0.76310 

In [29]:
cr <- corr("specdata", threshold = 400)
head(cr)

In [30]:
summary(cr)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.17620 -0.03109  0.10020  0.13970  0.26850  0.76310 

### Assignment questions

In [31]:
cr <- corr("specdata")                
cr <- sort(cr)                
set.seed(868)                
out <- round(cr[sample(length(cr), 5)], 4)
print(out)

[1]  0.2688  0.1127 -0.0085  0.4586  0.0447


In [32]:
cr <- corr("specdata", 129)                
cr <- sort(cr)                
n <- length(cr)                
set.seed(197)                
out <- c(n, round(cr[sample(n, 5)], 4))
print(out)

[1] 243.0000   0.2540   0.0504  -0.1462  -0.1680   0.5969


In [33]:
cr <- corr("specdata", 2000)                
n <- length(cr)                
cr <- corr("specdata", 1000)                
cr <- sort(cr)
print(c(n, round(cr, 4)))

[1]  0.0000 -0.0190  0.0419  0.1901
