# R for Data science - tidyverse package
import -> tidy -> transform -> visualise -> model -> communicate

R for Data Science -> Hadley Wickham book

## Importing Data

In [None]:
## Importing and summarising using tidyverse 
NOTES:
- [] - used to extract data from data frames, matrices, etc.
- [row, column]
- summary()

## Data manipulation - dplyr package 
- select columns from a dataframe and return another data frame. 
- use with pipe operator %>% -> creates a pipeline 
- %>% means = "and then"

In [None]:
# Data manipulation - dplyr package
# The built-in iris data frame has the columns Sepal.Length, Sepal.Width,
# Petal.Length, Petal.Width and Species. Column names are case-sensitive.

# select() keeps only the named columns and returns a new data frame
iris.sepal <- iris %>% select(c(Sepal.Length, Sepal.Width))

# a leading minus removes the named columns from the data frame instead
iris.sepal <- iris %>% select(-c(Sepal.Length, Sepal.Width))

# mutate() - create new columns based on existing columns
iris <- iris %>% mutate(Sepal.Ratio = Sepal.Length / Sepal.Width)

# filter() - keep only the rows that satisfy the conditions
iris.filtered <- iris %>% filter(Species == "setosa" & Sepal.Length > 5)

# group_by() + summarise() - group rows by a column, then compute one
# summary row per group
iris.grouped <- iris %>%
  group_by(Species) %>%
  summarise(
    mean.sepal.length = mean(Sepal.Length),
    sd.sepal.length = sd(Sepal.Length)
  )



# Data Visualisation


In [None]:
# scatter plot
# geom_point() draws scatter points; swap in geom_line() for a line plot,
# geom_bar() for a bar plot or geom_histogram() for a histogram
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point()

# scatter plot coloured by species (the iris grouping column is Species)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point()

# facet plot - creates one panel per level of a factor variable
ggplot(data = iris, aes(x = Sepal.Length)) +
  geom_histogram(binwidth = 0.5) +
  facet_wrap(~Species)



rprojroot library to set relative paths to project directory

# R tidyverse data wrangling

In [None]:
library(room)
library(tidyverse)

# read_csv() reads a csv file and loads the data
data <- read_csv("data/my_data.csv")

# supply column names if they are not present in the file
data <- read_csv("data/my_data.csv", col_names = c("col1", "col2", "col3"))

# view first few rows of data
head(data)

# get summary statistics of data
summary(data)

# rename columns (new_name = old_name)
data %>% rename(new_col1 = col1, new_col2 = col2)

# partial matching of column names
data %>% select(matches("col"))

# apply a function to mutate multiple columns in place
# across() takes a column selection, then the function to apply to each column;
# in a ~ lambda, .x stands for the current column
data %>% mutate(across(c(col1, col2), ~ .x * 1000))
# returns the data with col1 and col2 multiplied by 1000

# mutate all numeric columns
data %>% mutate(across(where(is.numeric), ~ .x * 1000))

# summarise multiple columns
# extra arguments such as na.rm go inside the lambda rather than through
# across()'s `...`, which is deprecated
data %>% summarise(across(c(col1, col2), ~ mean(.x, na.rm = TRUE)))

# pipeline example
data %>%
  filter(col1 > 10) %>%
  rename(new_col1 = col1) %>%
  mutate(across(where(is.numeric), ~ .x * 1000)) %>%
  summarise(across(c(new_col1, col2), ~ mean(.x, na.rm = TRUE)))

# saving a data frame to csv
write_csv(data, "data/my_data_modified.csv")

- `.` is a placeholder in tidyverse pipelines: whatever is coming down the pipe is inserted where the `.` is, rather than as the first argument of the function
- always ask: what is coming down the pipe first?

In [None]:
# Fit a linear model of Sepal.Length on Sepal.Width and keep its coefficients.
# lm() takes the formula first, so the piped data frame is routed to the
# `data` argument with the `.` placeholder.
# coefficients() returns c(intercept, slope) for this one-predictor model.
iris_regression_line <- iris %>%
  lm(Sepal.Length ~ Sepal.Width, data = .) %>%
  coefficients()

# Scatter plot with the fitted regression line drawn on top.
# + is used to add layers to ggplot
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point() +
  geom_abline(
    slope = iris_regression_line[2],
    intercept = iris_regression_line[1],
    colour = "blue",
    linetype = "dashed",
    size = 1
  ) +
  labs(
    title = "Sepal Length vs Sepal Width with Regression Line",
    x = "Sepal Width",
    y = "Sepal Length"
  ) +
  theme_minimal()

## Markdown documents
- The first argument of a chunk must be the language (e.g. r), followed by a space
- The second argument is a unique name for the chunk (e.g. "Setup")
- Include -> whether the chunk will be included in the final document
- You can run R code within a chunk; execute it with the play button
- Try running the previous chunks first, as R works sequentially
- Knit