# Manipulating data for analysis

# Review: DPLYR

## R.1 Importing data

In [1]:
# Load the built-in mtcars data set, round-trip it through a CSV file in the
# current working directory, and read it back as the working data frame.
data("mtcars")
write.csv(mtcars, file = "cars.csv") # row names are written as an unnamed first column
df <- read.csv(file = "cars.csv") # that first column comes back named "X"

## R.2. DPLYR

### R.2.1 Import dplyr

In [2]:
# install.packages("dplyr") # one-time setup; already installed here
library(dplyr) # attach dplyr
df <- as_tibble(df) # convert the plain data frame to a tibble


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



## R.2.2 Review important functions:
- select
- filter
- arrange
- mutate
- summarise
- group_by

### 2.2.1 Select

In [3]:
# Keep only the cyl, disp, hp and wt columns, then preview the first two rows.
df %>%
  select(cyl, disp, hp, wt) %>%
  head(n = 2)

cyl,disp,hp,wt
6,160,110,2.62
6,160,110,2.875


### 2.2.2 Filter 

In [4]:
# Keep rows whose mpg lies strictly between 22 and 35; show the first two.
# Comma-separated conditions in filter() are combined with a logical AND.
df %>%
  filter(mpg > 22, mpg < 35) %>%
  head(n = 2)

X,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2


### 2.2.3 Arrange

In [5]:
# Sort rows by mpg in ascending order and show the two lowest.
df %>%
  arrange(mpg) %>%
  head(n = 2)

X,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
Lincoln Continental,10.4,8,460,215,3.0,5.424,17.82,0,0,3,4


### 2.2.4 Mutate 

In [6]:
# Add mpg_10 (mpg divided by 10) as a new column and preview two rows.
df %>%
  mutate(mpg_10 = mpg / 10) %>%
  head(n = 2)

X,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,mpg_10
Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4,2.1
Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4,2.1


### 2.2.5 Summarise

In [7]:
# Collapse the whole table into a single row of descriptive statistics for mpg.
df %>%
  summarise(
    mean_mpg = mean(mpg),
    median_mpg = median(mpg),
    sum_mpg = sum(mpg),
    count_mpg = n(), # number of rows
    first_obs = first(mpg),
    last_obs = last(mpg),
    variance = var(mpg),
    sd = sd(mpg)
  )

mean_mpg,median_mpg,sum_mpg,count_mpg,first_obs,last_obs,variance,sd
20.09062,19.2,642.9,32,21,21.4,36.3241,6.026948


### 2.2.6 Groupby with summarise

In [8]:
# Mean mpg within each gear group: one output row per distinct gear value.
df %>%
  group_by(gear) %>%
  summarise(mpg_by_gear = mean(mpg))

gear,mpg_by_gear
3,16.10667
4,24.53333
5,21.38


# 1. Prep data for analysis

In [9]:
# Preview the first five rows of the data.
head(df, n = 5)

X,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2


## 1.1 Basic overview of data

### 1.1.1 Show kind of data

In [10]:
# Print the structure of df: its classes, dimensions, and each column's type
# with its first few values.
str(df)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	32 obs. of  12 variables:
 $ X   : Factor w/ 32 levels "AMC Javelin",..: 18 19 5 13 14 31 7 21 20 22 ...
 $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
 $ cyl : int  6 6 4 6 8 6 8 4 4 6 ...
 $ disp: num  160 160 108 258 360 ...
 $ hp  : int  110 110 93 110 175 105 245 62 95 123 ...
 $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
 $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
 $ qsec: num  16.5 17 18.6 19.4 17 ...
 $ vs  : int  0 0 1 1 0 1 0 1 1 1 ...
 $ am  : int  1 1 1 0 0 0 0 0 0 0 ...
 $ gear: int  4 4 4 3 3 3 3 4 4 4 ...
 $ carb: int  4 4 1 1 2 1 4 2 2 4 ...


### 1.1.2 Summary statistics

In [11]:
# Summary statistics for the first four columns only (X, mpg, cyl, disp).
summary(df[, 1:4])

                  X           mpg             cyl             disp      
 AMC Javelin       : 1   Min.   :10.40   Min.   :4.000   Min.   : 71.1  
 Cadillac Fleetwood: 1   1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8  
 Camaro Z28        : 1   Median :19.20   Median :6.000   Median :196.3  
 Chrysler Imperial : 1   Mean   :20.09   Mean   :6.188   Mean   :230.7  
 Datsun 710        : 1   3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0  
 Dodge Challenger  : 1   Max.   :33.90   Max.   :8.000   Max.   :472.0  
 (Other)           :26                                                  

# 2. Dummy variable

## 2.1 Create heavy dummy variable

In [12]:
# Inspect the distribution of the weight column before choosing a cut-off
# for the dummy variable.
summary(df[["wt"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.513   2.581   3.325   3.217   3.610   5.424 

## 2.2 Create heavy variable =1 if weight over 4

In [13]:
# Flag cars with wt strictly above 4: heavy_dummy is 1 for them, 0 otherwise.
df <- df %>%
  mutate(heavy_dummy = as.numeric(wt > 4))

## 2.3 Question: How do we check whether this operation worked?
- Hint: use select and filter

# 3. Mapping values

In [14]:
# Map the 0/1 dummy to readable labels and show it next to the original column.
# The result is only printed; df itself is not modified here.
df %>%
  mutate(
    heavy_dummy_string = recode(heavy_dummy, "0" = "Light", "1" = "Heavy")
  ) %>%
  select(heavy_dummy, heavy_dummy_string) %>%
  head(n = 2)

heavy_dummy,heavy_dummy_string
0,Light
0,Light


# 4. Omit rows with NA values

In [15]:
# Drop every row that contains at least one missing value.
df <- na.omit(df)

# 5. Create cut off variables
- disp var

## 5.1 Summary stat for displacement (disp) variable

In [16]:
# Check the range of disp so the bin break points can cover every value.
summary(df[["disp"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   71.1   120.8   196.3   230.7   326.0   472.0 

## 5.2 Create cutoffs

In [17]:
# Break points for binning disp into 100-wide intervals.
# disp ranges from 71.1 to 472.0, so the breaks must run up to 550: stopping
# at 450 would leave values above 450 outside every interval, and cut() would
# return NA for them.
cutoffs <- seq(50, 550, by = 100)
cutoffs

## 5.3 Mutate cutoff variables

In [18]:
# Bin disp into the intervals defined by cutoffs and store the resulting factor
# as cut_variable. The column is referenced by its bare name so that mutate()
# resolves it in the data being piped in, rather than reaching back to the
# global df (which breaks as soon as the pipeline filters or groups rows).
# include.lowest = TRUE closes the first interval on the left, so a value equal
# to the smallest break is still assigned a bin.
df <- df %>%
  mutate(cut_variable = cut(disp, breaks = cutoffs, include.lowest = TRUE))

# Preview the original values next to their assigned bins.
df %>%
  select(disp, cut_variable) %>%
  head(n = 2)

disp,cut_variable
160,"(150,250]"
160,"(150,250]"
