# Data Frames

## Importing Data

### 1. Select file manually 

In [None]:
# Uncomment the next line to open the help page for read.csv():
# ?read.csv()

# Choose the CSV through the interactive file dialog and load it.
stats <- read.csv(file = file.choose())
stats   # display the imported data frame

In [None]:

# 2. Load the file by name from the working directory
getwd()   # show the current working directory
# NOTE: machine-specific path; point it at the folder that holds the CSV.
setwd("C:\\Users\\Rsinghx0112806\\Documents\\Development\\R Scripts")
getwd()   # confirm the change took effect

# Drop any previously loaded copy so the import below starts from scratch.
# A bare `stats` at this point would raise "object 'stats' not found" and
# stop the rest of the cell, so the removal is confirmed with exists().
rm(stats)
exists("stats", inherits = FALSE)   # FALSE once the object has been removed
stats <- read.csv("P2-Demographic-Data.csv")
stats

In [None]:


# Exploring Data
stats                 # print the whole data frame
nrow(stats)           # number of rows
ncol(stats)           # number of columns
head(stats)           # first rows; head() shows 6 rows by default
tail(stats, n = 10)   # last 10 rows; `n` overrides the default
?str()                # str stands for "structure", not "string"!
str(stats)            # structure of `stats`
                      # (likewise, runif() means "random uniform")
summary(stats)        # summary of `stats`




# ------- $ Sign -------

# `$` extracts a component by name from a data frame (or any list).
# It does not work on matrices.
# The examples below address columns by name and rows by position.

stats
head(stats)
stats[3, 4]                  # row 3, column 4, both by position
stats[3, "Internet.users"]   # row 3, column chosen by name
stats[, "Income.Group"]      # a whole column, chosen by name
stats$Income.Group           # the same column through `$`
stats$Internet.users[4]      # 4th element of the Internet.users column

# Since R 4.0, read.csv() keeps text columns as character vectors, and
# levels() returns NULL for a character vector. Wrapping the column in
# factor() lists the categories whether it was read as character or factor.
levels(factor(stats$Income.Group))



# Basic Operations

# Row subsetting
stats[3:10, ]       # rows 3 through 10
stats[c(2, 93), ]   # rows 2 and 93 only

# How `[` behaves on a data frame

# Selecting a single row: no need for drop = FALSE
stats[1, ]
is.data.frame(stats[1, ])

# Selecting a single column without drop = FALSE
stats[, 1]
is.data.frame(stats[, 1])

# Selecting a single column with drop = FALSE
stats[, 1, drop = FALSE]
is.data.frame(stats[, 1, drop = FALSE])


# Working with several columns at once
head(stats)
stats[["Birth.rate"]] * stats[["Internet.users"]]   # element-wise product

# Add a new column to the data frame
stats$MyCol <- stats[["Birth.rate"]] + stats[["Internet.users"]]
head(stats)

# A vector shorter than the number of rows is recycled to fill the column.
# The number of rows MUST be a whole multiple of the vector's length,
# otherwise the assignment fails with an error.
stats$xyz <- seq_len(5)
head(stats, n = 12)

# Remove columns by assigning NULL to them
stats[["MyCol"]] <- NULL
stats[["xyz"]] <- NULL
head(stats)



# Filtering Data Frame

head(stats)
# Logical vector: TRUE for rows whose Internet.users value is below 2.
filter <- stats$Internet.users < 2
stats[filter, ]   # only the rows where `filter` is TRUE are returned

# The condition can also be written directly inside the brackets.
stats[stats$Birth.rate > 40, ]
# `&` combines the two conditions element by element.
stats[stats$Birth.rate > 40 & stats$Internet.users < 2, ]
stats[stats$Income.Group == "High income", ]

# Since R 4.0, read.csv() keeps text columns as character vectors, and
# levels() returns NULL for a character vector. factor() makes the list of
# categories available whether the column is character or factor.
levels(factor(stats$Income.Group))

stats[stats$Country.Name == "Malta", ]



# Introduction to qplot()
?qplot   # help lookup before ggplot2 is attached
# install.packages("ggplot2")   # uncomment for a one-time install
library(ggplot2)
?qplot   # help lookup again, now that ggplot2 is attached


# One variable: the column passed through `$` ...
qplot(data = stats, x = stats$Internet.users)
# ... and by bare name, which works without the data frame name.
qplot(data = stats, x = Internet.users)

# Two variables; the last call passes size = 10 without I().
qplot(data = stats, x = Income.Group, y = Birth.rate)
qplot(data = stats, x = Income.Group, y = Birth.rate, size = 10)

# The same plot with values wrapped in I(), then as a boxplot.
qplot(data = stats, x = Income.Group, y = Birth.rate, size = I(10))
qplot(
  data = stats, x = Income.Group, y = Birth.rate,
  size = I(3), color = I("blue")
)
qplot(data = stats, x = Income.Group, y = Birth.rate, geom = "boxplot")



# Birth.rate against Internet.users, coloured by constant or by Income.Group.
qplot(data = stats, x = Birth.rate, y = Internet.users, color = Income.Group)
qplot(
  data = stats, x = Internet.users, y = Birth.rate,
  size = I(4), color = I("red")
)
qplot(
  data = stats, x = Internet.users, y = Birth.rate,
  size = I(4), color = Income.Group
)





# Creating Data Frames

# Countries_2012_Dataset, Codes_2012_Dataset and Regions_2012_Dataset are not
# created in this section; they must already exist in the session.
mydf <- data.frame(Countries_2012_Dataset, Codes_2012_Dataset, Regions_2012_Dataset)
head(mydf)

# Replace the column names with shorter ones.
colnames(mydf) <- c("Country.Name", "Country.Code", "Region")
colnames(mydf)


# Start over. A bare `mydf` after rm() would raise "object 'mydf' not found"
# and stop the rest of the cell, so the removal is confirmed with exists().
rm(mydf)
exists("mydf", inherits = FALSE)   # FALSE once the object has been removed

# Columns can be named while the data frame is built; the same works for
# the cbind() and rbind() functions.
mydf <- data.frame(Country.Name = Countries_2012_Dataset, 
                   Country.Code = Codes_2012_Dataset, 
                   Region = Regions_2012_Dataset)
head(mydf)
tail(mydf)
summary(mydf)



# Merging Data Frames

# Join the two data frames on their Country.Code columns.
merged <- merge(stats, mydf, by.x = "Country.Code", by.y = "Country.Code")
head(merged)

# Both inputs have a Country.Name column, so merge() keeps both copies with
# its default suffixes (.x from `stats`, .y from `mydf`). Drop the second.
merged$Country.Name.y <- NULL
tail(merged)

# Rename the remaining copy by name rather than by position, so the rename
# cannot hit the wrong column if the column order ever differs.
colnames(merged)[colnames(merged) == "Country.Name.x"] <- "Country.Name"
str(merged)


# Birth.rate on x and Internet.users on y, coloured by Region
qplot(
  data = merged, x = Birth.rate, y = Internet.users,
  size = I(4), color = Region
)

# Shapes: axes swapped, with the point shape set through shape = I(15)
qplot(
  data = merged, y = Birth.rate, x = Internet.users,
  size = I(3), shape = I(15), color = Region
)

# Transparency: set through alpha = I(0.5)
qplot(
  data = merged, y = Birth.rate, x = Internet.users,
  size = I(3), shape = I(19), alpha = I(0.5), color = Region
)

# Title: set through `main`
qplot(
  data = merged, y = Birth.rate, x = Internet.users,
  color = Region, size = I(3),
  shape = I(19), alpha = I(0.5),
  main = "Internet Users Vs Birth Rate"
)

