# Project Introduction

Include Introduction Prompt

Introduce Datasets

# List of functions

**From Module 1**

```R
# Select desired columns from a dataframe into a new dataframe. The numbers are
# the columns selected; columns can be selected individually or as a range using
# a colon symbol (:). new_dataframe can be replaced with whatever you would like
# to name the dataframe.
library(dplyr)
new_dataframe <- select(original_dataframe, 43, 56:123)

# Insert a new column in an existing dataframe with values determined by another
# column present in the dataframe. Useful when changing a single categorical
# variable into multiple yes or no categories. For example, if the variable is
# state then the new column could show if a person lives in one state
# specifically. Yes is represented by a 1 and no is represented by a 0.
library(dplyr)
dataframe <- mutate(
  dataframe,
  new_column_name = ifelse(name_of_reference_column == "value_in_reference_column", "1", "0")
)

# Delete a column from a dataframe. The number with the - sign in front of it is
# the column that is going to be removed when the code is executed.
dataframe <- select(dataframe, -17)

# Change data type of a column to factor data type
dataframe$column_name <- as.factor(dataframe$column_name)

# Change data type of a column to numeric data type
dataframe$column_name <- as.numeric(dataframe$column_name)

# Check all variable data types for a dataframe
str(dataframe)

# Replace null (NA) cells in a column with the column mean
dataframe <- transform(
  dataframe,
  column_name = ifelse(is.na(column_name), mean(column_name, na.rm = TRUE), column_name)
)

# Replace null (NA) cells in a column with a 0. If the 0 is replaced by a 1 then
# the nulls will be replaced with a 1.
dataframe$column_name[is.na(dataframe$column_name)] <- 0

# Random Forest imputation. maxiter is the number of iterations performed, ntree
# is the number of decision trees created. imputed_dataframe.imp can be replaced
# with a name of your choosing, but make sure it ends in .imp as shown below and
# that the same name is used in the next step.
library(missForest)
library(randomForest)
set.seed(96)
imputed_dataframe.imp <- missForest(old_dataframe, verbose = TRUE, maxiter = 3, ntree = 20)

# Check imputed values and assign imputed values to a new dataframe. Replace
# new_dataframe with a name of your choosing.
imputed_dataframe.imp$ximp
new_dataframe <- imputed_dataframe.imp$ximp
```

**From Module 2** 
```R
# Determine variable type
class()

# Calculate mean
mean()

# Calculate median
median()

# Calculate frequencies
table()

# Percentages are trickier:
# first use the table function on your variable, then pass the result to prop.table()
prop.table()

# For example
prop.table(table(my$variable))

# Create bar graph
# NOTE(review): bar_graph() is not a base R function; presumably it is a helper
# provided by the course materials (the base R equivalent is barplot()). Confirm.
bar_graph()

# Create histogram
# NOTE(review): histogram() comes from the lattice package (library(lattice));
# the base R equivalent is hist(). Confirm which one is intended.
histogram()

```

**From Module 3**
```R
# Load csv file
read.csv()

# Show content in the output
print()

# Concatenate two pieces of text (character strings)
paste()

# Look up variable dictionary
# NOTE(review): var_dict() is not a base R function; presumably it is a helper
# provided by the course materials. Confirm.
var_dict()

# Show the column names of the dataframe generated by read.csv
colnames()

# Generate a dataframe by subsetting from the original dataframe
subset()

# Pearson correlation test
cor.test()

# Load module/package
library()

# Generate scatter plot using package ggplot2
# (qplot() is deprecated as of ggplot2 3.4.0; ggplot() + geom_point() is the current equivalent)
qplot()

# Output levels of a categorical variable
levels()

# Generate contingency table for frequency
table()

# Refresh the levels in your categorical variable after you make changes
factor()

# Pearson's Chi-squared test
chisq.test()

# Transform a table into a dataframe
data.frame()

# Generating bar plot using ggplot
ggplot() + geom_bar()

# For example
ggplot(MyDataframe, aes(x=CategoricalVariable1, y=Frequency, fill=CategoricalVariable2)) + geom_bar(stat="identity",position=position_dodge())

# Welch two sample t-test. This is the t.test() default and does NOT assume
# equal variances; pass var.equal = TRUE for the pooled-variance (Student's) t-test.
t.test()

# Generating box plot using ggplot
ggplot() + geom_boxplot()

# For example
ggplot(MyDataframe, aes(x=CategoricalVariable, y=NumericalVariable)) + 
  geom_boxplot(outlier.colour="red", outlier.shape=8,
               outlier.size=4)

# Show all unique values in a numerical variable
unique()

# ANOVA
anova(aov())

# For example
anova(aov(NumericalVariable~CategoricalVariable, data=MyDataframe))

```

**From Module 4**

```R
# Create training and testing dataframes. In this case 75 percent will be used
# for training and 25 percent for testing.
library(caTools)
smp_size <- floor(0.75 * nrow(dataframe_to_be_split))
set.seed(123)
train_ind <- sample(seq_len(nrow(dataframe_to_be_split)), size = smp_size)
training_dataframe <- dataframe_to_be_split[train_ind, ]
testing_dataframe <- dataframe_to_be_split[-train_ind, ]

# Oversample the minority outcome to balance the training data in an effort to
# improve model performance (not always necessary). In this case the minority
# outcome will be oversampled so that it occurs 40 percent of the time.
# Only the training data is oversampled: the testing data must stay untouched so
# that the model is evaluated on rows it has never seen.
library(ROSE)
oversampled_dataframe <- ROSE(column_of_interest ~ ., data = training_dataframe, p = 0.4, seed = 3)$data

# Create a table after oversampling to view the count of the oversampled outcome.
table(oversampled_dataframe$column_of_interest)

# Feature selection using recursive feature elimination. 2:18 in this case is the
# range of the columns used to predict the outcome variable. The outcome variable
# in this example is column 1.
library(e1071)
library(mlbench)
library(caret)
library(randomForest)
rfe_training <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
rfe <- rfe(oversampled_dataframe[, 2:18], oversampled_dataframe[, 1], sizes = c(2:18), rfeControl = rfe_training)
print(rfe)

# Show variable rank based on RFE
predictors(rfe)

# Display graph that highlights the most accurate number of features found using RFE.
plot(rfe, type = c("g", "o"))

# Feature selection using Random Forest
library(e1071)
library(mlbench)
library(caret)
library(randomForest)
t_training <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
seed <- 7
metric <- "Accuracy"
set.seed(seed)
mtry <- sqrt(ncol(oversampled_dataframe))
tunegrid <- expand.grid(.mtry = mtry)

# Train the model using the oversampled dataframe
t_model <- train(column_of_interest ~ ., data = oversampled_dataframe, method = "rf", metric = metric, tuneGrid = tunegrid, trControl = t_training)

# Apply the model to the testing_dataframe. Change new_dataframe to a name of
# your choosing.
new_dataframe <- predict(t_model, testing_dataframe)

# Analyze the results of running the model on the testing dataframe using a
# confusion matrix. The first argument is the set of predictions created in the
# previous step; the second is the true outcome from the testing dataframe.
confusionMatrix(new_dataframe, testing_dataframe$column_of_interest)

# Visualize variable importance in Random Forest model
variable_importance <- varImp(t_model)
print(variable_importance)
```




# Student Input