Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
122 lines (84 sloc) 2.83 KB
---
title: "Kaggle_Votes"
output:
  html_document:
    keep_md: true
---
```{r}
library(dplyr)
library(tidyr)
library(xgboost)
library(caret)
library(knitr)
library(ggthemes)
```
### Read data
```{r}
# Column names of the eleven first-round candidates; these columns are
# removed from the feature set during preprocessing.
candidats <- c(
  "ARTHAUD", "ASSELINEAU", "CHEMINADE", "DUPONT.AIGNAN", "FILLON", "HAMON",
  "LASSALLE", "LE.PEN", "MACRON", "MÉLENCHON", "POUTOU"
)

# Subset of candidates kept as target classes for the classifier.
best_candidates <- c("FILLON", "LE.PEN", "MACRON", "MÉLENCHON")

# Load the first-round election table.
df <- read.csv(file = "data/DOI/elections_2017_First_round.csv")
```
### Convert target to factor so that caret's train method performs classification :
```{r}
# Keep only rows whose winning candidate is known and belongs to the
# retained target classes.
df <- df %>%
  filter(!is.na(best_candidate) & best_candidate %in% best_candidates)

# Columns whose name contains "Nb", then the subset of those whose name
# does not also contain "ratio" (negative look-ahead, hence perl = TRUE).
col_Nb <- grep("Nb", colnames(df), value = TRUE)
col_Nb_NO_ratio <- grep("^(?!.*ratio).*Nb.*$", col_Nb, value = TRUE, perl = TRUE)

# Drop the per-candidate columns, the columns matching the exclusion
# pattern below, and the non-ratio "Nb" columns.
df <- df %>%
  select(-one_of(candidats)) %>%
  select(-matches("EVOLUTION|^X|CODE_REG|CODE_COM|POPULATION|Votants|Exprim|Inscrits|SUPERFICIE")) %>%
  select(-one_of(col_Nb_NO_ratio))

# Rebuild the target from its character values so the factor only carries
# the levels that survived the filter above.
target <- as.factor(as.character(df$best_candidate))

# Keep numeric predictors only (this also drops the target column), remove
# RD and GI13, then re-attach the target as a factor.
df <- df %>%
  select_if(is.numeric) %>%
  select(-RD, -GI13)
df$best_candidate <- target

# Number of missing values per column, as a key/value table.
# Base R is used here instead of the deprecated summarise_each(funs(...)).
data.frame(key = names(df), value = unname(colSums(is.na(df))))

# Row/column counts before and after discarding incomplete rows.
dim(df)
df <- df %>% na.omit()
dim(df)
```
### Train Test Split :
```{r}
# Fix the RNG seed so the partition (and therefore every metric reported
# below) is the same each time the document is knitted.
set.seed(2017)

# 70 % of the rows go to training; createDataPartition samples within each
# level of the target.
intrain <- createDataPartition(y = df$best_candidate, p = 0.7, list = FALSE)
df_train <- df[intrain, ]
df_test <- df[-intrain, ]

# Predictor-only table and numeric class codes for the training set.
# NOTE: the variable `train` has the same name as caret::train(); calls such
# as train(...) still resolve to the function because R skips non-function
# bindings when looking up a name in call position.
train <- df_train %>% select(-best_candidate)
label_train <- as.numeric(df_train$best_candidate)
```
### Final Hyperparameters :
```{r}
# Tuning grid handed to the model fit. Every parameter has a single value,
# so the grid contains exactly one candidate configuration.
xgb_grid_1 <- expand.grid(
  nrounds          = 2,
  eta              = 0.01,
  max_depth        = 12,
  gamma            = 1,
  colsample_bytree = 1,
  min_child_weight = 0.1,
  subsample        = 1
)
```
### Fit the model on the training set
```{r}
# Fix the RNG seed so the cross-validation fold assignment is reproducible.
set.seed(2017)
# define training control: 2-fold cross-validation
train_control <- trainControl(method = "cv", number = 2)
#train_control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model: gradient-boosted trees on every predictor in df_train,
# evaluated over the tuning grid defined in xgb_grid_1
model <- train(
  best_candidate ~ .,
  data = df_train,
  trControl = train_control,
  method = "xgbTree",
  tuneGrid = xgb_grid_1
)
# summarize results
print(model)
# Persist the fitted booster with xgboost's own serializer. dput() only
# writes a text dump of the R object, including an external pointer that
# cannot be read back, so the saved file could not be reloaded as a model.
xgb.save(model$finalModel, "xgb_final_model_multiclass")
```
### Predict
```{r}
# Predicted class for every row of the held-out set.
prediction <- predict(model, newdata = df_test)

# Cross-tabulate predicted vs. observed classes, then reshape the counts so
# that each observed class becomes its own column.
df_prediction <- table(prediction, df_test$best_candidate) %>%
  as.data.frame() %>%
  spread(Var2, Freq)

# Variable importance from caret, with the feature name as a regular
# column, sorted from most to least important.
variable_importance <- varImp(model)$importance
variable_importance[["feature"]] <- rownames(variable_importance)
variable_importance <- arrange(variable_importance, desc(Overall))

# Bar chart of the 20 highest-scoring features.
ggplot(
  top_n(variable_importance, 20, Overall),
  aes(x = reorder(feature, -Overall), y = Overall)
) +
  geom_bar(stat = "identity", fill = "#0c3269", colour = "#0c3269") +
  theme_fivethirtyeight() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("XGB Features importance")
```
### Output Confusion Matrix :
```{r}
# Render the predicted-vs-observed count table as a markdown table.
knitr::kable(df_prediction, format = "markdown")
```
```{r}
# Summarise the cross-tabulation of predicted vs. observed test-set classes
# with caret's confusionMatrix().
table(prediction, df_test$best_candidate) %>%
  confusionMatrix()
```