In [2]:
library(dplyr)
library(readr)


In [18]:
base_dir <- "/Users/dfackler/Desktop/domains"

In [24]:
# issue reading file because some domain_name fields contain multiple sites separated by comma
# workaround: swap column delim to tab
# also remove {} wrapping while here
tx  <- readLines(paste(base_dir, "labels.txt", sep = "/"))
tx2 <- gsub(pattern = "},", replace = "}\t", x = tx) # handle input lines
tx2 <- gsub(pattern = "domain_names,\"label\"", replace = "domain_names\tlabel", x = tx2) # handle header row
tx2 <- gsub(pattern = "\\{", replace = "", x = tx2) # wipe open bracket
tx2 <- gsub(pattern = "}", replace = "", x = tx2) # wipe closing bracket
writeLines(tx2, con=paste(base_dir, "labels_tab.txt", sep = "/"))

In [129]:
library(stringr)

In [153]:
# load labels.txt
labels <- read_delim("/Users/dfackler/Desktop/domains/labels_tab.txt", delim = "\t", col_names = TRUE)
colnames(labels) <- c("domain_names", "label")
head(labels)


Parsed with column specification:
cols(
  domain_names = col_character(),
  label = col_logical()
)


domain_names,label
advantage-title.com,False
bynumlawnc.com,False
philiegroup.com,False
gwiktn.org,False
firstduesizeup.com,False
medlifemovement.org,False


In [154]:
# split out comma separated rows
rows_to_split <- grep(",", labels$domain_names)
split_domains <- str_split(labels$domain_names[rows_to_split], ",")
tmp_dfs <- list()
for(i in 1:length(split_domains)){
    tmp_dfs[[i]] <- data.frame(split_domains[i], 
                         labels$label[rows_to_split[i]], stringsAsFactors = FALSE)
    colnames(tmp_dfs[[i]]) <- c("domain_names", "label")
}
# collapse list of dataframes to single dataframe
split_df <- do.call("rbind", tmp_dfs)


In [155]:
# drop comma separated rows from labels and add split out rows
cat(c("rows_to_split ", length(rows_to_split), "\n"))
cat(c("split_df ", nrow(split_df), "\n"))
cat(c("labels ", nrow(labels), "\n"))
labels <- labels[-rows_to_split,]
cat(c("labels after drop ", nrow(labels), "\n"))
labels <- rbind(labels, split_df)
cat(c("labels after add ", nrow(labels), "\n"))

rows_to_split  175 
split_df  978 
labels  3715 
labels after drop  3540 
labels after add  4518 


In [54]:
# load websites in Text folder to find sites included and excluded in labels
site_files <- list.files(paste(base_dir, "Text", sep = "/"))
all_sites <- gsub(pattern = ".txt", replace = "", x = site_files)
head(all_sites)

In [29]:
length(site_files) #29,843 websites total
length(labels$domain_names %in% site_files) # 3,715 websites with label

In [30]:
######## data exploration

# look at distribution
summary(labels)
# FALSE: 3399
# TRUE: 316



 domain_names         label        
 Length:3715        Mode :logical  
 Class :character   FALSE:3399     
 Mode  :character   TRUE :316      

In [33]:
# look at TRUE names (idea to see if pattern around domain type or common names)
msp_names <- labels %>% filter(label) %>% select(domain_names)
msp_names
# nothing immediately jumps out

domain_names
connection.com
pcm.com
pegasustechsolutions.com
perryprotech.com
presidio.com
pro4ia.com
prosysis.com
provintl.com
questsys.com
questsolution.com


In [56]:
# create empty variable columns for pattern searches
labels <- labels %>% mutate(file_found = FALSE, service = 0, msp = 0, msp_full = 0, ms = 0, sp = 0, it_infr = 0)
head(labels)

# TODO: add feature for TLD

domain_names,label,service,msp,msp_full,ms,sp,it_infr,file_found
advantage-title.com,False,0,0,0,0,0,0,False
bynumlawnc.com,False,0,0,0,0,0,0,False
philiegroup.com,False,0,0,0,0,0,0,False
gwiktn.org,False,0,0,0,0,0,0,False
firstduesizeup.com,False,0,0,0,0,0,0,False
medlifemovement.org,False,0,0,0,0,0,0,False


In [64]:
# loop through files within labels that exist (not all domains in labels appear to have a corresponding file)
options(warn=-1) # turn off warnings to not get spammed with incomplete final line
for(i in 1:nrow(labels)){
    if(file.exists(paste(base_dir, "Text", paste0(labels$domain_names[i], ".txt"), sep = "/"))){
        tx  <- readLines(paste(base_dir, "Text", paste0(labels$domain_names[i], ".txt"), sep = "/"))
        tx <- tolower(tx)
        labels$file_found[i] <- TRUE
        
        # pattern searches
        # set variable to number of times pattern was found
        labels$service[i] <- length(grep(pattern = "service", x = tx))
        labels$msp[i] <- length(grep(pattern = "msp", x = tx))
        labels$msp_full[i] <- length(grep(pattern = "managed service provider", x = tx))
        labels$ms[i] <- length(grep(pattern = "managed service", x = tx))
        labels$sp[i] <- length(grep(pattern = "service provider", x = tx))
        labels$it_infr[i] <- length(grep(pattern = "it infrastructure", x = tx))
    }
}
options(warn=0)

cat(c("Total Labeled Sites: ", nrow(labels), "\n"))
cat(c("Files Found: ", sum(labels$file_found), "\n"))

Total Labeled Sites:  3715 
Files Found:  2943 


In [66]:
head(labels)
summary(labels)

domain_names,label,service,msp,msp_full,ms,sp,it_infr,file_found
advantage-title.com,False,2,0,0,0,0,0,True
bynumlawnc.com,False,0,0,0,0,0,0,False
philiegroup.com,False,3,0,0,0,0,0,True
gwiktn.org,False,0,0,0,0,0,0,True
firstduesizeup.com,False,0,0,0,0,0,0,True
medlifemovement.org,False,7,0,0,0,0,0,True


 domain_names         label            service             msp          
 Length:3715        Mode :logical   Min.   :  0.000   Min.   : 0.00000  
 Class :character   FALSE:3399      1st Qu.:  0.000   1st Qu.: 0.00000  
 Mode  :character   TRUE :316       Median :  1.000   Median : 0.00000  
                                    Mean   :  3.437   Mean   : 0.04711  
                                    3rd Qu.:  4.000   3rd Qu.: 0.00000  
                                    Max.   :189.000   Max.   :46.00000  
    msp_full             ms                sp             it_infr       
 Min.   :0.00000   Min.   : 0.0000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000   Median : 0.0000   Median :0.00000   Median :0.00000  
 Mean   :0.01561   Mean   : 0.1332   Mean   :0.04845   Mean   :0.02369  
 3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :4.00000   Max.   :20.0000   Max.   :6.0000

In [None]:
# TODO: find bags-of-words/tuples for overall websites and MSP websites
# http://rstudio-pubs-static.s3.amazonaws.com/256588_57b585da6c054349825cba46685d8464.html

In [None]:
# split to test and train
library(caret)

In [114]:
set.seed(3456)
train_index <- createDataPartition(labels$label, p = .75, 
                                  list = FALSE, 
                                  times = 1)
label_train <- labels[train_index,]
label_test <- labels[-train_index,]
nrow(label_train)
nrow(label_test)

In [117]:
# run a random forest
# TODO: tune hyperparameters
library(randomForest)

In [118]:
rf <- randomForest(
    label ~ .-domain_names-file_found,
    data=label_train,
    ntree = 100,
    type = "classification",
    importance = TRUE
)


“The response has five or fewer unique values.  Are you sure you want to do regression?”

In [119]:
rf$importance

Unnamed: 0,%IncMSE,IncNodePurity
service,0.0076776501,13.837495
msp,0.0036248722,8.141244
msp_full,0.0019209628,3.677547
ms,0.039135522,57.450839
sp,0.0006555926,2.528393
it_infr,0.0121322311,18.310261


In [120]:
# TRAIN
# predict and create new dataframe of predicted values
# add column for class prediction using .5 cutoff (revisit later)
pred <- predict(rf, newdata = label_train)
pred_train_df <- label_train %>% select(domain_names, label, file_found) %>%
            mutate(pred_val = pred) %>%
            mutate(pred_class = pred_val > .5)

In [121]:
# TEST
# predict and create new dataframe of predicted values
# add column for class prediction using .5 cutoff (revisit later)
pred <- predict(rf, newdata = label_test)
pred_test_df <- label_test %>% select(domain_names, label, file_found) %>%
            mutate(pred_val = pred) %>%
            mutate(pred_class = pred_val > .5)

In [122]:
# TRAIN
# find prediction metrics for hits and misses
xtab <- table(pred_train_df$pred_class, pred_train_df$label)
xtab
found_false <- xtab[1,1]
miss_false <- xtab[1,2]
miss_true <- xtab[2,1]
found_true <- xtab[2,2]

cat(c("Sensitivity: ", round(found_true/(found_true + miss_true), 3), "\n"))
cat(c("Specificity: ", round(found_false/(found_false + miss_false), 3), "\n"))

       
        FALSE TRUE
  FALSE  2542  110
  TRUE      8  127

Sensitivity:  0.941 
Specificity:  0.959 


In [123]:
# TEST
# find prediction metrics for hits and misses
xtab <- table(pred_test_df$pred_class, pred_test_df$label)
xtab
found_false <- xtab[1,1]
miss_false <- xtab[1,2]
miss_true <- xtab[2,1]
found_true <- xtab[2,2]

cat(c("Sensitivity: ", round(found_true/(found_true + miss_true), 3), "\n"))
cat(c("Specificity: ", round(found_false/(found_false + miss_false), 3), "\n"))

       
        FALSE TRUE
  FALSE   844   38
  TRUE      5   41

Sensitivity:  0.891 
Specificity:  0.957 


In [None]:
# First run through without finding tuples or other bags: suprisingly good

# TRAIN:
#         FALSE TRUE
#  FALSE  2542  110
#  TRUE      8  127
# Sensitivity:  0.941 
# Specificity:  0.959 

# TEST:
#         FALSE TRUE
#  FALSE   844   38
#  TRUE      5   41
# Sensitivity:  0.891 
# Specificity:  0.957 

# Pretty low bias (training error)
# Somewhat notable variance (gap between train and test error)
# both could be improved but not bad for a quick start
# method seems good and would be improved by text mining for important bags-of-words

In [105]:
# Create dataframe and features for non-labeled websites
test_files <- data.frame(domain_names = all_sites,
                        stringsAsFactors = FALSE)

# drop domain names that are in labels (add third arg FALSE to keep from swapping df to vector)
test_files <- test_files[!(test_files$domain_names %in% labels$domain_names),,FALSE]

# create empty variable columns for pattern searches
test_files <- test_files %>% mutate(file_found = FALSE, service = 0, msp = 0, msp_full = 0, 
                                    ms = 0, sp = 0, it_infr = 0)


In [106]:
head(test_files)

domain_names,file_found,service,msp,msp_full,ms,sp,it_infr
0-plus.com,False,0,0,0,0,0,0
10-spec.com,False,0,0,0,0,0,0
1001brickellbay.com,False,0,0,0,0,0,0
100clubbuffalo.org,False,0,0,0,0,0,0
101financial.com,False,0,0,0,0,0,0
1099-etc.com,False,0,0,0,0,0,0


In [92]:
length(labels$domain_names)
length(test_files$domain_names)

In [107]:
# Predict on unlabeled websites
# loop through files within labels that exist (not all domains in labels appear to have a corresponding file)
options(warn=-1) # turn off warnings to not get spammed with incomplete final line
for(i in 1:nrow(test_files)){
    if(file.exists(paste(base_dir, "Text", paste0(test_files$domain_names[i], ".txt"), sep = "/"))){
        tx  <- readLines(paste(base_dir, "Text", paste0(test_files$domain_names[i], ".txt"), sep = "/"))
        tx <- tolower(tx)
        test_files$file_found[i] <- TRUE
        
        # pattern searches
        # set variable to number of times pattern was found
        test_files$service[i] <- length(grep(pattern = "service", x = tx))
        test_files$msp[i] <- length(grep(pattern = "msp", x = tx))
        test_files$msp_full[i] <- length(grep(pattern = "managed service provider", x = tx))
        test_files$ms[i] <- length(grep(pattern = "managed service", x = tx))
        test_files$sp[i] <- length(grep(pattern = "service provider", x = tx))
        test_files$it_infr[i] <- length(grep(pattern = "it infrastructure", x = tx))
    }
}
options(warn=0)

cat(c("Total Labeled Sites: ", nrow(test_files), "\n"))
cat(c("Files Found: ", sum(test_files$file_found), "\n"))

Total Labeled Sites:  27149 
Files Found:  27149 


In [108]:
head(test_files)

domain_names,file_found,service,msp,msp_full,ms,sp,it_infr
0-plus.com,True,0,0,0,0,0,0
10-spec.com,True,8,0,0,0,0,0
1001brickellbay.com,True,0,0,0,0,0,0
100clubbuffalo.org,True,4,0,0,0,2,0
101financial.com,True,1,0,0,0,0,0
1099-etc.com,True,2,0,0,0,0,0


In [124]:
# predict and create new dataframe of predicted values
# add column for class prediction using .5 cutoff (revisit later)
pred_full <- predict(rf, newdata = test_files)
pred_full_df <- test_files %>% select(domain_names, file_found) %>%
            mutate(pred_val = pred_full) %>%
            mutate(pred_class = pred_val > .5)

In [125]:
# 369 predicted to be MSPs with initial base model
summary(pred_full_df)

 domain_names       file_found        pred_val       pred_class     
 Length:27149       Mode:logical   Min.   :0.03264   Mode :logical  
 Class :character   TRUE:27149     1st Qu.:0.03265   FALSE:26780    
 Mode  :character                  Median :0.04100   TRUE :369      
                                   Mean   :0.05446                  
                                   3rd Qu.:0.04221                  
                                   Max.   :1.00000                  