# Сравнение распределений в R

## 0\. Подготовительные работы

In [1]:
setwd("~/Documents/_notebooks")

In [21]:
data <- read.csv("data_compare_distr.csv")

In [22]:
library(jsonlite)
library(dplyr)
library(ggplot2)

In [4]:
dim(data)

In [23]:
str(data)

'data.frame':	131146 obs. of  36 variables:
 $ site             : int  5 5 5 5 5 5 5 5 5 5 ...
 $ is.bot           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ time             : Factor w/ 78775 levels "2016-04-26 00:00:48",..: 20354 20349 20352 20384 20385 20369 20373 20375 20414 20404 ...
 $ total            : int  1077 1077 1077 1077 1077 1077 1077 1077 1077 1077 ...
 $ max.score        : num  12.2 12.2 12.2 12.2 12.2 ...
 $ score            : num  12.2 12.2 12.2 12.2 12.2 ...
 $ types            : Factor w/ 3 levels "bn","cu","vb": 1 1 1 1 1 1 1 1 1 1 ...
 $ site.stat        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ campaign         : int  503 503 503 503 503 500 493 503 502 376 ...
 $ format           : Factor w/ 11 levels "bn1","bn2","bn20",..: 7 7 7 7 2 9 9 7 9 7 ...
 $ master           : int  5 5 5 5 5 5 5 5 5 5 ...
 $ uid              : int  NA NA NA NA NA NA NA NA NA NA ...
 $ os               : Factor w/ 37 levels "Android","AndroidPhone",..: 11 10 11 10 8 11 10 11 6 16 ...
 $ browser         

In [6]:
data <- data[sort(colnames(data))]

Посмотрим на классы переменных:

In [24]:
classes <- lapply(data, class)
print(as.character(classes))
table(as.character(classes))

 [1] "integer" "integer" "factor"  "integer" "numeric" "numeric" "factor" 
 [8] "integer" "integer" "factor"  "integer" "integer" "factor"  "factor" 
[15] "factor"  "factor"  "factor"  "factor"  "integer" "factor"  "factor" 
[22] "integer" "integer" "integer" "factor"  "integer" "integer" "numeric"
[29] "numeric" "integer" "integer" "factor"  "integer" "numeric" "factor" 
[36] "integer"



 factor integer numeric 
     14      17       5 

Посмотрим, какие переменные — `integer`:

In [8]:
(classes.int <- colnames(data.frame(subset(classes, classes == "integer"))))

Очевидно, что некоторые из них совсем не `integer`, а `factor`. Исправим это и сохраним названия переменных разных классов в соответствующие векторы:

In [9]:
print("Number of unique values in integer variables:")
lapply(data[classes.int], unique) %>%  lapply(length)

[1] "Number of unique values in integer variables:"


In [19]:
# classes[c("adsystem", "agent", "banner", "campaign", "flash",
#           "format", "rekl", "scheme", "site.stat", "stat.format", "pay.for")] <- "factor"

# classes.int <- colnames(data.frame(subset(data, classes == "integer")))
# classes.num <- colnames(data.frame(subset(data, classes == "numeric")))
# classes.fact <- colnames(data.frame(subset(data, classes == "factor")))

Факторам — факторово! Поменяем классы переменных там, где это нужно сделать, не забывая про `time`.

In [36]:
data$site <- as.factor(data$site)
data$site.stat <- as.factor(data$site.stat)
data$master <- as.factor(data$master)
data$adsystem <- as.factor(data$adsystem)
data$flash <- as.factor(data$flash)
data$campaign <- as.factor(data$campaign)
data$rekl <- as.factor(data$rekl)
data$is.bot <- as.factor(data$is.bot)
data$pay.for <- as.factor(data$pay.for)
data$adblock <- as.factor(data$adblock)
data$banner <- as.factor(data$banner)
data$scheme <- as.factor(data$scheme)
data$k <- as.factor(data$k)

In [26]:
data$time <- strptime(data$time, format = "%Y-%m-%d %H:%M:%S")

# final check:

In [37]:
(data.frame(lapply(data, class), row.names = NULL))

Unnamed: 0,site,is.bot,time,total,max.score,score,types,site.stat,campaign,format,ellip.h,pay.for,rekl.money,master.money,adsystem,flash,agent,adblock,cpm,action,k
1,factor,factor,POSIXlt,integer,numeric,numeric,factor,factor,factor,factor,⋯,factor,numeric,numeric,factor,factor,factor,factor,numeric,factor,factor
2,factor,factor,POSIXt,integer,numeric,numeric,factor,factor,factor,factor,⋯,factor,numeric,numeric,factor,factor,factor,factor,numeric,factor,factor


## 1\. Exploratory Analysis

In [38]:
head(data, n = 3)
tail(data, n = 3)

Unnamed: 0,site,is.bot,time,total,max.score,score,types,site.stat,campaign,format,ellip.h,pay.for,rekl.money,master.money,adsystem,flash,agent,adblock,cpm,action,k
1,5,1,2016-05-19 18:00:22,1077,12.16155,12.16155,bn,0,503,bn5,⋯,0,0.013,0.0054587,2,0,"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.2 Safari/537.17",,13,,
2,5,1,2016-05-19 18:00:14,1077,12.16155,12.16155,bn,0,503,bn5,⋯,0,0.013,0.0054587,2,0,"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.21 (KHTML, like Gecko) Chrome/25.0.1351.0 Safari/537.21",,13,,
3,5,1,2016-05-19 18:00:19,1077,12.16155,12.16155,bn,0,503,bn5,⋯,0,0.013,0.0054587,2,0,"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5406 Chrome/19.0.1084.5406 Safari/536.5",,13,,


Unnamed: 0,site,is.bot,time,total,max.score,score,types,site.stat,campaign,format,ellip.h,pay.for,rekl.money,master.money,adsystem,flash,agent,adblock,cpm,action,k
131144,3,1,2016-05-24 08:29:14,15642,8.79801,8.741188,bn,0,496.0,bn1,⋯,1,0.0,0.0,2,0.0,"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.10",,2.23,,
131145,3,1,2016-05-24 08:29:33,15642,8.79801,8.741188,bn,0,493.0,bn3,⋯,1,0.0,0.0,2,0.0,Mozilla/5.0 (Windows NT 6.2; rv:5.0) Gecko/20100101 Firefox/5.0,,3.95,,
131146,3,1,2016-05-24 08:30:10,15642,8.79801,8.741188,bn,0,,bn2,⋯,0,0.015,,3,,Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1 BLNGBAR,,,,


In [39]:
str(data)

'data.frame':	131146 obs. of  36 variables:
 $ site             : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ is.bot           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
 $ time             : POSIXlt, format: "2016-05-19 18:00:22" "2016-05-19 18:00:14" ...
 $ total            : int  1077 1077 1077 1077 1077 1077 1077 1077 1077 1077 ...
 $ max.score        : num  12.2 12.2 12.2 12.2 12.2 ...
 $ score            : num  12.2 12.2 12.2 12.2 12.2 ...
 $ types            : Factor w/ 3 levels "bn","cu","vb": 1 1 1 1 1 1 1 1 1 1 ...
 $ site.stat        : Factor w/ 9 levels "0","11","47",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ campaign         : Factor w/ 549 levels "1","2","3","4",..: 503 503 503 503 503 500 493 503 502 376 ...
 $ format           : Factor w/ 11 levels "bn1","bn2","bn20",..: 7 7 7 7 2 9 9 7 9 7 ...
 $ master           : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ uid              : int  NA NA NA NA NA NA NA NA NA NA ...
 $ os      

In [41]:
lapply(data, summary)

ERROR: Error in vapply(seq_along(mapped), function(i) {: values must be length 1,
 but FUN(X[[3]]) result is length 0


$site
    1     2     3     4     5 
11023 47789 50000 20000  2334 

$is.bot
     0      1 
 11023 120123 

$time
                 Min.               1st Qu.                Median 
"2016-04-26 00:00:48" "2016-05-20 02:09:30" "2016-05-21 14:38:16" 
                 Mean               3rd Qu.                  Max. 
"2016-05-16 08:37:15" "2016-05-23 05:11:10" "2016-05-24 14:38:32" 

$total
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1077   15310   15640   22200   34760   47680 

$max.score
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  8.065   8.428   9.179   9.033   9.266  12.160 

$score
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  8.059   8.413   9.159   8.997   9.252  12.160 

$types
    bn     cu     vb 
121665   9452     29 

$site.stat
     0     11     47     50     55     56    100  60000  73000 
130468      3    198     10      1    110      3    348      5 

$campaign
    512     237     239     493     502     503     115      83     240     238 
  12451    8

-------

## Independent Chi-Square test

Критерий $chi^2$ Пирсона – это непараметрический метод, который позволяет оценить значимость различий между 
* фактическим (выявленным в результате исследования) количеством исходов или качественных характеристик выборки, попадающих в каждую категорию, 
* и теоретическим количеством, которое можно ожидать в изучаемых группах при справедливости нулевой гипотезы. 

Выражаясь проще, метод позволяет оценить **статистическую значимость различий двух или нескольких относительных показателей** (частот, долей).

Посмотрим, каков Хи-квадрат в действии, на переменных `flash` и `browser`. Сперва сделаем простейшую визуализацию — табличку. Получив представление о том, как объекты распределены по браузерам и наличию Flash, вызовем функцию `chisq.test`:

In [52]:
with(data, table(flash, browser))

     browser
flash 360se amigo android chrome firefox    ie  ipad iphone mailru maxthon
    0     0    11       0  34401   30607     0   128    140      9      45
    1     0     0       0    170      35     0     0      0      0       0
     browser
flash  msie opera opera_mini opera_mobi operanew qq browser safari sougou
    0  9166 18319          0          0     5294          0   6344      0
    1    22    27          0          0       79          0      0      0
     browser
flash tencent traveler ucbrowser  uran yandex
    0                0        79     9   2529
    1                0         0     0     44

In [54]:
with(data, chisq.test(flash, browser))

In chisq.test(flash, browser): Chi-squared approximation may be incorrect


	Pearson's Chi-squared test

data:  flash and browser
X-squared = 446.13, df = 13, p-value < 2.2e-16


> `p-value < 2.2e-16`

говорит о том, что связь между браузером и наличием Flash статистически значима — иными словами, существуют (статистически значимые!) различия между указанными переменными.

In [56]:
with(data, table(is.bot, browser))
with(data, chisq.test(is.bot, browser))

      browser
is.bot 360se amigo android chrome firefox    ie  ipad iphone mailru maxthon
     0     0   361     348   3022     650     0   444   1557     43       0
     1    40    18       0  38784   34409  1368   128    140     25     218
      browser
is.bot  msie opera opera_mini opera_mobi operanew qq browser safari sougou
     0   486    14          1         12      756          0     43      0
     1  9307 19896          0          0     5521          7   7342     11
      browser
is.bot tencent traveler ucbrowser  uran yandex
     0                0        12    23   3251
     1                2        79    13   2642

In chisq.test(is.bot, browser): Chi-squared approximation may be incorrect


	Pearson's Chi-squared test

data:  is.bot and browser
X-squared = 48434, df = 21, p-value < 2.2e-16


In [57]:
with(data, table(is.bot, format))
with(data, chisq.test(is.bot, format))

      format
is.bot   bn1   bn2  bn20  bn22   bn3   bn4   bn5   bn6   bn7   cu1  vb50
     0     0     0     0  3886     0     0     0     0     0  7137     0
     1 59020 20181   122     0 24454 12285   909    92   716  2315    29

In chisq.test(is.bot, format): Chi-squared approximation may be incorrect


	Pearson's Chi-squared test

data:  is.bot and format
X-squared = 108440, df = 10, p-value < 2.2e-16


In [58]:
with(data, table(is.bot, country))
with(data, chisq.test(is.bot, country))

      country
is.bot    A1    AE    AL    AM    AR    AT    AU    AZ    BA    BD    BE    BG
     0     0     0     0     4     1     0     0     1     0     0     0     0
     1     7     2    10   551     6     6     8   426     3     5    28   173
      country
is.bot    BN    BR    BY    CA    CH    CL    CN    CU    CY    CZ    DE    DK
     0     0     0    10     1     0     0     0     1     3    27    32     0
     1     4    59  3019    30     7    11    46     0    20   176   296     1
      country
is.bot    DO    DZ    EC    EE    EG    ES    EU    FI    FR    GB    GE    GR
     0     0     1     0     2     0     0     7     0     5     0     0     0
     1     9     1     9   371     1   105     4    21    82   132   256    51
      country
is.bot    GT    HK    HR    HU    ID    IE    IL    IN    IQ    IR    IS    IT
     0     0     0     0     1     0     0     0     0     0     0     0     1
     1     3    17     7    27     8   161   449    31     8    23     9   

In chisq.test(is.bot, country): Chi-squared approximation may be incorrect


	Pearson's Chi-squared test

data:  is.bot and country
X-squared = 3633.2, df = 88, p-value < 2.2e-16


In [59]:
with(data, table(is.bot, browser.lang))
with(data, chisq.test(is.bot, browser.lang))

      browser.lang
is.bot    [{ [{"type":"m"    en    es    ru    uk
     0     0            0    43     1  7077    14
     1   811        12876    18     0 96117     0

In chisq.test(is.bot, browser.lang): Chi-squared approximation may be incorrect


	Pearson's Chi-squared test

data:  is.bot and browser.lang
X-squared = 1665, df = 5, p-value < 2.2e-16
