In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
library(dplyr)
library(ggplot2)

In [None]:
# Per-season player statistics; every later cell reads this data frame.
nbaPlayers = read.csv(file = '../input/nba-players-data/all_seasons.csv')

In [None]:
# Rows of the 2019-20 season, reduced to the three columns compared below
# (points, assists, rebounds).
data.19.20 = nbaPlayers %>%
    filter(as.character(season) == '2019-20') %>%
    select(pts, ast, reb)


## Поређење основних нумеричких статистика

### Прост случајан узорак без понављања

In [None]:
# Estimated variance of a sample mean under simple random sampling without
# replacement: s^2 / n scaled by the finite population correction (1 - n/N).
#   sampled.data - numeric vector of sampled values
#   n            - sample size
#   N            - population size
est.d.srswor = function(sampled.data, n, N) {
    sample.variance = var(sampled.data)
    fpc = 1 - n/N
    1/n * sample.variance * fpc
}

In [None]:
# Draws one simple random sample (without replacement) of n rows from the
# global nbaPlayers data frame and, for points, assists and rebounds, prints
#   * the 2019-20 mean, read from the global data.19.20,
#   * the mean of the sampled rows,
#   * the estimated variance of that sample mean (est.d.srswor).
# Everything is written to the console with cat(); the return value is not
# meant to be used.
#   n - sample size; must not exceed nrow(nbaPlayers) because the draw is
#       without replacement
srswor.1920.vs.all.seasons.sample = function(n) {
    N = nrow(nbaPlayers)
    # A single draw of row indices is shared by all three statistics.
    idxs = sample.int(N, size = n, replace = FALSE)
    nba.sample = nbaPlayers[idxs, ]

    # column name -> noun used in the printed (Serbian) messages
    stat.labels = c(pts = 'poena', ast = 'asistencija', reb = 'skokova')

    first = TRUE
    for (stat in names(stat.labels)) {
        # The separator goes between the sections, not after the last one.
        if (!first) {
            cat('#################################################\n')
        }
        first = FALSE

        label = stat.labels[[stat]]
        stat.sample = nba.sample[[stat]]
        # paste0 keeps each message a single cat() argument, so the spacing of
        # the output is unchanged.
        cat(paste0('Prosecan broj ', label, ' u sezoni 2019-20: '), mean(data.19.20[[stat]]), '\n')
        cat(paste0('Procenjen prosecan broj ', label, ' u prethodnih 13 sezona: '), mean(stat.sample), '\n')
        cat('Ocena disperzije ocene je: ', est.d.srswor(sampled.data = stat.sample, n, N), '\n')
    }
}

In [None]:
# Fix the RNG state before the first random draw so that a fresh
# "Restart & Run All" prints the same estimates every time.
set.seed(2020)
srswor.1920.vs.all.seasons.sample(100)

### Кластер узорак код ког се примарне јединице бирају као прост случајан узорак

In [None]:
# Chooses n whole seasons (clusters) by simple random sampling without
# replacement and returns every row of the global nbaPlayers data frame that
# belongs to one of the chosen seasons.
#   n - number of seasons to draw
#   N - number of seasons available; season index 0 is labelled '1996-97', so
#       indices 0..N-1 correspond to start years 1996..1996+N-1
filter.clusters = function(n = 1, N = 1) {
    # The former `0:N-1` parsed as (0:N) - 1, i.e. -1..N-1, which could select
    # the non-existent season '1995-96'. sample.int(N, n) - 1 draws from 0..N-1.
    start.years = 1996 + (sample.int(N, n, replace = FALSE) - 1)
    # Label format: start year, dash, last two digits of the following year
    # (e.g. 1999 -> '1999-00').
    seasons = paste(start.years, substr(start.years + 1, 3, 4), sep = '-')
    return(nbaPlayers %>% filter(season %in% seasons))
}

In [None]:
# Prints cluster-sample estimates of the per-player mean of points, assists and
# rebounds, each followed by the estimated variance of that estimate.
# Seasons are the clusters (primary units); the formulas are those for primary
# units chosen by simple random sampling without replacement.
#   clusters - rows of the sampled seasons; needs columns season, pts, ast, reb
#   N        - number of seasons in the population
#   n        - number of sampled seasons (n = 1 yields NaN for the variance
#              because of the 1/(n-1) factor)
# The number of secondary units M is read from the global nbaPlayers.
# Returns, invisibly, a data frame with one row per statistic
# (columns stat, mean.est, var.est).
est.cluster.srswor = function(clusters, N, n) {
    M = nrow(nbaPlayers)
    # as.character() so that unused factor levels do not create empty clusters
    season.id = as.character(clusters$season)

    # Mean and variance estimate for one column of `clusters`.
    cluster.estimate = function(values) {
        ti = tapply(values, season.id, sum)   # cluster totals
        mean.est = N / n * sum(ti) / M
        # Squared deviations of the cluster totals from their mean; the square
        # was missing before, which made this sum (and the variance) ~0.
        var.est = 1 / (M^2) * N^2 * (1 - n/N) * 1/n * 1/(n-1) * sum((ti - 1/n * sum(ti))^2)
        c(mean.est, var.est)
    }

    # column name -> noun used in the printed (Serbian) messages
    stat.labels = c(pts = 'poena', ast = 'asistencija', reb = 'skokova')
    estimates = data.frame(stat = names(stat.labels), mean.est = NA_real_, var.est = NA_real_)

    cat('#################################################\n')
    for (k in seq_along(stat.labels)) {
        est = cluster.estimate(clusters[[names(stat.labels)[k]]])
        estimates$mean.est[k] = est[1]
        estimates$var.est[k] = est[2]
        cat(paste0('Klaster ocenjen prosecan broj ', stat.labels[[k]], ' igraca: '), est[1], '\n')
        cat(paste0('Ocena disperzije klaster ocene ', stat.labels[[k]], ': '), est[2], '\n')
    }
    invisible(estimates)
}

In [None]:
# Number of sampled seasons (clusters).
n = 4
# Number of seasons in the population, taken from the data. The former
# hard-coded 2019 - 1996 = 23 is one short for an inclusive span of seasons
# '1996-97' .. '2019-20'.
# NOTE(review): filter.clusters builds season labels starting at 1996, so this
# assumes the data begin with '1996-97' and have no missing season - confirm.
N = length(unique(nbaPlayers$season))
clusters = filter.clusters(n, N)
# Prefer fixed notation over scientific notation for the small variances.
options(scipen=5)
est.cluster.srswor(clusters, N, n)
mean(nbaPlayers$net_rating)

In [None]:
# Systematic sample of net_rating from the global nbaPlayers data frame.
# Takes every k-th row, k = round(nrow / n), starting at a random row in 1..k,
# prints the sample mean and returns the sampled values.
#   n - nominal sample size; because k is rounded, the returned vector can be
#       slightly shorter or longer than n
sys.sample.net.rating = function(n) {
    N = nrow(nbaPlayers)
    # Guard: for n > 2 * N the rounded step would be 0 and seq() would fail;
    # a step of 1 then simply returns every row.
    k = max(1, round(N / n))
    start = sample(k, 1)
    net.rating = nbaPlayers$net_rating[seq(start, N, k)]
    cat('Ocena srednje vrednosti plus-minus statistike ', mean(net.rating), '\n')
    return(net.rating)
}
# Repeat the systematic sample for nominal sizes 10, 20, ..., 200 and print a
# variance estimate for each. The simple-random-sampling formula is used as an
# approximation for the systematic design.
# (A dead `n = 30` that the loop overwrote immediately has been dropped.)
for (n in seq(10, 200, 10)) {
    net.rating = sys.sample.net.rating(n)
    # The realised sample size can differ from the nominal n (the step is
    # rounded), so the estimator gets the actual number of sampled values.
    cat(n , ' Ocena disperzije ocene je: ', est.d.srswor(sampled.data = net.rating, length(net.rating), nrow(nbaPlayers)), '\n')
}

In [None]:
# Linear-regression estimate of the mean of usg_pct, using points (pts) as the
# auxiliary variable whose population mean is taken from an external source.
#source: https://www.basketball-reference.com/leagues/NBA_stats_per_game.html#stats::1
# Per-game scoring averages copied from the page above (24 values).
pts.ext.source = c(111.8, 111.2, 106.3,
                   105.6, 102.7, 100.0, 
                   101.0, 98.1, 96.3, 
                   99.6, 100.4, 100,
                   99.9, 98.7, 97,
                   97.2, 93.4, 95.1,
                   95.5, 94.8, 97.5,
                   91.6, 95.6, 96.9)
# Population mean of the auxiliary variable.
# NOTE(review): the division by 12 presumably spreads team points over 12
# players per team - confirm that this matches how `pts` is defined.
pts.pop = mean(pts.ext.source/12)
srswor.sample = (nbaPlayers %>% select(pts, usg_pct))[sample(nrow(nbaPlayers), 100, replace = FALSE),]

# Sample correlation. This used cov(), which made b.hat below
# cov * sd(y) / sd(x) instead of the regression slope cov / var(x).
rho.hat = cor(srswor.sample$pts, srswor.sample$usg_pct)
# Slope of usg_pct regressed on pts.
b.hat = rho.hat * sd(srswor.sample$usg_pct) / sd(srswor.sample$pts)
# The correction term compares the known mean of pts with the SAMPLE MEAN OF
# pts (it previously subtracted the sample mean of usg_pct).
x.lr.hat = mean(srswor.sample$usg_pct) + b.hat * (pts.pop - mean(srswor.sample$pts))
cat('Ocena srednje vrednosti obeležja usg_pct je ', x.lr.hat)

In [None]:
# Colleges treated as Ivy League when matching the `college` column.
ivy.league = c('Harvard', 'Cornell', 'Brown', 'Yale',
               'Dartmouth', 'Columbia', 'Princeton', 'Pennsylvania')
# One row per (player, college, draft number) holding the player's mean points
# over all of their seasons. The following cell builds both objects again.
all.players = nbaPlayers %>%
    group_by(player_name, college, draft_number) %>%
    summarise(pts = mean(pts))
# all.players

In [None]:
# Stratified estimate of the proportion of players who attended an Ivy League
# college. Strata are blocks of ten draft positions (0 = undrafted, 1 = picks
# 1-10, ..., 6 = picks 51-60); a 10% simple random sample without replacement
# is drawn inside every stratum.
ivy.league = c('Harvard', 'Cornell', 'Brown', 'Yale', 
               'Dartmouth', 'Columbia', 'Princeton', 'Pennsylvania')
all.players = nbaPlayers %>% 
    group_by(player_name, college, draft_number) %>% 
    summarise(pts = mean(pts)) 

# Recode 'Undrafted' as 0 so the column can be converted to a number.
# A factor column needs the new level registered first; a character column
# (the read.csv default since R 4.0) has no levels and takes the value as is.
if (is.factor(all.players$draft_number)) {
    levels(all.players$draft_number) = union(levels(all.players$draft_number), '0')
}
all.players$draft_number[all.players$draft_number == 'Undrafted'] = '0'

all.players = all.players %>% 
    mutate(draft.strat = ceiling(as.numeric(as.character(draft_number)) / 10))

stratified.player.by.draft.num = all.players %>% 
    group_by(draft.strat) %>%
    sample_frac(.1, replace = FALSE)

# Kept unchanged: a later confidence-interval cell reads N as the number of
# player-season rows.
N = nrow(nbaPlayers)

# Per-stratum population size, sample size and number of sampled Ivy League
# players. Counting per stratum id keeps the three vectors aligned even if a
# stratum is empty; picks beyond 60 (strata >= 7) are left out as before.
strata = 0:6
N.h = sapply(strata, function(h) sum(all.players$draft.strat == h, na.rm = TRUE))
n.h = sapply(strata, function(h) sum(stratified.player.by.draft.num$draft.strat == h, na.rm = TRUE))
A.n.h = sapply(strata, function(h) {
    sum(stratified.player.by.draft.num$draft.strat == h &
        stratified.player.by.draft.num$college %in% ivy.league, na.rm = TRUE)
})
p.n.h = A.n.h / n.h

# Stratum weights are N.h / (number of players in the strata). Dividing by
# nrow(nbaPlayers), which counts player-SEASONS, made the weights sum to less
# than 1 and biased the estimate towards 0.
N.players = sum(N.h)
p.str = 1/N.players * sum(p.n.h * N.h)
# A stratum with fewer than two sampled players gives NaN/Inf here.
D.p.str.hat = 1/N.players^2 * sum(N.h * (N.h - n.h) / (n.h - 1) * p.n.h * (1-p.n.h)) 

cat('Ocenjena proporcija strat uzorkom: ', p.str, '\n')
cat('Ocena disperzije ocene: ', D.p.str.hat)

In [None]:
# Mean of the per-player scoring averages among Ivy League alumni, followed by
# the matching rows themselves.
all.players %>%
    filter(college %in% ivy.league) %>%
    pull(pts) %>%
    mean()
print(filter(all.players, college %in% ivy.league))


In [None]:
# Proportion of non-US players estimated from a simple random sample of n rows
# drawn WITH replacement.
n = 200
srswr.sample = select(
    sample_n(nbaPlayers, size = n, replace = T),
    player_name, pts, country
)

# Rows per country, USA dropped, remaining counts added up (1x1 table, column t).
a.n = srswr.sample %>%
    group_by(country) %>%
    summarise(nrow = n()) %>%
    filter(country != 'USA') %>%
    summarise(t = sum(nrow))

p.n = as.numeric(a.n) / n
cat('Ocena proporcije neamerkickih igraca: ', p.n , '\n')
# p(1-p)/(n-1) is the estimate of the variance of the sample proportion.
cat('Ocena disperzije ocene: ', p.n * (1 - p.n) / (n - 1))

In [None]:
# Points by country in the with-replacement sample; axes are flipped so the
# country names run down the side.
ggplot(data = srswr.sample, mapping = aes(x = country, y = pts)) +
    geom_boxplot() +
    coord_flip()

In [None]:
# 90% confidence interval for the proportion of non-US players: normal
# approximation with finite population correction and continuity correction
# 1/(2n). Reads p.n and n from the proportion cell above and N from an earlier
# cell.
# alpha is the significance level. It used to hold the confidence level 0.90,
# so qnorm(1 - alpha/2) = qnorm(0.55) produced a far too narrow interval.
alpha = 1 - 0.90
z = qnorm(1-alpha/2)
# NOTE(review): srswr.sample was drawn with replacement, while the (N-n)/N
# factor belongs to sampling without replacement - confirm which is intended.
c(p.n - z * sqrt((N-n) / N / (n-1) * p.n * (1-p.n)) - 1/(2*n),
 p.n + z * sqrt((N-n) / N / (n-1) * p.n * (1-p.n)) + 1/(2*n))

In [None]:
#source: https://www.thehoopsgeek.com/average-nba-height/
# Height classes in cm (30.48 cm per foot): 5'5", 5'9", 5'10", 5'11", then
# every inch from 6'0" to 7'6".
height = 30.48 * c(5 + c(5, 9, 10, 11)/12, 6 + (0:11)/12, 7 + (0:6)/12)
# Number of players recorded for each height class, in the same order.
num.of.players = c(13, 21, 58, 90,
                   282, 423, 386, 688, 591, 663, 899, 1017, 968, 1344, 1013, 906,
                   595, 158, 80, 41, 6, 4, 17)
# Mean height weighted by the class counts.
avg.height = weighted.mean(height, num.of.players)
avg.height

In [None]:
# 100 rows drawn with replacement, selection weighted by player_weight, and a
# histogram of the sampled weights.
sampled.weight.players = sample_n(nbaPlayers, size = 100, replace = TRUE, weight = player_weight)
ggplot(sampled.weight.players, aes(player_weight)) +
    geom_histogram(color = "red", fill = "yellow", show.legend = TRUE) +
    theme_classic()

In [None]:
# Ratio estimates of the mean offensive / defensive rebound percentage, using
# player height as the auxiliary variable (its mean avg.height comes from the
# external height table).
total.weight = sum(nbaPlayers$player_weight)
# Per-draw selection probability of every sampled row (the draw was weighted
# by player_weight).
p = sampled.weight.players$player_weight / total.weight
n = 100
# Probability that a row is selected at least once in n draws with replacement.
# NOTE: this masks R's built-in constant `pi`; the name is kept because the
# variance cell below reads it.
pi = 1- (1 - p)^n
# The rows were selected with unequal probabilities, so numerator and
# denominator are Horvitz-Thompson totals (each value divided by its inclusion
# probability). The unweighted sums used before over-represent heavy players.
# NOTE(review): rows drawn more than once are counted once per draw here.
R.hat.oreb = sum(sampled.weight.players$oreb_pct / pi) / sum(sampled.weight.players$player_height / pi)
R.hat.dreb = sum(sampled.weight.players$dreb_pct / pi) / sum(sampled.weight.players$player_height / pi)
oreb.mean.est = avg.height * R.hat.oreb
dreb.mean.est = avg.height * R.hat.dreb
cat(c(oreb.mean.est, dreb.mean.est))

In [None]:
# Estimated variance of the two ratio estimates (oreb.mean.est, dreb.mean.est).
# Reads p, pi, n, R.hat.oreb, R.hat.dreb, avg.height, height and
# num.of.players from the cells above.

# Joint inclusion probabilities for n draws with replacement:
# pi_ij = pi_i + pi_j - 1 + (1 - p_i - p_j)^n, computed for all pairs at once.
pi.i.j = outer(pi, pi, '+') - 1 + (1 - outer(p, p, '+'))^n

# Variance estimate for one response column y with estimated ratio R.hat.
est.D.ratio.mean = function(y, R.hat) {
    # Residuals of the ratio model y = R * height
    e = y - R.hat * sampled.weight.players$player_height
    single.terms = sum((1 - pi) / pi^2 * e^2)
    pi.prod = outer(pi, pi)
    pair.terms = (pi.i.j - pi.prod) / pi.prod * outer(e, e) / pi.i.j
    diag(pair.terms) = 0   # i == j is already covered by single.terms
    # avg.height^2 / (total height)^2 scales the WHOLE bracket. It used to be
    # added to the sum instead of multiplying it.
    avg.height^2 / (sum(height * num.of.players)^2) * (single.terms + sum(pair.terms))
}

est.D.oreb.mean.est = est.D.ratio.mean(sampled.weight.players$oreb_pct, R.hat.oreb)
est.D.dreb.mean.est = est.D.ratio.mean(sampled.weight.players$dreb_pct, R.hat.dreb)

cat(est.D.oreb.mean.est, ' ', est.D.dreb.mean.est)


In [None]:
# Number of distinct players that appeared for each franchise.
# Abbreviations that belong together are mapped to one joint label BEFORE the
# (player, team) pairs are de-duplicated. Adding up per-abbreviation counts, as
# was done before, counted a player twice when he appeared under two
# abbreviations of the same franchise (e.g. NJN and BKN).
franchise.label = c(BKN = 'BKN/NJN', NJN = 'BKN/NJN',
                    VAN = 'VAN/MEM', MEM = 'VAN/MEM',
                    NOP = 'NOP/NOH/NOK', NOH = 'NOP/NOH/NOK', NOK = 'NOP/NOH/NOK',
                    CHA = 'CHA/CHH', CHH = 'CHA/CHH',
                    SEA = 'SEA/OKC', OKC = 'SEA/OKC')

team.num.of.players = nbaPlayers %>% 
    mutate(team_abbreviation = as.character(team_abbreviation),
           team_abbreviation = ifelse(team_abbreviation %in% names(franchise.label),
                                      franchise.label[team_abbreviation],
                                      team_abbreviation)) %>%
    distinct(player_name, team_abbreviation) %>% 
    group_by(team_abbreviation) %>% 
    summarise(nrow = n())

In [None]:
# 95% confidence interval for the mean number of players per franchise, based
# on a 10% simple random sample (without replacement) of the franchises.
xi = team.num.of.players %>% 
    sample_frac(0.1, replace = FALSE) %>% 
    pull(nrow)

# Population and sample size taken from the data instead of the hard-coded
# 30 and 3, so they always match what was actually sampled.
N = nrow(team.num.of.players)
n = length(xi)
# alpha is the significance level. It used to hold the confidence level .95,
# so qt(1 - alpha/2, ...) returned the 52.5% quantile instead of the 97.5% one.
alpha = 1 - .95

xn.est = mean(xi)
t = qt(1-alpha/2, n-1)
c(xn.est - t * sd(xi) / sqrt(n) * sqrt(1 - n/N), xn.est + t * sd(xi) / sqrt(n) * sqrt(1 - n/N)) 