In [3]:
library(readxl)
library(dplyr)


In [5]:
# Read the fuel price data from the Excel workbook
fuel_prices <- read_excel(path = "fuel_prices_tidy.xlsx")

In [7]:
# Set every settlement-type dummy to 0 where is_motorway == 1, and derive
# popper_modified, which keeps pop_per_station only where
# nearest_distance_km equals 0 (and is 0 otherwise).
fuel_prices <- mutate(
  fuel_prices,
  across(
    c(
      dummy_fovarosi_kerulet, dummy_kozseg, dummy_megyei_jogu_varos,
      dummy_megye_szekhely_megyei_jogu_varos, dummy_nagykkozseg, dummy_varos
    ),
    function(col) ifelse(is_motorway == 1, 0, col)
  ),
  popper_modified = ifelse(nearest_distance_km == 0, pop_per_station, 0)
)


Gasoline

In [8]:
# Csomagok
library(dplyr)

# Full OLS specification for the gasoline price.
# dummy_fovarosi_kerulet is not among the regressors.
model_full <- lm(
  formula = Gasoline ~
    # continuous regressors
    Population + Dwellings + pop_per_station + popper_modified +
    is_motorway + nearest_diff_brand_km + nearest_distance_km +
    income + nearest_refinery_km +
    # settlement-type dummies
    dummy_kozseg + dummy_megyei_jogu_varos +
    dummy_megye_szekhely_megyei_jogu_varos +
    dummy_nagykkozseg + dummy_varos +
    # brand dummies
    dummy_Mol + dummy_Shell + dummy_Omv + dummy_Orlen +
    dummy_Mobil_Petrol + dummy_Oil + dummy_Avia +
    dummy_Mol_Partner + dummy_Auchan + dummy_Volnbusz_Zrt +
    dummy_Hunpetrol_2018_Kft + dummy_Edo +
    dummy_Oranges_Oil_Company + dummy_Concordia_Trans_Kft +
    dummy_Oplus_Kft + dummy_TeleTank + dummy_GROVI_Kft,
  data = fuel_prices
)

# Print the regression summary
summary(model_full)



Call:
lm(formula = Gasoline ~ Population + Dwellings + pop_per_station + 
    popper_modified + is_motorway + nearest_diff_brand_km + nearest_distance_km + 
    income + nearest_refinery_km + dummy_kozseg + dummy_megyei_jogu_varos + 
    dummy_megye_szekhely_megyei_jogu_varos + dummy_nagykkozseg + 
    dummy_varos + dummy_Mol + dummy_Shell + dummy_Omv + dummy_Orlen + 
    dummy_Mobil_Petrol + dummy_Oil + dummy_Avia + dummy_Mol_Partner + 
    dummy_Auchan + dummy_Volnbusz_Zrt + dummy_Hunpetrol_2018_Kft + 
    dummy_Edo + dummy_Oranges_Oil_Company + dummy_Concordia_Trans_Kft + 
    dummy_Oplus_Kft + dummy_TeleTank + dummy_GROVI_Kft, data = fuel_prices)

Residuals:
    Min      1Q  Median      3Q     Max 
-56.379  -0.919   0.047   1.219  44.599 

Coefficients:
                                         Estimate Std. Error t value Pr(>|t|)
(Intercept)                             5.870e+02  4.373e+00 134.240  < 2e-16
Population                              1.223e-05  2.851e-05   0.429  0.668

In [9]:
library(broom)
library(dplyr)

# Regresszió (ha már lefuttattad, ezt nem kell újra futtatni)
# model_full <- lm(...)

# Coefficients ordered by ascending p-value (most significant first).
# The table is meant to show every term: a fixed n = 30 cut off the last row
# of the 31-term table, so print without a row limit instead.
tidy(model_full) %>%
  filter(term != "(Intercept)") %>%                  # leave out the constant
  arrange(p.value) %>%                               # ascending p-value
  transmute(term, p_value = round(p.value, 5)) %>%   # p-value rounded to 5 decimals
  print(n = Inf)                                     # print all rows


[90m# A tibble: 31 × 2[39m
   term                                   p_value
   [3m[90m<chr>[39m[23m                                    [3m[90m<dbl>[39m[23m
[90m 1[39m is_motorway                            0      
[90m 2[39m dummy_Mol                              0      
[90m 3[39m dummy_Shell                            0      
[90m 4[39m dummy_Auchan                           0      
[90m 5[39m dummy_Omv                              0      
[90m 6[39m dummy_Orlen                            0      
[90m 7[39m dummy_Mobil_Petrol                     0.006[4m1[24m[4m1[24m
[90m 8[39m nearest_distance_km                    0.014[4m0[24m 
[90m 9[39m dummy_megyei_jogu_varos                0.018[4m2[24m 
[90m10[39m income                                 0.026[4m3[24m 
[90m11[39m dummy_megye_szekhely_megyei_jogu_varos 0.029[4m5[24m 
[90m12[39m dummy_Edo                              0.052[4m2[24m 
[90m13[39m nearest_diff_brand_km                

In [10]:
library(broom)
library(dplyr)

# 1) Extract the per-coefficient p-values and flag the dummy_* terms
coef_df <- tidy(model_full) %>%
  filter(term != "(Intercept)") %>%
  mutate(is_dummy = grepl("^dummy_", term))

# --- how many of the least significant dummies to drop:
n <- 5

# 2) Select the n dummy_* terms with the largest p-values.
#    slice_head() caps at the number of available rows on its own, so there
#    is no need to combine dplyr's n() with the global n.
drop_vars <- coef_df %>%
  filter(is_dummy) %>%
  arrange(desc(p.value)) %>%
  slice_head(n = n) %>%
  pull(term)

cat("Ezeket a dummy-kat dobjuk:\n")
print(drop_vars)

# 3) Reduced model: same response as model_full, its regressors minus drop_vars.
#    The response is taken from model_full's own formula so that the two
#    models compared below always share the same left-hand side.
full_formula <- formula(model_full)
vars_full    <- attr(terms(full_formula), "term.labels")
vars_reduced <- setdiff(vars_full, drop_vars)
if (length(vars_reduced) == 0) {
  vars_reduced <- "1"   # everything dropped: fall back to intercept-only
}

formula_reduced <- reformulate(vars_reduced, response = full_formula[[2]])

model_reduced <- lm(formula_reduced, data = fuel_prices)

# 4) Joint significance test (F-test) of the dropped terms
anova_test <- anova(model_reduced, model_full)
print(anova_test)




Ezeket a dummy-kat dobjuk:
[1] "dummy_Concordia_Trans_Kft" "dummy_kozseg"             
[3] "dummy_GROVI_Kft"           "dummy_nagykkozseg"        
[5] "dummy_Oil"                
Analysis of Variance Table

Model 1: Gasoline ~ Population + Dwellings + pop_per_station + popper_modified + 
    is_motorway + nearest_diff_brand_km + nearest_distance_km + 
    income + nearest_refinery_km + dummy_megyei_jogu_varos + 
    dummy_megye_szekhely_megyei_jogu_varos + dummy_varos + dummy_Mol + 
    dummy_Shell + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + 
    dummy_Avia + dummy_Mol_Partner + dummy_Auchan + dummy_Volnbusz_Zrt + 
    dummy_Hunpetrol_2018_Kft + dummy_Edo + dummy_Oranges_Oil_Company + 
    dummy_Oplus_Kft + dummy_TeleTank
Model 2: Gasoline ~ Population + Dwellings + pop_per_station + popper_modified + 
    is_motorway + nearest_diff_brand_km + nearest_distance_km + 
    income + nearest_refinery_km + dummy_kozseg + dummy_megyei_jogu_varos + 
    dummy_megye_szekhely_megyei_jogu_va

In [11]:
# 4) F-test comparing the nested models model_reduced and model_full
anova_test <- anova(object = model_reduced, model_full)
print(x = anova_test)

Analysis of Variance Table

Model 1: Gasoline ~ Population + Dwellings + pop_per_station + popper_modified + 
    is_motorway + nearest_diff_brand_km + nearest_distance_km + 
    income + nearest_refinery_km + dummy_megyei_jogu_varos + 
    dummy_megye_szekhely_megyei_jogu_varos + dummy_varos + dummy_Mol + 
    dummy_Shell + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + 
    dummy_Avia + dummy_Mol_Partner + dummy_Auchan + dummy_Volnbusz_Zrt + 
    dummy_Hunpetrol_2018_Kft + dummy_Edo + dummy_Oranges_Oil_Company + 
    dummy_Oplus_Kft + dummy_TeleTank
Model 2: Gasoline ~ Population + Dwellings + pop_per_station + popper_modified + 
    is_motorway + nearest_diff_brand_km + nearest_distance_km + 
    income + nearest_refinery_km + dummy_kozseg + dummy_megyei_jogu_varos + 
    dummy_megye_szekhely_megyei_jogu_varos + dummy_nagykkozseg + 
    dummy_varos + dummy_Mol + dummy_Shell + dummy_Omv + dummy_Orlen + 
    dummy_Mobil_Petrol + dummy_Oil + dummy_Avia + dummy_Mol_Partner + 
    dummy_

In [16]:
# Add log_* versions of the continuous regressors.
# log() is applied to Population, Dwellings, income and nearest_refinery_km;
# log1p() is applied to the station-density and distance columns.
# The three across() calls are ordered so the new columns are appended in the
# same order as before.
fuel_prices <- mutate(
  fuel_prices,
  across(c(Population, Dwellings), log, .names = "log_{.col}"),
  across(
    c(
      pop_per_station, popper_modified,
      nearest_diff_brand_km, nearest_distance_km
    ),
    log1p,
    .names = "log_{.col}"
  ),
  across(c(income, nearest_refinery_km), log, .names = "log_{.col}")
)


In [35]:
# Category 1: variables treated as certainly useful
sure_useful_vars <- c(
  "is_motorway", "dummy_Mol", "dummy_Shell", "dummy_Auchan", "dummy_Omv",
  "dummy_Orlen", "dummy_Mobil_Petrol", "dummy_nagykkozseg", "dummy_kozseg",
  "dummy_Edo", "dummy_Oplus_Kft"
)

# Category 2: population-density / station-density variables
density_vars <- c(
  "pop_per_station", "popper_modified",
  "log_pop_per_station", "log_popper_modified"
)

# Category 3: distance-to-competitor / spatial location variables
competition_vars <- c(
  "log_nearest_diff_brand_km", "log_nearest_distance_km",
  "nearest_diff_brand_km", "nearest_distance_km"
)

# Category 4: demographic variables
demographic_vars <- c(
  "Population", "Dwellings", "log_Population", "log_Dwellings"
)

# Category 5: income variables
income_vars <- c("income", "log_income")

# Category 6: useless / non-significant variables
useless_vars <- c(
  "dummy_Concordia_Trans_Kft", "dummy_GROVI_Kft", "dummy_Oil",
  "dummy_Mol_Partner", "dummy_megye_szekhely_megyei_jogu_varos",
  "dummy_megyei_jogu_varos"
)

# Category 7: refinery proximity / logistics factors
refinery_vars <- c("nearest_refinery_km", "log_nearest_refinery_km")

# Category 8: other variables
other_vars <- c(
  "dummy_varos", "dummy_Oranges_Oil_Company", "dummy_TeleTank",
  "dummy_Avia", "dummy_Volnbusz_Zrt", "dummy_Hunpetrol_2018_Kft"
)










In [25]:
# Baseline specification: Gasoline regressed on the sure_useful_vars set only
formula_sure <- reformulate(sure_useful_vars, response = "Gasoline")

# Fit the model
model_sure <- lm(formula = formula_sure, data = fuel_prices)

# Print the regression summary
summary(model_sure)


Call:
lm(formula = formula_sure, data = fuel_prices)

Residuals:
    Min      1Q  Median      3Q     Max 
-59.122  -0.485   0.248   0.600  45.208 

Coefficients:
                   Estimate Std. Error  t value Pr(>|t|)    
(Intercept)        577.7350     0.4434 1302.925  < 2e-16 ***
is_motorway         42.2865     0.7177   58.920  < 2e-16 ***
dummy_Mol            6.1786     0.5290   11.679  < 2e-16 ***
dummy_Shell          5.9036     0.6346    9.303  < 2e-16 ***
dummy_Auchan        -9.2481     1.4753   -6.268 5.11e-10 ***
dummy_Omv            4.5654     0.6414    7.118 1.90e-12 ***
dummy_Orlen          4.2658     0.6683    6.383 2.49e-10 ***
dummy_Mobil_Petrol  -3.0012     0.9845   -3.048  0.00235 ** 
dummy_nagykkozseg    2.4066     1.0016    2.403  0.01642 *  
dummy_kozseg         1.4860     0.6156    2.414  0.01594 *  
dummy_Edo           -4.4359     2.2107   -2.007  0.04502 *  
dummy_Oplus_Kft      5.0173     2.5423    1.974  0.04867 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 

In [20]:
# Feltételezzük, hogy a kategórialisták már léteznek a környezetben:
# sure_useful_vars, density_vars, competition_vars, demographic_vars,
# income_vars, refinery_vars, other_vars
# (És hogy a referencia-dummyk: dummy_fovarosi_kerulet és dummy_Other nincsenek a listákban.)

library(dplyr)
library(purrr)
library(tibble)

n <- 20

# 0) Sanity check: no variable may be listed in more than one category
all_vars_cat <- c(
  sure_useful_vars, density_vars, competition_vars,
  demographic_vars, income_vars, refinery_vars, other_vars
)
dups <- all_vars_cat[duplicated(all_vars_cat)]
if (anyDuplicated(all_vars_cat) > 0L) {
  stop("Duplikált változók a kategóriák között: ", paste(unique(dups), collapse = ", "))
}

# 1) Build the candidate picks that follow the selection rules.
#    pick0_or_1(v): "at most one variable" -> a list holding the empty pick
#    (character(0)) followed by each element of v on its own.
pick0_or_1 <- function(v) {
  append(list(character(0)), as.list(v))
}

# One "none or exactly one" choice list per single-pick category
density_choices     <- pick0_or_1(v = density_vars)
competition_choices <- pick0_or_1(v = competition_vars)
demographic_choices <- pick0_or_1(v = demographic_vars)
income_choices      <- pick0_or_1(v = income_vars)
refinery_choices    <- pick0_or_1(v = refinery_vars)

#    - other_vars: every subset is a candidate (including the empty one).
# power_set(v) returns a list of all subsets of v, ordered by subset size;
# an empty v yields just the empty subset.
power_set <- function(v) {
  if (!length(v)) {
    return(list(character(0)))
  }
  subsets_by_size <- lapply(
    seq.int(0L, length(v)),
    function(k) combn(v, k, simplify = FALSE)
  )
  do.call(c, subsets_by_size)
}
other_subsets <- power_set(other_vars)

# 2) Fit one OLS model per combination of picks and record its fit criteria.

# Label of a pick: "(none)" for an empty pick, otherwise its names joined by "+".
# Plain if/else is used because the condition is a scalar.
pick_label <- function(v) {
  if (length(v) == 0) "(none)" else paste(v, collapse = "+")
}

# Preallocate one result slot per combination
results <- vector("list", length(density_choices) *
                            length(competition_choices) *
                            length(demographic_choices) *
                            length(income_choices) *
                            length(refinery_choices) *
                            length(other_subsets))

ix <- 1L

# Optional: simple progress indicator
total <- length(results)
pb <- utils::txtProgressBar(min = 0, max = total, style = 3)

for (d in density_choices) {
  for (comp in competition_choices) {   # named `comp` so the variable does not shadow c()
    for (g in demographic_choices) {
      for (inc in income_choices) {
        for (r in refinery_choices) {
          for (o in other_subsets) {

            sel_vars <- c(sure_useful_vars, d, comp, g, inc, r, o)
            # Guard for an empty selection: fall back to intercept-only
            if (length(sel_vars) == 0) sel_vars <- "1"

            formula_str <- paste("Gasoline ~", paste(sel_vars, collapse = " + "))

            # A combination that fails to fit is kept with NA criteria
            # (best effort); it is filtered out after the loop.
            fit <- tryCatch(
              lm(as.formula(formula_str), data = fuel_prices),
              error = function(e) NULL
            )

            results[[ix]] <- tibble(
              BIC = if (is.null(fit)) NA_real_ else BIC(fit),
              AIC = if (is.null(fit)) NA_real_ else AIC(fit),
              n_vars = length(sel_vars),
              density_pick     = pick_label(d),
              competition_pick = pick_label(comp),
              demographic_pick = pick_label(g),
              income_pick      = pick_label(inc),
              refinery_pick    = pick_label(r),
              other_pick       = pick_label(o),
              formula_str      = formula_str
            )

            # Refresh every 250 models, and once more on the last one so the
            # bar always reaches 100%
            if (ix %% 250L == 0L || ix == total) utils::setTxtProgressBar(pb, ix)
            ix <- ix + 1L
          }
        }
      }
    }
  }
}
close(pb)

# 3) Combine the per-model rows and keep the n best models by BIC
res_df <- bind_rows(results)

# Drop models whose fit failed (their criteria were recorded as NA)
res_df <- res_df %>% filter(!is.na(BIC))

# The object keeps its historical name top15, but it holds the best n models
# (n is set at the top of this cell).
# slice_head() is used instead of slice(1:n): 1:n misbehaves for n == 0.
top15 <- res_df %>%
  arrange(BIC, AIC, n_vars) %>%   # primarily BIC, then AIC, then fewer variables
  slice_head(n = n) %>%
  select(
    BIC, AIC, n_vars,
    density_pick, competition_pick, demographic_pick,
    income_pick, refinery_pick, other_pick,
    formula_str
  )

# 4) Show the top-n table
print(top15, n = n)


[90m# A tibble: 20 × 10[39m
     BIC   AIC n_vars density_pick competition_pick demographic_pick income_pick
   [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m        [3m[90m<chr>[39m[23m            [3m[90m<chr>[39m[23m            [3m[90m<chr>[39m[23m      
[90m 1[39m [4m7[24m781. [4m7[24m720.     10 (none)       (none)           (none)           (none)     
[90m 2[39m [4m7[24m784. [4m7[24m718.     11 (none)       (none)           (none)           (none)     
[90m 3[39m [4m7[24m784. [4m7[24m718.     11 (none)       (none)           (none)           (none)     
[90m 4[39m [4m7[24m785. [4m7[24m718.     11 (none)       (none)           (none)           (none)     
[90m 5[39m [4m7[24m785. [4m7[24m719.     11 (none)       (none)           (none)           (none)     
[90m 6[39m [4m7[24m785. [4m7[24m719.     11 (none)       log_nearest_dis… (none)           (none)     
[90m 7[39m [4m7[2

In [33]:
library(rsample)
library(dplyr)
library(purrr)
library(tibble)
library(yardstick)

# Feltételezzük: res_df már létezik és tartalmazza a 36k modell BIC/AIC + formula_str mezőit

# Keep the 50 best models by BIC
top50 <- res_df %>%
  arrange(BIC) %>%
  slice_head(n = 50)

# 5-fold cross-validation on the WHOLE data set (no separate train/test split)
set.seed(120)
cv5 <- vfold_cv(data = fuel_prices, v = 5)

# Helper: mean out-of-sample R^2 of one model formula over the folds of cv_obj.
#   formula_str  two-sided model formula given as a string
#   cv_obj       resampling object whose $splits are iterated
#   data         not used by the body; kept so existing calls keep working
# Returns the mean of the per-fold rsq_trad values.
cv_r2 <- function(formula_str, cv_obj, data) {
  # Parse once instead of once per fold
  frm <- as.formula(formula_str)
  # Left-hand side of the formula. The truth values are taken from it rather
  # than from a fixed Gasoline column, so formulas with another response are
  # scored against their own response.
  response <- frm[[2]]

  fold_r2 <- vapply(cv_obj$splits, function(s) {
    train_fold <- analysis(s)
    valid_fold <- assessment(s)

    fit <- lm(frm, data = train_fold)
    preds <- predict(fit, newdata = valid_fold)
    yardstick::rsq_trad_vec(
      truth = eval(response, envir = valid_fold),
      estimate = preds
    )
  }, numeric(1))
  mean(fold_r2)
}

# Score the 50 best-BIC models by cross-validated R^2.
# The scores are added as columns of top50 instead of rebuilding every row by
# hand; this also keeps all columns present when top50 has no rows.
cv_results <- top50 %>%
  mutate(
    model_rank_BIC = row_number(),   # position in the BIC ordering of top50
    mean_cv_r2 = map_dbl(formula_str, function(f) cv_r2(f, cv5, fuel_prices))
  ) %>%
  select(
    model_rank_BIC, mean_cv_r2, BIC, AIC, n_vars,
    density_pick, competition_pick, demographic_pick,
    income_pick, refinery_pick, other_pick,
    formula_str
  )

# Ten best models by mean out-of-sample R^2 (ties broken by BIC, AIC, size)
top10_cv <- cv_results %>%
  arrange(desc(mean_cv_r2), BIC, AIC, n_vars) %>%
  slice_head(n = 10)

print(top10_cv, n = 10)


[90m# A tibble: 10 × 12[39m
   model_rank_BIC mean_cv_r2   BIC   AIC n_vars density_pick    competition_pick
            [3m[90m<int>[39m[23m      [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m           [3m[90m<chr>[39m[23m           
[90m 1[39m             14      0.765 [4m7[24m787. [4m7[24m716.     12 (none)          (none)          
[90m 2[39m             16      0.765 [4m7[24m787. [4m7[24m716.     12 (none)          (none)          
[90m 3[39m             31      0.765 [4m7[24m788. [4m7[24m717.     12 (none)          (none)          
[90m 4[39m             26      0.765 [4m7[24m788. [4m7[24m717.     12 log_popper_mod… (none)          
[90m 5[39m             25      0.765 [4m7[24m788. [4m7[24m717.     12 (none)          log_nearest_dis…
[90m 6[39m             45      0.765 [4m7[24m789. [4m7[24m718.     12 (none)          (none)          
[90m 7[39m         

In [34]:
# Best models by mean out-of-sample R^2.
# Despite the name top10_cv, up to 20 rows are kept here.
top10_cv <- cv_results %>%
  arrange(desc(mean_cv_r2), BIC, AIC, n_vars) %>%
  slice_head(n = 20)

top10_cv

model_rank_BIC,mean_cv_r2,BIC,AIC,n_vars,density_pick,competition_pick,demographic_pick,income_pick,refinery_pick,other_pick,formula_str
<int>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
14,0.7654157,7787.257,7716.101,12,(none),(none),(none),(none),(none),dummy_Avia+dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + dummy_Avia + dummy_Oplus_Kft
16,0.7652157,7787.368,7716.212,12,(none),(none),(none),(none),(none),dummy_Oplus_Kft+dummy_Hunpetrol_2018_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + dummy_Oplus_Kft + dummy_Hunpetrol_2018_Kft
31,0.7651112,7788.45,7717.294,12,(none),(none),(none),(none),(none),dummy_Volnbusz_Zrt+dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + dummy_Volnbusz_Zrt + dummy_Oplus_Kft
26,0.7650455,7787.883,7716.727,12,log_popper_modified,(none),(none),(none),(none),dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + log_popper_modified + dummy_Oplus_Kft
25,0.7650322,7787.79,7716.635,12,(none),log_nearest_distance_km,(none),(none),(none),dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + log_nearest_distance_km + dummy_Oplus_Kft
45,0.7650248,7789.383,7718.228,12,(none),(none),(none),(none),(none),dummy_Oranges_Oil_Company+dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + dummy_Oranges_Oil_Company + dummy_Oplus_Kft
50,0.765015,7789.507,7718.351,12,(none),(none),(none),(none),(none),dummy_TeleTank+dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + dummy_TeleTank + dummy_Oplus_Kft
32,0.7648634,7788.539,7717.383,12,(none),nearest_distance_km,(none),(none),(none),dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + nearest_distance_km + dummy_Oplus_Kft
40,0.7648405,7789.203,7718.048,12,(none),(none),(none),log_income,(none),dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + log_income + dummy_Oplus_Kft
46,0.7648224,7789.409,7718.253,12,(none),(none),(none),income,(none),dummy_Oplus_Kft,Gasoline ~ is_motorway + dummy_Mol + dummy_Shell + dummy_Auchan + dummy_Omv + dummy_Orlen + dummy_Mobil_Petrol + dummy_nagykkozseg + dummy_kozseg + dummy_Edo + income + dummy_Oplus_Kft


Diff

In [32]:
# Price gap column: Diesel minus Gasoline, computed row by row
fuel_prices <- mutate(fuel_prices, Diff = Diesel - Gasoline)

In [33]:
# OLS of the Diesel-Gasoline gap (Diff) on the motorway flag and all dummies.
# Note that this re-binds model_full, replacing the Gasoline model.
model_full <- lm(
  formula = Diff ~
    is_motorway +
    # settlement-type dummies
    dummy_kozseg + dummy_megyei_jogu_varos +
    dummy_megye_szekhely_megyei_jogu_varos +
    dummy_nagykkozseg + dummy_varos +
    # brand dummies
    dummy_Mol + dummy_Shell + dummy_Omv + dummy_Orlen +
    dummy_Mobil_Petrol + dummy_Oil + dummy_Avia +
    dummy_Mol_Partner + dummy_Auchan + dummy_Volnbusz_Zrt +
    dummy_Hunpetrol_2018_Kft + dummy_Edo +
    dummy_Oranges_Oil_Company + dummy_Concordia_Trans_Kft +
    dummy_Oplus_Kft + dummy_TeleTank + dummy_GROVI_Kft,
  data = fuel_prices
)

# Print the regression summary
summary(model_full)


Call:
lm(formula = Diff ~ is_motorway + dummy_kozseg + dummy_megyei_jogu_varos + 
    dummy_megye_szekhely_megyei_jogu_varos + dummy_nagykkozseg + 
    dummy_varos + dummy_Mol + dummy_Shell + dummy_Omv + dummy_Orlen + 
    dummy_Mobil_Petrol + dummy_Oil + dummy_Avia + dummy_Mol_Partner + 
    dummy_Auchan + dummy_Volnbusz_Zrt + dummy_Hunpetrol_2018_Kft + 
    dummy_Edo + dummy_Oranges_Oil_Company + dummy_Concordia_Trans_Kft + 
    dummy_Oplus_Kft + dummy_TeleTank + dummy_GROVI_Kft, data = fuel_prices)

Residuals:
     Min       1Q   Median       3Q      Max 
-28.4759  -0.1303   0.1021   0.3592  21.9933 

Coefficients:
                                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)                             5.28685    0.33424  15.818  < 2e-16 ***
is_motorway                            -0.14587    0.36021  -0.405 0.685571    
dummy_kozseg                           -0.16059    0.33544  -0.479 0.632209    
dummy_megyei_jogu_varos                 0.55124    0.

In [34]:
library(broom)
library(dplyr)

# Coefficients ordered by descending p-value (least significant first)
tidy(model_full) %>%
  filter(term != "(Intercept)") %>%                  # leave out the constant
  arrange(desc(p.value)) %>%                         # largest p-value on top
  transmute(term, p_value = round(p.value, 5)) %>%   # p-value rounded to 5 decimals
  print(n = 30)

[90m# A tibble: 23 × 2[39m
   term                                   p_value
   [3m[90m<chr>[39m[23m                                    [3m[90m<dbl>[39m[23m
[90m 1[39m dummy_Oranges_Oil_Company              0.985  
[90m 2[39m dummy_Avia                             0.974  
[90m 3[39m dummy_Mol_Partner                      0.918  
[90m 4[39m is_motorway                            0.686  
[90m 5[39m dummy_kozseg                           0.632  
[90m 6[39m dummy_megye_szekhely_megyei_jogu_varos 0.618  
[90m 7[39m dummy_TeleTank                         0.599  
[90m 8[39m dummy_nagykkozseg                      0.457  
[90m 9[39m dummy_varos                            0.235  
[90m10[39m dummy_megyei_jogu_varos                0.196  
[90m11[39m dummy_Hunpetrol_2018_Kft               0.128  
[90m12[39m dummy_Edo                              0.095[4m2[24m 
[90m13[39m dummy_Shell                            0.048[4m7[24m 
[90m14[39m dummy_GROVI_Kft       

In [40]:
library(estimatr)

# List of "sure" dummy variables.
# NOTE(review): this vector is assigned here, but the formula below is built
# from `useful_vars`, not from it — confirm which list is intended.
sure_useful_vars <- c(
  "dummy_Hunpetrol_2018_Kft", "dummy_Avia", "dummy_Edo", "dummy_Oplus_Kft",
  "dummy_nagykkozseg", "dummy_Mobil_Petrol", "dummy_Orlen", "dummy_Auchan",
  "dummy_Omv", "dummy_Shell", "dummy_Mol", "is_motorway"
)

# Assemble the model formula for the Diff response
f_str <- paste0("Diff ~ ", paste(useful_vars, collapse = " + "))
f_ml  <- as.formula(f_str)

# NOTE(review): this is an ordinary lm() fit; no robust (HC) standard errors
# are computed in this cell — confirm whether a robust estimator was intended.
m_log_gas <- lm(formula = f_ml, data = fuel_prices)

summary(m_log_gas)



Call:
lm(formula = f_ml, data = fuel_prices)

Residuals:
     Min       1Q   Median       3Q      Max 
-29.1141   0.0033   0.0717   0.5779  21.6905 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                5.92188    0.41303  14.338  < 2e-16 ***
dummy_GROVI_Kft           -3.09495    1.19900  -2.581 0.009963 ** 
dummy_Oil                  1.59070    0.52851   3.010 0.002670 ** 
dummy_Volnbusz_Zrt        -4.03243    1.01146  -3.987 7.11e-05 ***
dummy_Mol                  0.82286    0.19213   4.283 2.00e-05 ***
dummy_Orlen                1.00748    0.26441   3.810 0.000146 ***
dummy_Concordia_Trans_Kft  4.32834    1.09096   3.967 7.70e-05 ***
dummy_Auchan              -3.58701    0.62570  -5.733 1.25e-08 ***
dummy_Mobil_Petrol         2.03094    0.40314   5.038 5.44e-07 ***
dummy_Oplus_Kft           -7.83536    1.09595  -7.149 1.53e-12 ***
dummy_Omv                  2.15620    0.24610   8.762  < 2e-16 ***
log_nearest_refinery_km   -0.02

In [46]:
# Useful variables
useful_vars <- c(
  "dummy_GROVI_Kft", "dummy_Oil", "dummy_Volnbusz_Zrt", "dummy_Mol",
  "dummy_Orlen", "dummy_Concordia_Trans_Kft", "dummy_Auchan",
  "dummy_Mobil_Petrol", "dummy_Oplus_Kft", "dummy_Omv"
)

# Other variables
other_vars <- c(
  "dummy_varos", "dummy_megyei_jogu_varos", "dummy_Hunpetrol_2018_Kft",
  "dummy_Edo", "dummy_Shell"
)

# Useless variables
useless_vars <- c(
  "dummy_Oranges_Oil_Company", "dummy_Avia", "dummy_Mol_Partner",
  "is_motorway", "dummy_kozseg", "dummy_megye_szekhely_megyei_jogu_varos",
  "dummy_TeleTank", "dummy_nagykkozseg"
)

# Category 2: population-density / station-density variables
density_vars <- c(
  "pop_per_station", "popper_modified",
  "log_pop_per_station", "log_popper_modified"
)

# Category 3: distance-to-competitor / spatial location variables
competition_vars <- c(
  "log_nearest_diff_brand_km", "log_nearest_distance_km",
  "nearest_diff_brand_km", "nearest_distance_km"
)

# Category 4: demographic variables
demographic_vars <- c(
  "Population", "Dwellings", "log_Population", "log_Dwellings"
)

# Category 5: income variables
income_vars <- c("income", "log_income")

# Category 7: refinery proximity / logistics factors (left empty here)
refinery_vars <- NULL





In [47]:
# Assumes the category vectors already exist in the environment:
# useful_vars, density_vars, competition_vars, demographic_vars,
# income_vars, refinery_vars, other_vars
# (and that the reference dummies dummy_fovarosi_kerulet and dummy_Other are in none of these lists.)

library(dplyr)
library(purrr)
library(tibble)

n <- 20

# 0) Sanity check: no variable may be listed in more than one category
all_vars_cat <- c(
  useful_vars, density_vars, competition_vars,
  demographic_vars, income_vars, refinery_vars, other_vars
)
dups <- all_vars_cat[duplicated(all_vars_cat)]
if (anyDuplicated(all_vars_cat) > 0L) {
  stop("Duplikált változók a kategóriák között: ", paste(unique(dups), collapse = ", "))
}

# 1) Build the candidate picks that follow the selection rules.
#    pick0_or_1(v): "at most one variable" -> a list holding the empty pick
#    (character(0)) followed by each element of v on its own.
pick0_or_1 <- function(v) {
  append(list(character(0)), as.list(v))
}

# One "none or exactly one" choice list per single-pick category
density_choices     <- pick0_or_1(v = density_vars)
competition_choices <- pick0_or_1(v = competition_vars)
demographic_choices <- pick0_or_1(v = demographic_vars)
income_choices      <- pick0_or_1(v = income_vars)
refinery_choices    <- pick0_or_1(v = refinery_vars)

#    - other_vars: every subset is a candidate (including the empty one).
# power_set(v) returns a list of all subsets of v, ordered by subset size;
# an empty v yields just the empty subset.
power_set <- function(v) {
  if (!length(v)) {
    return(list(character(0)))
  }
  subsets_by_size <- lapply(
    seq.int(0L, length(v)),
    function(k) combn(v, k, simplify = FALSE)
  )
  do.call(c, subsets_by_size)
}
other_subsets <- power_set(other_vars)

# 2) Fit one OLS model per combination of picks and record its fit criteria.

# Label of a pick: "(none)" for an empty pick, otherwise its names joined by "+".
# Plain if/else is used because the condition is a scalar.
pick_label <- function(v) {
  if (length(v) == 0) "(none)" else paste(v, collapse = "+")
}

# Preallocate one result slot per combination
results <- vector("list", length(density_choices) *
                            length(competition_choices) *
                            length(demographic_choices) *
                            length(income_choices) *
                            length(refinery_choices) *
                            length(other_subsets))

ix <- 1L

# Optional: simple progress indicator
total <- length(results)
pb <- utils::txtProgressBar(min = 0, max = total, style = 3)

for (d in density_choices) {
  for (comp in competition_choices) {   # named `comp` so the variable does not shadow c()
    for (g in demographic_choices) {
      for (inc in income_choices) {
        for (r in refinery_choices) {
          for (o in other_subsets) {

            sel_vars <- c(useful_vars, d, comp, g, inc, r, o)
            # Guard for an empty selection: fall back to intercept-only
            if (length(sel_vars) == 0) sel_vars <- "1"

            formula_str <- paste("Diff ~", paste(sel_vars, collapse = " + "))

            # A combination that fails to fit is kept with NA criteria
            # (best effort); it is filtered out after the loop.
            fit <- tryCatch(
              lm(as.formula(formula_str), data = fuel_prices),
              error = function(e) NULL
            )

            results[[ix]] <- tibble(
              BIC = if (is.null(fit)) NA_real_ else BIC(fit),
              AIC = if (is.null(fit)) NA_real_ else AIC(fit),
              n_vars = length(sel_vars),
              density_pick     = pick_label(d),
              competition_pick = pick_label(comp),
              demographic_pick = pick_label(g),
              income_pick      = pick_label(inc),
              refinery_pick    = pick_label(r),
              other_pick       = pick_label(o),
              formula_str      = formula_str
            )

            # Refresh every 250 models, and once more on the last one so the
            # bar always reaches 100%
            if (ix %% 250L == 0L || ix == total) utils::setTxtProgressBar(pb, ix)
            ix <- ix + 1L
          }
        }
      }
    }
  }
}
close(pb)

# 3) Combine the per-model result rows and keep the n best-ranked models.
#    `n` is defined earlier in the file; the object keeps the name `top15`
#    even when n is not 15.
res_df <- bind_rows(results)

# Drop rows with a missing BIC (e.g. combinations whose fit failed and were
# recorded with NA).
res_df <- res_df %>% filter(!is.na(BIC))

top15 <- res_df %>%
  # Ranking used here: AIC first, then BIC, then fewer variables as the final
  # tie-break.
  # NOTE(review): the previous comment described the order as BIC -> AIC, but
  # the code ranks by AIC first. Confirm which criterion is intended before
  # relying on this table.
  arrange(AIC, BIC, n_vars) %>%
  # seq_len() selects no rows when n is 0, whereas 1:n would expand to c(1, 0)
  # and still return the first row.
  slice(seq_len(n)) %>%
  select(
    BIC, AIC, n_vars,
    density_pick, competition_pick, demographic_pick,
    income_pick, refinery_pick, other_pick,
    formula_str
  )

# 4) Print the selected models
print(top15, n = n)

[90m# A tibble: 20 × 10[39m
     BIC   AIC n_vars density_pick competition_pick demographic_pick income_pick
   [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m        [3m[90m<chr>[39m[23m            [3m[90m<chr>[39m[23m            [3m[90m<chr>[39m[23m      
[90m 1[39m [4m5[24m791. [4m5[24m710.     14 (none)       (none)           Population       (none)     
[90m 2[39m [4m5[24m791. [4m5[24m710.     14 (none)       (none)           Dwellings        (none)     
[90m 3[39m [4m5[24m792. [4m5[24m710.     14 (none)       (none)           (none)           (none)     
[90m 4[39m [4m5[24m797. [4m5[24m710.     15 (none)       (none)           (none)           (none)     
[90m 5[39m [4m5[24m787. [4m5[24m710.     13 (none)       (none)           (none)           (none)     
[90m 6[39m [4m5[24m797. [4m5[24m710.     15 (none)       (none)           Population       (none)     
[90m 7[39m [4m5[2

In [49]:
library(rsample)
library(dplyr)
library(purrr)
library(tibble)
library(yardstick)

# Assumption: res_df already exists and holds the BIC/AIC and formula_str
# fields of the 36k candidate models.

# 5-fold CV on the FULL data set (no separate train/test split)
set.seed(120)
cv5 <- vfold_cv(fuel_prices, v = 5)

# The 50 candidates with the lowest BIC
top50 <- slice(arrange(res_df, BIC), 1:50)

# Helper: mean out-of-sample R^2 of one model formula across the CV folds.
#   formula_str  model formula given as a string (converted with as.formula())
#   cv_obj       resampling object; each element of cv_obj$splits is passed to
#                analysis() and assessment()
#   data         not used inside the function; kept so existing calls still work
# Returns the mean of the per-fold R^2 values (response column: Diff).
cv_r2 <- function(formula_str, cv_obj, data) {
  score_fold <- function(split) {
    training <- analysis(split)
    holdout <- assessment(split)

    model <- lm(as.formula(formula_str), data = training)
    predicted <- predict(model, newdata = holdout)
    yardstick::rsq_trad_vec(truth = holdout$Diff, estimate = predicted)
  }

  per_fold <- map_dbl(cv_obj$splits, score_fold)
  mean(per_fold)
}

# Score each of the top-50 models with the shared 5-fold split
cv_results <- map_dfr(seq_len(nrow(top50)), function(rank_idx) {
  candidate <- top50[rank_idx, ]
  model_formula <- candidate$formula_str

  tibble(
    model_rank_BIC   = rank_idx,
    mean_cv_r2       = cv_r2(model_formula, cv5, fuel_prices),
    BIC              = candidate$BIC,
    AIC              = candidate$AIC,
    n_vars           = candidate$n_vars,
    density_pick     = candidate$density_pick,
    competition_pick = candidate$competition_pick,
    demographic_pick = candidate$demographic_pick,
    income_pick      = candidate$income_pick,
    refinery_pick    = candidate$refinery_pick,
    other_pick       = candidate$other_pick,
    formula_str      = model_formula
  )
})

# Ten best models by mean out-of-sample R^2; ties are broken by BIC, then AIC,
# then the number of variables
top10_cv <- slice(
  arrange(cv_results, desc(mean_cv_r2), BIC, AIC, n_vars),
  seq_len(10)
)

print(top10_cv, n = 10)


[90m# A tibble: 10 × 12[39m
   model_rank_BIC mean_cv_r2   BIC   AIC n_vars density_pick    competition_pick
            [3m[90m<int>[39m[23m      [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m  [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m           [3m[90m<chr>[39m[23m           
[90m 1[39m             22      0.159 [4m5[24m782. [4m5[24m711.     12 (none)          (none)          
[90m 2[39m             24      0.158 [4m5[24m783. [4m5[24m711.     12 (none)          (none)          
[90m 3[39m             27      0.158 [4m5[24m784. [4m5[24m713.     12 (none)          (none)          
[90m 4[39m              2      0.158 [4m5[24m778. [4m5[24m712.     11 (none)          (none)          
[90m 5[39m             26      0.158 [4m5[24m783. [4m5[24m712.     12 popper_modified (none)          
[90m 6[39m             28      0.158 [4m5[24m785. [4m5[24m713.     12 (none)          (none)          
[90m 7[39m         

In [50]:
# Auto-print the CV-ranked top-10 table built in the previous cell
top10_cv


model_rank_BIC,mean_cv_r2,BIC,AIC,n_vars,density_pick,competition_pick,demographic_pick,income_pick,refinery_pick,other_pick,formula_str
<int>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
22,0.1591422,5782.416,5711.26,12,(none),(none),(none),(none),(none),dummy_Hunpetrol_2018_Kft+dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_Hunpetrol_2018_Kft + dummy_Shell
24,0.1582876,5782.631,5711.476,12,(none),(none),(none),(none),(none),dummy_varos+dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_varos + dummy_Shell
27,0.15777,5783.898,5712.742,12,(none),(none),(none),(none),(none),dummy_megyei_jogu_varos+dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_megyei_jogu_varos + dummy_Shell
2,0.1577579,5777.984,5711.911,11,(none),(none),(none),(none),(none),dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_Shell
26,0.1577327,5783.478,5712.322,12,popper_modified,(none),(none),(none),(none),dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + popper_modified + dummy_Shell
28,0.1576312,5784.628,5713.472,12,(none),(none),(none),(none),(none),dummy_varos+dummy_megyei_jogu_varos,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_varos + dummy_megyei_jogu_varos
25,0.1576239,5782.702,5711.546,12,(none),(none),Dwellings,(none),(none),dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + Dwellings + dummy_Shell
41,0.1576131,5785.367,5714.211,12,(none),(none),(none),(none),(none),dummy_varos+dummy_Hunpetrol_2018_Kft,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_varos + dummy_Hunpetrol_2018_Kft
48,0.1575461,5786.115,5714.959,12,(none),(none),(none),(none),(none),dummy_megyei_jogu_varos+dummy_Hunpetrol_2018_Kft,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + dummy_megyei_jogu_varos + dummy_Hunpetrol_2018_Kft
23,0.1575138,5782.612,5711.456,12,(none),(none),Population,(none),(none),dummy_Shell,Diff ~ dummy_GROVI_Kft + dummy_Oil + dummy_Volnbusz_Zrt + dummy_Mol + dummy_Orlen + dummy_Concordia_Trans_Kft + dummy_Auchan + dummy_Mobil_Petrol + dummy_Oplus_Kft + dummy_Omv + Population + dummy_Shell
