In [1]:
import getml
from challenge.utils.data import load_ctu_dataset

# Name of the getML project that all subsequent cells of this notebook use.
PROJECT_NAME = "db_transformer_accidents"
getml.set_project(PROJECT_NAME)

# Task: Accidents
### Dataset Description
> <span style="font-weight: 500; color: #3b3b3b;">ⓘ️&nbsp; Generated by `gpt-4o`</span>
>
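> The *Accidents* dataset contains records of traffic accidents in Slovenia. It is commonly described as covering Ljubljana from 1995 to 2005, but the rows shown below include other places (e.g. Maribor, Izola, Idrija) and dates up to December 2006.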
> 
> - **Data Model:**
>   - *oseba* table:
>     - `id_nesreca` (char): Accident ID.
>     - Various columns for personal details and test results (char, tinyint, decimal).
>   - *nesreca* table:
>     - `id_nesreca` (char): Accident ID.
>     - `klas_nesreca` (char): Accident classification.
>     - Various columns for accident details, location, and conditions (char, datetime, int, double).
>   - *upravna_enota* table:
>     - `id_upravna_enota` (char): Administrative unit ID.
>     - `ime_upravna_enota` (varchar): Name of the administrative unit.
>     - `st_prebivalcev` (int): Population count.
>     - `povrsina` (smallint): Area size.
> 
> - **Task:**
>   - The primary task is *classification*, with the target column being `klas_nesreca` in the *nesreca* table.
> 
> - **Column Types:**
>   - Character (`char`) for IDs and classifications.
>   - Tiny integer (`tinyint`) for small numeric values.
>   - Decimal (`decimal`) for precise numeric values.
>   - DateTime (`datetime`) for temporal data.
>   - Integer (`int`) for numeric data.
>   - Double (`double`) for geographic coordinates.
>   - Variable character (`varchar`) for names.
> 
> - **Metadata:**
>   - The dataset is real and contains missing values.
>   - It consists of 3 tables with a total of 1,453,650 rows and 43 columns.
>   - The dataset size is approximately 234.5 MB.
>   - There are 503,362 instances, with the target table being *nesreca*.
> 
> This dataset is useful for government and public safety analysis, focusing on traffic accident patterns and contributing factors.

### Tables
Population table: nesreca

<h4>
  <details open>
     <summary>ER Diagram</summary>
       <img src="https://relational.fel.cvut.cz/assets/img/datasets-generated/Accidents.svg" alt="Accidents ER Diagram">
   </details>
</h4>

To load the dataset, we use the `load_ctu_dataset` function from the `utils`
module. This function returns a tuple with the population table as the first
element and a dictionary of peripheral tables as the second element.

In [2]:
# Load the Accidents dataset: the population table plus a dictionary of
# peripheral tables.
nesreca, peripheral = load_ctu_dataset("Accidents")

# Look the peripheral tables up by name instead of unpacking
# `peripheral.values()` positionally: a change in the dictionary's ordering
# can then no longer silently swap the two tables, and a missing table
# raises a KeyError right here.
oseba = peripheral["oseba"]
upravna_enota = peripheral["upravna_enota"]

Analyzing schema:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading tables:   0%|          | 0/3 [00:00<?, ?it/s]

Building data:   0%|          | 0/3 [00:00<?, ?it/s]

Now, we can inspect all tables and annotate the columns with [roles](https://getml.com/latest/user_guide/concepts/annotating_data/).

The population table (`nesreca`). We already set the `target` role for the target (`klas_nesreca`). If the task is a multiclass classification,
we split the target column into multiple columns in a one-vs-all fashion. In this case, the original target is still available as `klas_nesreca`.

In [3]:
# TODO: Annotate remaining columns with roles
# So far only the one-vs-all `klas_nesreca=<k>` columns carry the `target`
# role; the original `klas_nesreca` is `unused_float` and every other column
# is still `unused_string`.
nesreca

name,klas_nesreca=0,klas_nesreca=1,klas_nesreca=2,klas_nesreca=3,klas_nesreca=4,klas_nesreca=5,klas_nesreca,id_nesreca,upravna_enota,cas_nesreca,naselje_ali_izven,kategorija_cesta,oznaka_cesta_ali_naselje,tekst_cesta_ali_naselje,oznaka_odsek_ali_ulica,tekst_odsek_ali_ulica,stacionazna_ali_hisna_st,opis_prizorisce,vzrok_nesreca,tip_nesreca,vreme_nesreca,stanje_promet,stanje_vozisce,stanje_povrsina_vozisce,x,y,x_wgs84,y_wgs84,split
role,target,target,target,target,target,target,unused_float,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string
0.0,1,0,0,0,0,0,0,036738,5564,1995-01-01 15:30:00.000000,D,V,64043,PERNICA,00000,NI ULIC,,C,PR,BT,O,R,MO,A,556102,159758,15.7271485845696,46.5795099965885,train
1.0,0,1,0,0,0,0,1,036744,5511,1995-01-02 12:45:00.000000,N,L,93117,IDRIJA-KOČEVŠE-KODER-VOJS,00000,NI ODSEKOV,,C,HI,BT,J,N,MO,A,423380,95390,14.0060339182733,45.998372031671,train
2.0,0,0,1,0,0,0,2,036755,5564,1995-01-01 06:00:00.000000,N,M,00003,MEJA A-VIČ-ORMOŽ-MEJA RH,00246,MARIBOR(TEZNO)-HAJDINA,,C,SV,ÈT,O,R,MO,A,555372,150260,15.716499936082,46.4941258818011,train
3.0,0,1,0,0,0,0,1,036756,5511,1995-01-02 20:10:00.000000,D,M,010-0,MEJA I-ROBIČ - KALCE,01034,SP.IDRIJA-GODOVIČ,0001,K,SV,ÈT,J,N,PN,A,425085,96093,14.0279317718935,46.0048857243671,train
4.0,0,1,0,0,0,0,1,036764,5524,1995-01-03 00:15:00.000000,D,N,25001,LJUBLJANA,28067,JADRANSKA ULICA,,R,PD,ÈT,O,R,SP,A,460758,100064,14.4881741063497,46.0436029798011,val
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508988.0,0,1,0,0,0,0,1,549800,5513,2006-12-19 22:00:00.000000,D,N,13004,IZOLA,00036,PITTONIJEVA ULICA,3,N,SV,TV,J,R,SU,A,395782,44457,13.6608356763765,45.5365522649472,val
508989.0,0,1,0,0,0,0,1,549802,5517,2006-12-20 14:00:00.000000,D,L,93750,177112 IZOLA-BOLN. IZOLA,93750,NI ODSEKOV,500,C,PD,BT,J,N,SU,A,399890,44778,13.7133553386726,45.5400444582009,val
508990.0,0,1,0,0,0,0,1,549803,5513,2006-12-04 13:00:00.000000,D,N,13004,IZOLA,00095,POLJE,35,N,VR,NT,O,N,MO,A,397155,44707,13.6783582421227,45.5390059636959,train
508991.0,0,1,0,0,0,0,1,549804,5513,2006-12-21 20:00:00.000000,D,N,13004,IZOLA,00005,CANKARJEV DREVORED,9,N,OS,NT,O,N,SU,A,395841,44524,13.6615766082626,45.537163839863,val


The peripheral tables (`oseba` and `upravna_enota`):

In [4]:
# TODO: Annotate columns with roles
# Every column of `oseba` still has the `unused_string` role. `id_nesreca`
# is the column it has in common with the population table `nesreca`.
oseba

name,id_nesreca,povzrocitelj_ali_udelezenec,starost,spol,upravna_enota,drzavljanstvo,poskodba,vrsta_udelezenca,varnostni_pas_ali_celada,vozniski_staz_LL,vozniski_staz_MM,alkotest,strokovni_pregled,starost_d,vozniski_staz_d,alkotest_d,strokovni_pregled_d
role,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string,unused_string
0.0,036738,D,38,1,5507,005,L,TV,N,6,,0.11,0.08,D,B,B,B
1.0,036738,D,28,1,5599,211,P,OA,N,10,,0,0,C,B,A,A
2.0,036738,N,12,2,5507,005,H,PT,0,0,,,,B,A,N,N
3.0,036744,D,24,1,5511,005,B,OA,D,6,,,,C,B,N,N
4.0,036744,N,27,1,5524,005,B,OA,D,6,,,,C,B,N,N
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954031.0,549803,N,18,2,5501,005,B,OA,1,,,0,0,B,N,A,A
954032.0,549804,D,27,1,5513,005,B,OA,1,8,9,0,0,C,B,A,A
954033.0,549804,N,25,1,5513,005,B,OA,1,6,10,0,0,C,B,A,A
954034.0,549805,D,41,1,5540,005,B,OA,1,4,1,0.96,0,E,A,C,A


In [5]:
# TODO: Annotate columns with roles
# Every column of `upravna_enota` still has the `unused_string` role.
# NOTE(review): `id_upravna_enota` presumably matches the `upravna_enota`
# column of `nesreca` and `oseba` -- confirm against the ER diagram.
upravna_enota

name,id_upravna_enota,ime_upravna_enota,st_prebivalcev,povrsina
role,unused_string,unused_string,unused_string,unused_string
0.0,5501,Ajdovščina,23507,353
1.0,5502,Brežice,23253,268
2.0,5503,Celje,62049,230
3.0,5504,Cerknica,16155,483
4.0,5505,Črnomelj,18290,486
,...,...,...,...
59.0,5564,Maribor,145678,356
60.0,5565,Pesnica,19750,171
61.0,5568,Ruše,15054,209
62.0,5598,MNZ,0,0


The next step is to define the data model. Refer to [https://relational.fel.cvut.cz/dataset/Accidents](https://relational.fel.cvut.cz/dataset/Accidents)
for a description of the dataset.

In [6]:
# Abstract placeholders for the population table and the peripheral tables.
population_placeholder = nesreca.to_placeholder()
peripheral_placeholders = getml.data.to_placeholder(**peripheral)

# The data model starts from the population placeholder; the peripheral
# placeholders are registered so they can be joined to it afterwards.
dm = getml.data.DataModel(population=population_placeholder)
dm.add(peripheral_placeholders)

# TODO
# dm.population.join(...)

Now we can create the container and add the tables to it.

In [7]:
# The population table is divided into subsets by its `split` column.
container = getml.data.Container(
    population=nesreca,
    split=nesreca.split,
)
# Each peripheral table is added under its dictionary key.
container.add(**peripheral)

container

Unnamed: 0,subset,name,rows,type
0,train,nesreca,356296,View
1,val,nesreca,152697,View

Unnamed: 0,name,rows,type
0,oseba,954036,DataFrame
1,upravna_enota,64,DataFrame
