# 02 – Pretprocesiranje i integracija podataka

U ovom notebooku provodimo:
- čišćenje i pretvorbu tipova (CSV + JSON->tablica)
- standardizaciju naziva stupaca
- integraciju u jedinstveni skup za pohranu u bazu (SQLite)


Učitavanje podataka (iz data_raw)

In [1]:
from pathlib import Path
import pandas as pd

# The notebook runs from notebooks/, so the project root is one level up and
# the raw CSV inputs sit in <project root>/data_raw.
PROJECT_ROOT = Path.cwd().parent
DATA_RAW = PROJECT_ROOT.joinpath("data_raw")

# Read the four raw CSV files in a fixed order and bind each to its short name.
dt, smp, tw, iu = (
    pd.read_csv(DATA_RAW / csv_name)
    for csv_name in (
        "DT.csv",
        "social_media_vs_productivity.csv",
        "Time-Wasters on Social Media.csv",
        "numberofinternetusers new.csv",
    )
)

# Report (rows, columns) of every loaded frame as a quick sanity check.
print("Loaded:")
for label, frame in (("DT:", dt), ("SMP:", smp), ("TW:", tw), ("IU:", iu)):
    print(label, frame.shape)


Loaded:
DT: (13, 2)
SMP: (30000, 19)
TW: (1000, 31)
IU: (6192, 4)


DT: Hours:Minutes -> minute

In [2]:
def hhmm_to_minutes(x):
    """Convert an ``"HH:MM"`` value into a total number of minutes.

    Parameters
    ----------
    x : object
        Value to convert. It is stringified and stripped of surrounding
        whitespace before parsing.

    Returns
    -------
    int or None
        ``hours * 60 + minutes``, or ``None`` when ``x`` is missing
        (per ``pd.isna``), contains no ``":"``, or either side of the first
        ``":"`` is not a valid integer. The minutes part is not range-checked.
    """
    if pd.isna(x):
        return None
    text = str(x).strip()
    # Expected format is "HH:MM"; a value without a colon cannot be parsed.
    if ":" not in text:
        return None
    hours, minutes = text.split(":", 1)
    try:
        return int(hours) * 60 + int(minutes)
    except ValueError:
        # Only malformed numbers count as "unparseable"; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return None

# Work on a copy so the raw `dt` frame stays untouched; the "HH:MM" text column
# is removed from the copy and replaced by its value converted to minutes.
dt_clean = dt.copy()
dt_clean["daily_minutes"] = dt_clean.pop("Daily Time (Hours:Minutes)").apply(hhmm_to_minutes)

display(dt_clean.head())

# Count missing values in the two remaining columns of interest.
null_counts = dt_clean[["Year", "daily_minutes"]].isna().sum()
print(null_counts)


Unnamed: 0,Year,daily_minutes
0,2012,90
1,2013,95
2,2014,104
3,2015,111
4,2016,128


Year             0
daily_minutes    0
dtype: int64


SMP: standardizacija i osnovno čišćenje

In [None]:
# Work on a copy so the raw `smp` frame stays untouched.
smp_clean = smp.copy()

# Standardize column names: trim, lowercase, and turn any inner whitespace run
# into a single underscore (e.g. " Job Type " -> "job_type"). Names that are
# already snake_case are only trimmed and lowercased.
smp_clean.columns = ["_".join(c.lower().split()) for c in smp_clean.columns]

# Columns that must be present (non-null) for a row to be kept.
key_cols = ["daily_social_media_time", "perceived_productivity_score", "actual_productivity_score"]
existing_key_cols = [c for c in key_cols if c in smp_clean.columns]
missing_key_cols = [c for c in key_cols if c not in smp_clean.columns]

# Report key columns that are absent instead of silently skipping them.
if missing_key_cols:
    print("Key cols not found (skipped):", missing_key_cols)

if existing_key_cols:
    before = len(smp_clean)
    # Drop rows with a missing value in any of the key columns that exist.
    smp_clean = smp_clean.dropna(subset=existing_key_cols)
    print("Dropped rows (missing key cols):", before - len(smp_clean))

display(smp_clean.head())


Dropped rows (missing key cols): 6270


Unnamed: 0,age,gender,job_type,daily_social_media_time,social_platform_preference,number_of_notifications,work_hours_per_day,perceived_productivity_score,actual_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,uses_focus_apps,has_digital_wellbeing_enabled,coffee_consumption_per_day,days_feeling_burnout_per_month,weekly_offline_hours,job_satisfaction_score
0,56,Male,Unemployed,4.18094,Facebook,61,6.753558,8.040464,7.291555,4.0,5.116546,0.419102,8,False,False,4,11,21.927072,6.336688
1,46,Male,Health,3.249603,Twitter,59,9.169296,5.063368,5.165093,7.0,5.103897,0.671519,7,True,True,2,25,0.0,3.412427
6,56,Female,Unemployed,4.38107,TikTok,60,3.902309,6.420989,5.976408,7.0,7.549849,2.252624,4,False,False,4,20,24.084905,5.501373
7,36,Female,Education,4.089168,Twitter,49,6.560467,2.68183,2.446927,4.0,6.325507,0.747998,2,False,False,4,29,8.419648,3.444376
8,40,Female,Education,4.097401,Instagram,57,5.83959,3.219022,3.00424,4.0,,0.0,10,False,True,2,10,0.0,1.960131
