In [40]:
import pandas as pd
import numpy as np
import zipfile, timeit


In [41]:
# Location of the zipped dataset.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
zip_path = r"C:\Users\Даня\Labubu\individual+household+electric+power+consumption.zip"

with zipfile.ZipFile(zip_path, "r") as z:
    txt_members = [f for f in z.namelist() if f.endswith(".txt")]
    if not txt_members:
        raise FileNotFoundError(f"No .txt member found inside {zip_path}")
    fname = txt_members[0]
    # Open the archive member in its own context manager so the handle is
    # closed deterministically once the CSV has been read.
    with z.open(fname) as fh:
        df = pd.read_csv(
            fh, sep=';', na_values='?',
            dtype={'Date': str, 'Time': str}, low_memory=False
        )

# Combine the separate Date/Time text columns into one timestamp column.
# The nested ``parse_dates={...}`` form of read_csv is deprecated, so the
# parsing is done explicitly here; unparsable rows become NaT and are dropped below.
# NOTE(review): format assumed to be dd/mm/yyyy hh:mm:ss — confirm against the raw file.
timestamps = pd.to_datetime(
    df['Date'] + ' ' + df['Time'],
    format='%d/%m/%Y %H:%M:%S', errors='coerce'
)
df = df.drop(columns=['Date', 'Time'])
df.insert(0, 'DateTime', timestamps)  # keep DateTime as the first column

# Convert every measurement column to numeric (non-numeric values become NaN).
value_cols = df.columns.drop('DateTime')
df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows without a timestamp, then fill gaps from neighbouring readings.
# After ffill + bfill no NaN can remain in a column holding at least one value,
# so an additional mean-based fill would never change anything and is omitted.
df = df.dropna(subset=['DateTime']).ffill().bfill()

# Short column names
df = df.rename(columns={
    'Global_active_power': 'GAP_kW', 'Global_reactive_power': 'GRP_kVAR',
    'Voltage': 'Voltage_V', 'Global_intensity': 'Intensity_A',
    'Sub_metering_1': 'Kitchen_Wh', 'Sub_metering_2': 'Laundry_Wh',
    'Sub_metering_3': 'Water_heater_Wh'
})

  df = pd.read_csv(
  df = pd.read_csv(


In [42]:
def select_gap_gt_5(d):
    """Return the rows of ``d`` whose ``GAP_kW`` value is strictly greater than 5."""
    above_five = d['GAP_kW'].gt(5)
    return d.loc[above_five]

def select_intensity_19_20(d):
    """Return rows with ``Intensity_A`` in [19, 20] where laundry + kitchen exceeds the water heater.

    Both interval bounds are inclusive. Of the rows inside the interval, only
    those where ``Laundry_Wh + Kitchen_Wh`` is strictly greater than
    ``Water_heater_Wh`` are kept.
    """
    intensity = d['Intensity_A']
    in_band = d.loc[(intensity >= 19) & (intensity <= 20)]
    combined = in_band['Laundry_Wh'] + in_band['Kitchen_Wh']
    return in_band.loc[combined > in_band['Water_heater_Wh']]

def sample_mean(d, n=500_000):
    """Return the per-column mean of a random sample of at most ``n`` rows of ``d``.

    Rows are drawn without replacement using a fixed seed (42), so repeated
    calls on the same data give the same result. Only numeric columns are averaged.
    """
    # Never request more rows than the frame holds.
    size = n if n < len(d) else len(d)
    drawn = d.sample(n=size, replace=False, random_state=42)
    return drawn.mean(numeric_only=True)

def select_after18_group2(d):
    """Thin out evening high-load rows where laundry + water heater dominate the kitchen.

    Keeps rows with hour >= 18 and ``GAP_kW`` > 6 where
    ``Laundry_Wh + Water_heater_Wh`` is strictly greater than ``Kitchen_Wh``.
    The first half of those rows contributes every 3rd row, the second half
    every 4th row.
    """
    is_evening = d['DateTime'].dt.hour.ge(18)
    is_heavy = d['GAP_kW'].gt(6)
    subset = d.loc[is_evening & is_heavy]

    group2 = subset['Laundry_Wh'] + subset['Water_heater_Wh']
    dominant = subset.loc[group2 > subset['Kitchen_Wh']]

    midpoint = len(dominant) // 2
    first_part = dominant.iloc[:midpoint].iloc[::3]
    second_part = dominant.iloc[midpoint:].iloc[::4]
    return pd.concat([first_part, second_part])


In [43]:
def normalize_std(d):
    """Append min-max normalised and z-score standardised copies of the numeric columns.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; non-numeric columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        ``d`` with a fresh 0..n-1 index, followed by one ``<col>_norm`` column
        (``(x - min) / (max - min)``) and one ``<col>_std`` column
        (``(x - mean) / std`` with population std, ``ddof=0``) per numeric column.
        A constant column gives NaN in both derived columns (0/0).
    """
    # Reset the index once, up front, so the original columns and the derived
    # columns share the same index. Resetting only the original frame would
    # make concat(axis=1) align on mismatched labels and pad rows with NaN
    # whenever ``d`` has a non-default index (e.g. a filtered selection).
    base = d.reset_index(drop=True)
    num = base.select_dtypes(np.number)

    norm = (num - num.min()) / (num.max() - num.min())
    std = (num - num.mean()) / num.std(ddof=0)

    norm.columns = [f"{c}_norm" for c in num]
    std.columns = [f"{c}_std" for c in num]
    return pd.concat([base, norm, std], axis=1)


In [44]:
def correlations(d):
    """Return Pearson and Spearman correlations between ``GAP_kW`` and ``Intensity_A``.

    The result is a dict with keys ``pearson`` and ``spearman``. Spearman is
    computed as the Pearson correlation of the ranked values.
    """
    gap, intensity = d['GAP_kW'], d['Intensity_A']
    result = {}
    result['pearson'] = gap.corr(intensity, method='pearson')
    # Spearman expressed as the correlation of ranks.
    gap_ranks, intensity_ranks = gap.rank(), intensity.rank()
    result['spearman'] = gap_ranks.corr(intensity_ranks, method='pearson')
    return result

def one_hot_hour(d):
    """Return a copy of ``d`` with an ``hour`` column and one-hot encoded part-of-day columns.

    The hour of ``DateTime`` is bucketed into left-closed intervals
    [0, 6) night, [6, 12) morning, [12, 18) afternoon and [18, 24) evening,
    then expanded into ``hour_bin_<label>`` indicator columns. The input
    frame is not modified.
    """
    edges = [0, 6, 12, 18, 24]
    names = ['night', 'morning', 'afternoon', 'evening']

    out = d.assign(hour=d['DateTime'].dt.hour)
    out['hour_bin'] = pd.cut(out['hour'], bins=edges, labels=names, right=False)
    return pd.get_dummies(out, columns=['hour_bin'])


In [45]:
def profile(f, *a):
    """Time ``f(*a)`` three times, one call per measurement, and return the list of durations."""
    timer = timeit.Timer(lambda: f(*a))
    return timer.repeat(repeat=3, number=1)

In [46]:
# Row selections over the full frame.
sel1 = select_gap_gt_5(df)
sel2 = select_intensity_19_20(df)
sel3 = select_after18_group2(df)

# Column means over a seeded random sample of the full frame.
mean500k = sample_mean(df)

# Feature transforms on leading slices of the frame, plus correlations on all rows.
df_normstd = normalize_std(df.iloc[:10000])
df_encoded = one_hot_hour(df.iloc[:1000])
corrs = correlations(df)

# Report the timing of the first selection, the correlations and a preview of the encoding.
print("Time GAP>5:", profile(select_gap_gt_5, df))
print("Кореляції:", corrs)
print("One-hot приклад:\n", df_encoded.head())

Time GAP>5: [0.013247499999124557, 0.012852600018959492, 0.012608199962414801]
Кореляції: {'pearson': np.float64(0.9988840298467077), 'spearman': np.float64(0.9954260476051768)}
One-hot приклад:
              DateTime  GAP_kW  GRP_kVAR  Voltage_V  Intensity_A  Kitchen_Wh  \
0 2006-12-16 17:24:00   4.216     0.418     234.84         18.4         0.0   
1 2006-12-16 17:25:00   5.360     0.436     233.63         23.0         0.0   
2 2006-12-16 17:26:00   5.374     0.498     233.29         23.0         0.0   
3 2006-12-16 17:27:00   5.388     0.502     233.74         23.0         0.0   
4 2006-12-16 17:28:00   3.666     0.528     235.68         15.8         0.0   

   Laundry_Wh  Water_heater_Wh  hour  hour_bin_night  hour_bin_morning  \
0         1.0             17.0    17           False             False   
1         1.0             16.0    17           False             False   
2         2.0             17.0    17           False             False   
3         1.0             17.0   