In [41]:
import pandas as pd
import numpy as np

Menggunakan pustaka pandas (pd) untuk membaca file CSV dengan nama 'north.csv' dan memuat data dari file tersebut ke dalam sebuah DataFrame yang disimpan dalam variabel 'df'.

In [42]:
df = pd.read_csv('north_20000.csv')

Menyimpan Dataframe pandas (df) ke dalam format berkas HDF (Hierarchical Data Format) dengan nama berkas 'weather.hdf'. Data dari Dataframe akan disimpan dalam grup dengan nama 'mydata' di dalam berkas HDF. Mode 'w' digunakan untuk menulis (write) data ke dalam berkas HDF, sehingga kode ini akan membuat atau menggantikan berkas 'weather.hdf' jika sudah ada

In [43]:
df.to_hdf('weather.hdf', 'mydata', mode='w')

In [44]:
df = pd.read_hdf('weather.hdf', 'mydata')

Mengganti nama kolom dalam DataFrame 'df' dengan nama-nama kolom yang tercantum dalam daftar 'new_column_names'.

In [45]:
new_column_names = [
    'index', 'date', 'hr', 'prcp', 'stp', 'smax', 'smin', 'gbrd', 'temp', 'dewp',
    'tmax', 'tmin', 'dmax', 'dmin', 'hmax', 'hmin', 'hmdy', 'wdct', 'gust',
    'wdsp', 'region', 'prov', 'wsnm', 'inme', 'lat', 'lon', 'elvt'
]

df.columns = new_column_names

Mengubah kolom 'date' dalam DataFrame 'df' menjadi tipe data datetime. Dengan menggunakan fungsi pd.to_datetime(), tanggal dalam kolom 'date' akan diubah menjadi format datetime.

In [46]:
df["date"]= pd.to_datetime(df["date"])

Mengubah kolom 'hr' dalam DataFrame 'df' menjadi jam dengan menggunakan informasi waktu yang disediakan dalam format 'HH:MM'. Fungsi pd.to_datetime() dengan argumen format='%H:%M' mengkonversi string waktu dalam kolom 'hr' ke tipe data datetime, dan kemudian .dt.hour mengambil hanya komponen jam dari setiap tanggal dan waktunya. Hasilnya adalah kolom 'hr' yang berisi nilai jam dalam format numerik.

In [47]:
df['hr'] = pd.to_datetime(df['hr'], format='%H:%M').dt.hour

In [48]:
display(df)

Unnamed: 0,index,date,hr,prcp,stp,smax,smin,gbrd,temp,dewp,...,wdct,gust,wdsp,region,prov,wsnm,inme,lat,lon,elvt
0,0,2000-05-09,0,-9999.0,-9999.0,-9999.0,-9999.0,-9999,-9999.0,-9999.0,...,-9999,-9999.0,-9999.0,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
1,1,2000-05-09,1,-9999.0,-9999.0,-9999.0,-9999.0,-9999,-9999.0,-9999.0,...,-9999,-9999.0,-9999.0,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
2,2,2000-05-09,2,-9999.0,-9999.0,-9999.0,-9999.0,-9999,-9999.0,-9999.0,...,-9999,-9999.0,-9999.0,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
3,3,2000-05-09,3,-9999.0,-9999.0,-9999.0,-9999.0,-9999,-9999.0,-9999.0,...,-9999,-9999.0,-9999.0,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
4,4,2000-05-09,4,-9999.0,-9999.0,-9999.0,-9999.0,-9999,-9999.0,-9999.0,...,-9999,-9999.0,-9999.0,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,5546,2002-08-20,2,0.0,1004.6,1004.6,1004.2,-9999,25.9,24.8,...,183,1.5,0.2,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
19995,5547,2002-08-20,3,0.0,1004.9,1005.0,1004.7,-9999,25.7,24.8,...,343,0.9,0.0,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
19996,5548,2002-08-20,4,0.0,1004.5,1004.9,1004.5,-9999,26.6,24.3,...,35,2.6,0.9,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
19997,5549,2002-08-20,5,0.0,1003.8,1004.5,1003.8,-9999,26.1,22.1,...,63,3.0,0.5,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25


### Nilai yang hilang/rusak

In [49]:
df.shape

(19999, 27)

In [50]:
(df == -9999).astype(int).sum(axis=0)

index         0
date          0
hr            0
prcp       4730
stp        4740
smax       5194
smin       5194
gbrd      12005
temp       4730
dewp       4818
tmax       5192
tmin       5192
dmax       5281
dmin       5281
hmax       5280
hmin       5280
hmdy       4818
wdct       4790
gust       4904
wdsp       4743
region        0
prov          0
wsnm          0
inme          0
lat           0
lon           0
elvt          0
dtype: int64

Bisa dilihat bahwa kolom 'gbrd' memiliki data rusak (berisi nilai -9999) yang sangat besar bahkan mencapai lebih dari 50% dari total data. Maka dari itu, mengimputasi data hilang dengan cara apa pun tampaknya menjadi kurang signifikan dan tingkat akurasi menjadi rendah. Oleh karena itu, sebaiknya kolom tersebut di hapus saja.

In [51]:
df = df.drop('gbrd', axis=1)

### Menandai nilai yang hilang

In [52]:
from numpy import nan

col = ['prcp', 'stp', 'smax', 'smin', 'temp', 'dewp', 'tmax', 'tmin',
       'dmax', 'dmin', 'hmax', 'hmin', 'hmdy', 'wdct', 'gust', 'wdsp', 'elvt']

df[col] = df[col].replace(-9999, nan)

In [53]:
df.isnull().sum(axis = 0)

index        0
date         0
hr           0
prcp      4730
stp       4740
smax      5194
smin      5194
temp      4730
dewp      4818
tmax      5192
tmin      5192
dmax      5281
dmin      5281
hmax      5280
hmin      5280
hmdy      4818
wdct      4790
gust      4904
wdsp      4743
region       0
prov         0
wsnm         0
inme         0
lat          0
lon          0
elvt         0
dtype: int64

### Identifikasi pencilan dengan Standar Deviasi

In [54]:
import numpy as np

# Fungsi untuk mengidentifikasi dan menghitung outlier dalam satu kolom berdasarkan deviasi standar
def count_outliers_std(column, std_threshold):
    mean = column.mean()
    std = column.std()
    outliers = column[(column < mean - std_threshold * std) | (column > mean + std_threshold * std)]
    return len(outliers)

# Nilai ambang deviasi standar yang akan digunakan untuk mengidentifikasi outlier
std_threshold = 3  # Anda dapat mengganti angka ini sesuai dengan ambang yang diinginkan

# Iterasi melalui setiap kolom dalam DataFrame
for column in df.columns[np.r_[3:19, 23:26]]:
    num_outliers = count_outliers_std(df[column], std_threshold)
    print(f"Identified outliers in {column}: {num_outliers}")

# Tidak ada penghapusan data outlier di sini

df.head()  # DataFrame tidak diubah


Identified outliers in prcp: 148
Identified outliers in stp: 45
Identified outliers in smax: 62
Identified outliers in smin: 57
Identified outliers in temp: 4
Identified outliers in dewp: 169
Identified outliers in tmax: 1
Identified outliers in tmin: 13
Identified outliers in dmax: 148
Identified outliers in dmin: 190
Identified outliers in hmax: 88
Identified outliers in hmin: 8
Identified outliers in hmdy: 35
Identified outliers in wdct: 0
Identified outliers in gust: 94
Identified outliers in wdsp: 81
Identified outliers in lat: 0
Identified outliers in lon: 0
Identified outliers in elvt: 0


Unnamed: 0,index,date,hr,prcp,stp,smax,smin,temp,dewp,tmax,...,wdct,gust,wdsp,region,prov,wsnm,inme,lat,lon,elvt
0,0,2000-05-09,0,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
1,1,2000-05-09,1,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
2,2,2000-05-09,2,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
3,3,2000-05-09,3,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
4,4,2000-05-09,4,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25


### Menghapus data pencilan dengan NaN

In [55]:
import numpy as np

# Fungsi untuk mengidentifikasi dan mengisi outlier dengan NaN dalam satu kolom berdasarkan deviasi standar
def replace_outliers_with_nan(column, std_threshold):
    mean = column.mean()
    std = column.std()
    lower_bound = mean - std_threshold * std
    upper_bound = mean + std_threshold * std
    column.loc[(column < lower_bound) | (column > upper_bound)] = np.nan
    return column

# Nilai ambang deviasi standar yang akan digunakan untuk mengidentifikasi outlier
std_threshold = 3  # Anda dapat mengganti angka ini sesuai dengan ambang yang diinginkan

# Iterasi melalui setiap kolom dalam DataFrame
for column in df.columns[np.r_[3:19, 23:26]]:
    df[column] = replace_outliers_with_nan(df[column], std_threshold)

df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  column.loc[(column < lower_bound) | (column > upper_bound)] = np.nan


Unnamed: 0,index,date,hr,prcp,stp,smax,smin,temp,dewp,tmax,...,wdct,gust,wdsp,region,prov,wsnm,inme,lat,lon,elvt
0,0,2000-05-09,0,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
1,1,2000-05-09,1,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
2,2,2000-05-09,2,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
3,3,2000-05-09,3,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
4,4,2000-05-09,4,,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25


In [56]:
df.isnull().sum(axis = 0)

index        0
date         0
hr           0
prcp      4878
stp       4785
smax      5256
smin      5251
temp      4734
dewp      4987
tmax      5193
tmin      5205
dmax      5429
dmin      5471
hmax      5368
hmin      5288
hmdy      4853
wdct      4790
gust      4998
wdsp      4824
region       0
prov         0
wsnm         0
inme         0
lat          0
lon          0
elvt         0
dtype: int64

### Iterative Imputation

In [57]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Split into input and output elements
data = df.values
ix = [i for i in range(data.shape[1]) if i != data.shape[1] - 1]
X, y = data[:, ix], data[:, data.shape[1] - 1]

columns_with_missing = ['prcp', 'stp', 'smax', 'smin', 'temp', 'dewp', 'tmax', 'tmin',
       'dmax', 'dmin', 'hmax', 'hmin', 'hmdy', 'wdct', 'gust', 'wdsp', 'elvt']

# Count missing values in each column
missing_counts = df[columns_with_missing].isna().sum()

# Total missing values
total_missing = missing_counts.sum()
print(f'Total Missing: {total_missing}')

# Define the imputer (use IterativeImputer)
imputer = IterativeImputer()

# Fit and transform the dataset
df[columns_with_missing] = imputer.fit_transform(df[columns_with_missing])

# Count missing values after imputation
missing_counts_after_imputation = df[columns_with_missing].isna().sum()
total_missing_after_imputation = missing_counts_after_imputation.sum()
print(f'Total Missing after Imputation: {total_missing_after_imputation}')


Total Missing: 81310
Total Missing after Imputation: 0




In [58]:
display(df)

Unnamed: 0,index,date,hr,prcp,stp,smax,smin,temp,dewp,tmax,...,wdct,gust,wdsp,region,prov,wsnm,inme,lat,lon,elvt
0,0,2000-05-09,0,0.085053,1004.031362,1004.359689,1003.726771,26.775574,22.982606,27.299195,...,153.981376,3.307033,1.146172,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
1,1,2000-05-09,1,0.085053,1004.031362,1004.359689,1003.726771,26.775574,22.982606,27.299195,...,153.981376,3.307033,1.146172,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
2,2,2000-05-09,2,0.085053,1004.031362,1004.359689,1003.726771,26.775574,22.982606,27.299195,...,153.981376,3.307033,1.146172,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
3,3,2000-05-09,3,0.085053,1004.031362,1004.359689,1003.726771,26.775574,22.982606,27.299195,...,153.981376,3.307033,1.146172,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
4,4,2000-05-09,4,0.085053,1004.031362,1004.359689,1003.726771,26.775574,22.982606,27.299195,...,153.981376,3.307033,1.146172,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,5546,2002-08-20,2,0.000000,1004.600000,1004.600000,1004.200000,25.900000,24.800000,26.300000,...,183.000000,1.500000,0.200000,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
19995,5547,2002-08-20,3,0.000000,1004.900000,1005.000000,1004.700000,25.700000,24.800000,25.900000,...,343.000000,0.900000,0.000000,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
19996,5548,2002-08-20,4,0.000000,1004.500000,1004.900000,1004.500000,26.600000,24.300000,26.600000,...,35.000000,2.600000,0.900000,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
19997,5549,2002-08-20,5,0.000000,1003.800000,1004.500000,1003.800000,26.100000,22.100000,26.700000,...,63.000000,3.000000,0.500000,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25


### Imputasi KNN

In [59]:
from numpy import isnan
import pandas as pd
from sklearn.impute import KNNImputer

# Split into input and output elements
data = df.values
ix = [i for i in range(data.shape[1]) if i != data.shape[1] - 1]
X, y = data[:, ix], data[:, data.shape[1] - 1]


columns_with_missing = ['prcp', 'stp', 'smax', 'smin', 'temp', 'dewp', 'tmax', 'tmin',
       'dmax', 'dmin', 'hmax', 'hmin', 'hmdy', 'wdct', 'gust', 'wdsp', 'elvt']

# Count missing values in each column
missing_counts = df[columns_with_missing].isna().sum()

# Total missing values
total_missing = missing_counts.sum()
print(f'Total Missing: {total_missing}')

# Define the imputer
imputer = KNNImputer()

# Fit on the dataset
imputer.fit(df[columns_with_missing])

# Transform the dataset
df[columns_with_missing] = imputer.transform(df[columns_with_missing])

# Count missing values after imputation
missing_counts_after_imputation = df[columns_with_missing].isna().sum()
total_missing_after_imputation = missing_counts_after_imputation.sum()
print(f'Total Missing after Imputation: {total_missing_after_imputation}')

Total Missing: 0
Total Missing after Imputation: 0


### Statistical Imputation (mean)

In [60]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Split into input and output elements
data = df.values
ix = [i for i in range(data.shape[1]) if i != data.shape[1] - 1]
X, y = data[:, ix], data[:, data.shape[1] - 1]

columns_with_missing = ['prcp', 'stp', 'smax', 'smin', 'temp', 'dewp', 'tmax', 'tmin',
       'dmax', 'dmin', 'hmax', 'hmin', 'hmdy', 'wdct', 'gust', 'wdsp', 'elvt']

# Count missing values in each column
missing_counts = df[columns_with_missing].isna().sum()

# Total missing values
total_missing = missing_counts.sum()
print(f'Total Missing: {total_missing}')

# Define the imputer (use IterativeImputer)
imputer = SimpleImputer(strategy='mean')

# Fit and transform the dataset
df[columns_with_missing] = imputer.fit_transform(df[columns_with_missing])

# Count missing values after imputation
missing_counts_after_imputation = df[columns_with_missing].isna().sum()
total_missing_after_imputation = missing_counts_after_imputation.sum()
print(f'Total Missing after Imputation: {total_missing_after_imputation}')


Total Missing: 0
Total Missing after Imputation: 0
