### ***Cleaned Dataset***

In [4]:
from IPython.display import display
import pandas as pd

In [5]:
# Maps each dataset code to the CSV file it is loaded from; the cells below
# iterate over this mapping as (kode, filepath) pairs.
saham_files = {
    'GASOLINE': 'Gasoline.csv',
}

In [6]:
def check_data(saham_files):
    start_dates = []
    end_dates = []
    
    for kode, filepath in saham_files.items():
        data = pd.read_csv(filepath, index_col=0, parse_dates=True)
        start_dates.append(data.index[0])
        end_dates.append(data.index[-1])

    if all(date == start_dates[0] for date in start_dates) and all(date == end_dates[0] for date in end_dates):
        print(f"\033[92mTidak ada perbedaan data, tanggal/bulan/tahun awal dan akhir sama: {start_dates[0].strftime('%Y-%m-%d')} / {end_dates[0].strftime('%Y-%m-%d')}")
        return None
    
    start_end_dates = {}
    i = 0
    for kode, filepath in saham_files.items():
        start_end_dates[kode] = {'Tanggal Awal': start_dates[i].strftime('%Y-%m-%d'), 'Tanggal Akhir': end_dates[i].strftime('%Y-%m-%d')}
        i += 1

    df = pd.DataFrame(start_end_dates).T

    max_start_date = max(pd.to_datetime(df['Tanggal Awal']))
    max_end_date   = max(pd.to_datetime(df['Tanggal Akhir']))

    def color_cell(value):
        color = 'red' if (pd.to_datetime(value) == max_start_date or pd.to_datetime(value) == max_end_date) else ''
        return f'background-color: {color}'

    df = df.style.apply(lambda row: [color_cell(value) for value in row], axis=1)

    return df


In [7]:
# check_data returns None when all date ranges agree (it prints the message
# itself), so the comparison table is displayed only when one is returned.
data_1 = check_data(saham_files)
if data_1 is not None:
    display(data_1)

[92mTidak ada perbedaan data, tanggal/bulan/tahun awal dan akhir sama: 2016-01-01 / 2021-10-29


In [9]:
# Inclusive date window and target frequency for the cleaned series.
start_date = '2016-01-01'
end_date = '2021-10-29'
freq = 'D'

for kode, filepath in saham_files.items():
    data = pd.read_csv(filepath)
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    # Keep only the rows inside the window and the two columns that are used.
    # .copy() makes the result an independent frame instead of a slice of `data`.
    in_window = (data['timestamp'] >= start_date) & (data['timestamp'] <= end_date)
    filtered_data = data.loc[in_window, ['timestamp', 'qty']].copy()
    # No column rename is applied: only 'timestamp' and 'qty' are kept, and
    # 'qty' is written out under its own name.

    # Build a gap-free index between the first and last observed timestamps,
    # then forward-fill the rows that reindexing introduced.
    new_index = pd.date_range(start=filtered_data['timestamp'].min(), end=filtered_data['timestamp'].max(), freq=freq)
    filtered_data = (
        filtered_data
        .set_index('timestamp')
        .reindex(new_index)
        .ffill()
        .astype(int)  # ffill leaves floats behind; values are stored as integers
    )
    filtered_data.index.freq = freq
    print(f"Nama dataset: {kode}, Frekuensi dirubah menjadi: {filtered_data.index.freq}")
    # NOTE(review): the result is written to "<kode>.csv" in the working
    # directory, which is not necessarily the same path as `filepath` —
    # confirm which file later cells are expected to read.
    filtered_data.to_csv(f"{kode}.csv")

Nama dataset: GASOLINE, Frekuensi dirubah menjadi: <Day>


In [10]:
# Repeat the date-range check on the files listed in saham_files; a table is
# displayed only when check_data returns one (i.e. the ranges differ).
data_2 = check_data(saham_files)
if data_2 is not None:
    display(data_2)

[92mTidak ada perbedaan data, tanggal/bulan/tahun awal dan akhir sama: 2016-01-01 / 2021-10-29


In [11]:
# Report how many rows each CSV listed in saham_files contains.
for kode, filepath in saham_files.items():
    data = pd.read_csv(filepath)
    num_rows = len(data)
    print(f"Jumlah baris data untuk {kode}: {num_rows}")

Jumlah baris data untuk GASOLINE: 2129


In [8]:
# Load a single CSV with its first column as a parsed date index and show it.
# NOTE(review): "data/BBCA.csv" is not an entry of the saham_files mapping
# defined in this notebook — confirm this cell still belongs here.
data = pd.read_csv("data/BBCA.csv", index_col=0, parse_dates=True)
data

Unnamed: 0,BBCA
2020-02-03,32200
2020-02-04,33000
2020-02-05,33650
2020-02-06,33700
2020-02-07,33800
...,...
2024-05-30,9000
2024-05-31,9250
2024-06-03,9275
2024-06-04,9350


In [9]:
# Read every CSV listed in saham_files (first column parsed as a date index)
# and join the frames column-wise on that index.
dfs = [
    pd.read_csv(file_path, index_col=0, parse_dates=True)
    for file_path in saham_files.values()
]

dataset = pd.concat(dfs, axis=1)
dataset

Unnamed: 0,BBCA,BBRI,BMRI,BYAN,TPIA,TLKM,BBNI,ASII,ICBP
2020-02-03,32200,4460,7500,15200,8750,3740,7125,6275,11400
2020-02-04,33000,4560,7525,15200,8750,3760,7350,6400,11600
2020-02-05,33650,4560,7700,15200,8725,3770,7425,6400,11500
2020-02-06,33700,4550,7700,15200,8725,3760,7350,6500,11500
2020-02-07,33800,4550,7725,15200,8800,3790,7350,6400,11500
...,...,...,...,...,...,...,...,...,...
2024-05-30,9000,4380,5875,18200,9150,2810,4490,4320,10150
2024-05-31,9250,4340,5900,17975,9175,2900,4400,4290,9750
2024-06-03,9275,4530,6100,18000,9200,2930,4620,4500,10200
2024-06-04,9350,4450,6150,17850,9600,3000,4640,4580,10325


In [10]:
# File name used for the combined dataset.
# NOTE(review): nothing is written to disk in this cell; the only visible
# save is the later dataset.to_csv(f'data/{name}') call, so this message is
# printed before the file has been written.
name = 'top_9_stock.csv'
print(f"Data saham yang digabungkan telah disimpan sebagai {name}")

Data saham yang digabungkan telah disimpan sebagai top_9_stock.csv


In [11]:
# Select every row of `dataset` that has at least one missing value.
rows_with_nan = dataset.isna().any(axis=1)
nan_values = dataset[rows_with_nan]
nan_dataframe = pd.DataFrame(nan_values)

# Report a clean dataset, or show the rows that contain NaN.
if nan_dataframe.empty:
    print("Tidak ditemukan nilai NaN dalam dataset.")
else:
    display(nan_dataframe)

Tidak ditemukan nilai NaN dalam dataset.


In [15]:
# Look up one date in the combined dataset and print its row when present.
get_data = '2024-02-04'
if get_data not in dataset.index:
    print(f"Tidak ditemukan data untuk tanggal {get_data}, kemungkinan adalah hari weekend.")
else:
    data = dataset.loc[get_data]
    print(data.to_frame().T)

Tidak ditemukan data untuk tanggal 2024-02-04, kemungkinan adalah hari weekend.


In [13]:
# Write the combined dataset into the data/ directory under the file name
# stored in `name`.
# NOTE(review): `freq` is only printed here, it is not passed to to_csv. The
# visible assignment in this notebook is freq = 'D', while the recorded
# output shows a different value — confirm on a fresh Restart & Run All.
dataset.to_csv(f'data/{name}')
print(f"Dataset telah disimpan dengan frekuensi: {freq}")

Dataset telah disimpan dengan frekuensi: B


In [14]:
# Display the combined dataset as the cell output.
dataset

Unnamed: 0,BBCA,BBRI,BMRI,BYAN,TPIA,TLKM,BBNI,ASII,ICBP
2020-02-03,32200,4460,7500,15200,8750,3740,7125,6275,11400
2020-02-04,33000,4560,7525,15200,8750,3760,7350,6400,11600
2020-02-05,33650,4560,7700,15200,8725,3770,7425,6400,11500
2020-02-06,33700,4550,7700,15200,8725,3760,7350,6500,11500
2020-02-07,33800,4550,7725,15200,8800,3790,7350,6400,11500
...,...,...,...,...,...,...,...,...,...
2024-05-30,9000,4380,5875,18200,9150,2810,4490,4320,10150
2024-05-31,9250,4340,5900,17975,9175,2900,4400,4290,9750
2024-06-03,9275,4530,6100,18000,9200,2930,4620,4500,10200
2024-06-04,9350,4450,6150,17850,9600,3000,4640,4580,10325
