In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Relative locations of the CSV inputs. Only TRAIN_PATH is read in the cells
# shown here; the other two are presumably used further down the notebook.
TRAIN_PATH = "../../datasets/train.csv"
TEST_PATH = "../../datasets/test.csv"
SAMPLE_SUBMISSION_PATH = "../../datasets/sample_submission.csv"

In [6]:
# Load the raw training table exactly as stored on disk (no cleaning yet).
train = pd.read_csv(TRAIN_PATH)
# Bare last expression: the notebook renders the first rows as the cell output.
train.head()

Unnamed: 0,datetime,datetime_iso,time-zone,temp,visibility,d_point,feels,min_temp,max_temp,prssr,sea_level,grnd_level,hum,wind_spd,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds
0,283996800,1979-01-01 00:00:00+00:00,28800,24.75 Celcius,,23.89 C,25.76 C,24.28,25.22°C,1012,undetermined,,95,0.82,320.0 °,zero,0,,,100
1,284000400,1979-01-01 01:00:00+00:00,28800,24.58 C,,23.73 C,25.57 C,23.99 C,25.26 C,1012,,,95,0.96 m/s,338.0°,0,0,0.0,0.0,100
2,284004000,1979-01-01 02:00:00+00:00,28800,26.6 Celcius,unidentified,24.06 C,26.6 C,26.1 C,27.39,1012,,undetermined,86,1.22 m/s,339.0°,0,volume:zero,,,99
3,284007600,1979-01-01 03:00:00+00:00,28800,27.31 Celcius,,24.37 C,30.9 C,26.59,28.36 C,1012,,undetermined,84,1.08 m/s,342,0.13,nol,0.0,,94
4,284011200,1979-01-01 04:00:00+00:00,28800,27.41,,25.05 C,31.54 C,26.58 C,28.31 °C,1011,,undetermined,87,0.86,336.0°,0.34,nol,,0.0,100


In [7]:
# Columns that carry a temperature reading with an optional unit suffix.
_TEMPERATURE_COLUMNS = ("temp", "d_point", "feels", "min_temp", "max_temp")

# Unit spellings stripped from the temperature columns (patterns are regexes).
_TEMPERATURE_UNITS = {
    "Celcius": "",
    "C": "",
    "°": "",
}

# Placeholder stored in `visibility` when no usable reading exists.
_UNKNOWN_VISIBILITY = "gatau"

# Regex -> replacement pairs for `visibility`. Order matters: the bare "m"
# unit pattern must stay last so it never runs before the longer tokens.
_VISIBILITY_TOKENS = {
    "-1km": "-1000",
    "-1 km": "-1000",
    "unrecognized": "gatau",
    "unknown": "gatau",
    "unidentified": "gatau",
    "undefined": "gatau",
    "missing": "gatau",
    "empty": "gatau",
    "m": "",
}


def _precipitation_tokens(kind):
    """Return the regex -> replacement map for a ``rain``/``snow`` volume column."""
    # Key order is significant: "volume:zero" has to be handled before "zero".
    return {
        "milimeter": "",
        "mm": "",
        f"no-{kind}": "0",
        "volume:0": "0",
        "nol": "0",
        f"no_{kind}": "0",
        "volume:zero": "0",
        "zero": "0",
    }


def _to_float(column, replacements):
    """Apply regex ``replacements`` to a text column and convert it to float."""
    # A column with no textual noise (or nothing but NaN) is already numeric
    # after read_csv and has no ``.str`` accessor, so convert it directly.
    if pd.api.types.is_numeric_dtype(column):
        return column.astype("float")
    return column.replace(replacements, regex=True).str.strip().astype("float")


def _clean_visibility(column):
    """Normalise ``visibility`` tokens; missing or blank entries become 'gatau'."""
    if pd.api.types.is_numeric_dtype(column):
        # Nothing to strip: keep the parsed numbers, only mark the gaps.
        return column.astype("object").where(column.notna(), _UNKNOWN_VISIBILITY)
    return (
        column.replace(_VISIBILITY_TOKENS, regex=True)
        .str.strip()
        .fillna(_UNKNOWN_VISIBILITY)
        .replace({"": _UNKNOWN_VISIBILITY})
    )


def preprocess(df):
    """Clean the raw weather table in place and return it.

    The frame passed in is modified directly (columns are overwritten, added
    and dropped) and the very same object is returned, so calling this twice
    on the same frame raises ``KeyError`` because the raw columns are gone.

    Steps:
      * derive ``year``, ``month``, ``day`` and ``hour`` from ``datetime_iso``
        (taken from the timestamp as parsed; the ``time-zone`` column is not
        applied to it);
      * strip unit suffixes / textual zero markers and cast the temperature,
        pressure, humidity, wind, ``rain_3h``, ``snow_1h``, ``snow_3h`` and
        ``clouds`` columns to float;
      * normalise ``visibility`` while keeping it non-numeric, using the
        ``"gatau"`` placeholder for unknown values;
      * add ``musim``: 1 for months 6-9 (dry season), 0 otherwise (rainy
        season);
      * drop ``datetime``, ``datetime_iso``, ``time-zone``, ``sea_level`` and
        ``grnd_level``.

    ``rain_1h`` is left untouched.
    NOTE(review): presumably it is the prediction target and is handled
    elsewhere - confirm.

    Args:
        df: DataFrame holding the raw columns listed above.

    Returns:
        The same DataFrame object, cleaned.

    Raises:
        KeyError: if an expected column is missing.
        ValueError: if a value still is not numeric after the unit cleanup.
    """
    # Calendar features.
    stamps = pd.to_datetime(df["datetime_iso"])
    df["year"] = stamps.dt.year
    df["month"] = stamps.dt.month
    df["day"] = stamps.dt.day
    df["hour"] = stamps.dt.hour

    # Temperatures: "24.75 Celcius", "23.89 C", "25.22°C" -> float.
    for name in _TEMPERATURE_COLUMNS:
        df[name] = _to_float(df[name], _TEMPERATURE_UNITS)

    df["visibility"] = _clean_visibility(df["visibility"])

    # Pressure. The whitespace pattern must not swallow the character that
    # follows it, otherwise a leading blank would eat the first digit.
    df["prssr"] = _to_float(df["prssr"], {"hPa": "", r"\s+": ""})

    df["hum"] = _to_float(df["hum"], {"%": ""})
    df["wind_spd"] = _to_float(df["wind_spd"], {"m/s": ""})
    df["wind_deg"] = _to_float(df["wind_deg"], {"°": ""})

    # Precipitation volumes: 0 means none fell, -1 means no information.
    df["rain_3h"] = _to_float(df["rain_3h"], _precipitation_tokens("rain"))
    for name in ("snow_1h", "snow_3h"):
        df[name] = _to_float(df[name], _precipitation_tokens("snow"))

    df["clouds"] = _to_float(df["clouds"], {"%": ""})

    # Season flag: 0 = rainy season, 1 = dry season (June through September).
    df["musim"] = df["month"].astype("int").between(6, 9).astype("int64")

    # sea_level / grnd_level carry no usable readings (everything is missing;
    # "-1 hPa" probably means "no data"), so they are dropped for now together
    # with the raw time columns that have been replaced by the features above.
    df.drop(
        columns=["datetime", "datetime_iso", "time-zone", "sea_level", "grnd_level"],
        inplace=True,
    )

    return df

# preprocess() edits the frame it is handed, so passing the already-cleaned
# `train` back in on a second run of this cell fails with KeyError (the raw
# columns are gone). Start from a fresh read of the raw CSV instead so the
# cell can be re-run safely.
train = preprocess(pd.read_csv(TRAIN_PATH))