<a href="https://colab.research.google.com/github/fbrii12/DataCleansingSteam/blob/main/2318068DataCleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Colab-only helper module that exposes the user's Google Drive to the notebook.
from google.colab import drive
# Mount the Drive under /content/drive so its files can be opened by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the raw Steam CSV inside the mounted Google Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/steam.csv"
# Load the raw, not-yet-cleaned data into the working DataFrame `df`.
df = pd.read_csv(file_path)

# 2. Dataset yang Kotor

In [None]:
# Show the first rows of the raw data under a heading line.
dirty_preview = df.head()
print("Preview Dataset Kotor:", dirty_preview, sep="\n")

Preview Dataset Kotor:
   Unnamed: 0  appid                       name         developer publisher  \
0           0     10             Counter-Strike             Valve     Valve   
1           1     20      Team Fortress Classic             Valve     Valve   
2           2     30              Day of Defeat             Valve     Valve   
3           3     40         Deathmatch Classic             Valve     Valve   
4           4     50  Half-Life: Opposing Force  Gearbox Software     Valve   

   score_rank  positive  negative  userscore                    owners  \
0         NaN    243818      6427          0  10,000,000 .. 20,000,000   
1         NaN      7602      1136          0    1,000,000 .. 2,000,000   
2         NaN      6414       688          0   5,000,000 .. 10,000,000   
3         NaN      2618       545          0   5,000,000 .. 10,000,000   
4         NaN     24363      1198          0    2,000,000 .. 5,000,000   

   average_forever  average_2weeks  median_forever  media

# 3. Mengecek Missing Values

In [None]:
# Count the missing entries in every column of the raw data.
missing_per_column = df.isna().sum()
print("\nCek Missing Value:", missing_per_column, sep="\n")


Cek Missing Value:
Unnamed: 0             0
appid                  0
name                  18
developer            311
publisher            614
score_rank         86491
positive               0
negative               0
userscore              0
owners                 0
average_forever        0
average_2weeks         0
median_forever         0
median_2weeks          0
price                 28
initialprice          21
discount              21
ccu                    0
dtype: int64


# 4. Perbaikan Missing Value

In [None]:
# Impute missing values column by column:
#   * numeric columns     -> median of the column
#   * every other column  -> most frequent value (mode)
# NOTE(review): the missing-value report shows score_rank empty in nearly
# every row, so imputing it yields an almost constant column; consider
# dropping that column instead of filling it.
for col in df.columns:
    values = df[col]
    if not values.isna().any():
        continue  # nothing to fill in this column

    # Use pandas' dtype predicate instead of comparing against the strings
    # "int64"/"float64", so int32, float32 and nullable numeric columns are
    # also median-filled. Booleans are excluded: a median is meaningless there.
    is_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if is_numeric:
        fill_value = values.median()
    else:
        modes = values.mode()
        # mode() is empty when the whole column is missing; indexing [0]
        # would raise, so leave such a column untouched.
        if modes.empty:
            continue
        fill_value = modes.iloc[0]

    if pd.isna(fill_value):
        continue  # all-missing numeric column: no median to fill with
    df[col] = values.fillna(fill_value)

# 5. Cek ulang perbaikan Missing Value

In [None]:
# Re-count missing entries to confirm the imputation step left none behind.
remaining_missing = df.isna().sum()
print("\nCek Ulang Missing Value:", remaining_missing, sep="\n")


Cek Ulang Missing Value:
Unnamed: 0         0
appid              0
name               0
developer          0
publisher          0
score_rank         0
positive           0
negative           0
userscore          0
owners             0
average_forever    0
average_2weeks     0
median_forever     0
median_2weeks      0
price              0
initialprice       0
discount           0
ccu                0
dtype: int64


# 6. Cek dan Hapus Duplikasi

In [None]:
# "Unnamed: 0" holds a distinct value on every row (it appears to be the row
# number written out when the CSV was exported). Comparing it along with the
# real fields makes every row look unique, so duplicates are never found.
# Compare on all other columns instead.
dedup_cols = [c for c in df.columns if c != "Unnamed: 0"]
# `or None` falls back to "all columns" if the frame has no other column.
print("\nJumlah data duplikat:", df.duplicated(subset=dedup_cols or None).sum())
df = df.drop_duplicates(subset=dedup_cols or None)


Jumlah data duplikat: 0


# 7. Cek Outlier dengan metode IQR

In [None]:
def detect_outliers_iqr(data, col, factor=1.5):
    """Return the rows of ``data`` whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.
    col : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Width of the fences in multiples of the IQR. The default reproduces
        the conventional 1.5 * IQR rule; larger values flag fewer rows.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original index kept) that falls outside the
        fences. Rows where ``col`` is missing are never flagged, because
        comparisons with NaN are False.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return data[(data[col] < lower) | (data[col] > upper)]

# Report, for every int64/float64 column, how many rows fall outside the IQR fences.
numeric_columns = df.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_columns:
    flagged_rows = detect_outliers_iqr(df, col)
    print(f"Outlier di kolom {col}: {len(flagged_rows)}")

Outlier di kolom Unnamed: 0: 0
Outlier di kolom appid: 0
Outlier di kolom score_rank: 23
Outlier di kolom positive: 14237
Outlier di kolom negative: 13552
Outlier di kolom userscore: 47
Outlier di kolom average_forever: 16228
Outlier di kolom average_2weeks: 3213
Outlier di kolom median_forever: 17099
Outlier di kolom median_2weeks: 3213
Outlier di kolom price: 4055
Outlier di kolom initialprice: 5042
Outlier di kolom discount: 7947
Outlier di kolom ccu: 19229


# 8. Standarisasi Format Data

In [None]:
# Normalise every text column: trim surrounding whitespace and lower-case.
for col in df.select_dtypes(include=["object"]).columns:
    original = df[col]
    normalized = original.astype(str).str.strip().str.lower()
    # astype(str) turns a missing value into the literal text "nan", which
    # isnull() no longer detects. Keep genuinely missing entries missing.
    df[col] = normalized.where(original.notna(), original)

In [None]:
# Show a sample of the `name` column after the text normalisation step.
name_sample = df['name'].head(10)
print("=== Contoh kolom name setelah standarisasi ===", name_sample, sep="\n")

=== Contoh kolom name setelah standarisasi ===
0                    counter-strike
1             team fortress classic
2                     day of defeat
3                deathmatch classic
4         half-life: opposing force
5                          ricochet
6                         half-life
7    counter-strike: condition zero
8    counter-strike: condition zero
9             half-life: blue shift
Name: name, dtype: object


# 9. Cek Nilai Tidak Valid (contoh: umur tidak boleh negatif)

In [None]:
# Columns in which a negative number would be an invalid value.
# "umur" is kept from the original check, but this dataset has no such
# column, so on its own that check never ran. The remaining names are this
# dataset's own numeric columns.
# NOTE(review): confirm against the data dictionary that none of these
# columns may legitimately be negative.
non_negative_cols = [
    "umur",
    "positive",
    "negative",
    "userscore",
    "average_forever",
    "average_2weeks",
    "median_forever",
    "median_2weeks",
    "price",
    "initialprice",
    "discount",
    "ccu",
]

for col in non_negative_cols:
    # Skip absent columns and non-numeric ones (comparing text with 0 raises).
    if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
        continue
    invalid = df[df[col] < 0]
    print(f"\nNilai tidak valid di kolom {col}: {len(invalid)} baris")
    if not invalid.empty:
        # Show only a sample so a large number of bad rows does not flood the output.
        print(invalid.head())

# 10. Cek Kolom Tidak Relevan


In [None]:
# Drop pure row-identifier columns that carry no analytical information.
# The dataset has no "id" column; its row-number column is "Unnamed: 0"
# (distinct on every row, apparently the index written out with the CSV).
# errors="ignore" keeps the cell safe to re-run and on data without them.
row_id_cols = ["id", "Unnamed: 0"]
df = df.drop(columns=row_id_cols, errors="ignore")

In [None]:
# Number of distinct values in each column (missing values are not counted).
distinct_per_column = df.nunique()
print("=== Jumlah nilai unik per kolom ===", distinct_per_column, sep="\n")

=== Jumlah nilai unik per kolom ===
Unnamed: 0         86538
appid              82500
name               81705
developer          51707
publisher          45264
score_rank             3
positive            5674
negative            2818
userscore             34
owners                13
average_forever     3083
average_2weeks       914
median_forever      2550
median_2weeks        908
price                624
initialprice         357
discount              85
ccu                 1104
dtype: int64


In [None]:
# Collect the columns that are candidates for removal as irrelevant.
irrelevant_cols = []

# A column holding a single distinct value cannot help tell rows apart.
distinct_counts = df.nunique()
single_value_cols = [name for name, count in distinct_counts.items() if count == 1]
irrelevant_cols += single_value_cols

In [None]:
# Remove the columns flagged as irrelevant; names that are absent are skipped.
cols_to_drop = set(irrelevant_cols)
df = df.drop(columns=cols_to_drop, errors='ignore')

# Report which columns are left after the removal.
remaining_columns = list(df.columns)
print("\n=== Setelah perbaikan ===")
print("Kolom dataset sekarang:", remaining_columns)


=== Setelah perbaikan ===
Kolom dataset sekarang: ['Unnamed: 0', 'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive', 'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks', 'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount', 'ccu']


# 11. Dataset Setelah Cleansing

In [None]:
# Show the first rows of the data after all cleansing steps.
clean_preview = df.head()
print("\nDataset Bersih (preview):", clean_preview, sep="\n")


Dataset Bersih (preview):
   Unnamed: 0  appid                       name         developer publisher  \
0           0     10             counter-strike             valve     valve   
1           1     20      team fortress classic             valve     valve   
2           2     30              day of defeat             valve     valve   
3           3     40         deathmatch classic             valve     valve   
4           4     50  half-life: opposing force  gearbox software     valve   

   score_rank  positive  negative  userscore                    owners  \
0        99.0    243818      6427          0  10,000,000 .. 20,000,000   
1        99.0      7602      1136          0    1,000,000 .. 2,000,000   
2        99.0      6414       688          0   5,000,000 .. 10,000,000   
3        99.0      2618       545          0   5,000,000 .. 10,000,000   
4        99.0     24363      1198          0    2,000,000 .. 5,000,000   

   average_forever  average_2weeks  median_forever  m

In [None]:
# Persist the cleaned data. The notebook's conclusion states that the result
# is stored as 'steam_clean.csv' on Google Drive, but no cell wrote it.
# It is written next to the raw steam.csv; index=False keeps the pandas row
# index from being exported as an extra unnamed column.
clean_path = "/content/drive/MyDrive/Colab Notebooks/steam_clean.csv"
df.to_csv(clean_path, index=False)
print("\nCleansing selesai. Dataset steam.csv siap digunakan.")


Cleansing selesai. Dataset steam.csv siap digunakan.


# 12. Kesimpulan


In [None]:
# 12. Conclusion: print the summary of the cleansing steps as one text block.
conclusion_lines = (
    "\n=== KESIMPULAN ===",
    "1. Dataset 'steam.csv' berhasil dibaca sebagai dataset kotor.",
    "2. Missing value berhasil diperbaiki dengan metode median (numerik) dan modus (kategorikal).",
    "3. Duplikasi data berhasil dihapus sehingga dataset lebih bersih.",
    "4. Outlier terdeteksi pada beberapa kolom numerik (cek hasil sebelumnya untuk detail).",
    "5. Format data pada kolom string sudah distandarisasi (huruf kecil & tanpa spasi berlebih).",
    "6. Nilai tidak valid (misalnya umur negatif) berhasil teridentifikasi.",
    "7. Kolom yang tidak relevan (misalnya ID) dihapus agar dataset lebih ringkas.",
    "8. Dataset akhir disimpan dengan nama 'steam_clean.csv' di Google Drive.",
    "=> Dataset kini sudah siap digunakan untuk analisis lebih lanjut atau pemodelan machine learning.",
)
print("\n".join(conclusion_lines))


=== KESIMPULAN ===
1. Dataset 'steam.csv' berhasil dibaca sebagai dataset kotor.
2. Missing value berhasil diperbaiki dengan metode median (numerik) dan modus (kategorikal).
3. Duplikasi data berhasil dihapus sehingga dataset lebih bersih.
4. Outlier terdeteksi pada beberapa kolom numerik (cek hasil sebelumnya untuk detail).
5. Format data pada kolom string sudah distandarisasi (huruf kecil & tanpa spasi berlebih).
6. Nilai tidak valid (misalnya umur negatif) berhasil teridentifikasi.
7. Kolom yang tidak relevan (misalnya ID) dihapus agar dataset lebih ringkas.
8. Dataset akhir disimpan dengan nama 'steam_clean.csv' di Google Drive.
=> Dataset kini sudah siap digunakan untuk analisis lebih lanjut atau pemodelan machine learning.
