### 1. Import Libraries

In [None]:
import pandas as pd

In [None]:
# Step 1: mount Google Drive at /content/drive so that later cells can read
# files stored under it (the dataset path below points into this mount).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Membaca Dataset Awal

In [None]:
# Load the raw movie dataset into a DataFrame.
# Adjust file_path to match where the CSV lives in your own Colab/Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/indonesian_movies.csv"
df = pd.read_csv(file_path)

In [None]:
# Preview the raw (uncleaned) data: the first 5 rows, then the column/dtype summary.
print("Data Kotor Awal:")
print(df.head())
print("\nInfo data Kotor Awal:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Data Kotor Awal:
   movie_id                     title  year  \
0    100001       #FriendButMarried 2  2020   
1    100002                  4 Mantan  2020   
2    100003  Aku Tahu Kapan Kamu Mati  2020   
3    100004               Anak Garuda  2020   
4    100005                 Dignitate  2020   

                                         description      genre rating  \
0  Ayudia (Mawar De Jongh) is not satisfied enoug...  Biography    13+   
1  Sara, Airin, Rachel, and Amara were accidental...   Thriller    17+   
2  After apparent death, Siena is able to see sig...     Horror    13+   
3  Good Morning Indonesia, a school for poor orph...  Adventure    13+   
4  Alfi (Al Ghazali) meets Alana (Caitlin Halderm...      Drama    17+   

   users_rating votes   languages          directors  \
0           6.5   120  Indonesian      Rako Prijanto   
1           6.4     8  Indonesian      Hanny Saputra   
2           5.4    17  Indonesian  Hadrah Daeng Ratu   
3           9.1    27  Indonesi

## 3. Data Cleansing: Standardisasi Kolom languages

In [None]:
# Normalise the languages column: trim surrounding whitespace, then Title Case each value.
languages_clean = df['languages'].str.strip().str.title()
df['languages'] = languages_clean

# Quick look at the cleaned column
df[['languages']].head()

Unnamed: 0,languages
0,Indonesian
1,Indonesian
2,Indonesian
3,Indonesian
4,Indonesian


## 4. Standardisasi genre Menjadi Title Case

In [None]:
# Standardise genre to Title Case after trimming surrounding whitespace.
# No dropna() is needed: the .str methods leave missing values as NaN, which is
# exactly what the index-aligned assignment of a dropna() result produced anyway.
# This also keeps the cell consistent with how the languages column is cleaned.
df['genre'] = df['genre'].str.strip().str.title()

# Quick look at the cleaned column
df[['genre']].head()

Unnamed: 0,genre
0,Biography
1,Thriller
2,Horror
3,Adventure
4,Drama


## 5. Bersihkan votes dan Ubah Jadi Angka

In [None]:
# Clean the votes column and convert it to a numeric dtype:
#   1. cast every value to text and remove commas (presumably thousands separators)
#   2. trim surrounding whitespace
#   3. parse as a number; anything unparseable becomes NaN (this includes missing
#      values, which the cast to text turns into the string "nan")
votes_as_text = df['votes'].astype(str)
votes_as_text = votes_as_text.str.replace(',', '').str.strip()
df['votes'] = pd.to_numeric(votes_as_text, errors='coerce')

# Quick look at the cleaned column
df[['votes']].head()

Unnamed: 0,votes
0,120
1,8
2,17
3,27
4,33


## 6. Runtime, mengambil angka menit saja

In [None]:
# Keep only the first run of digits found in runtime and store it as a float.
# NOTE(review): presumably this is the length in minutes — confirm the raw format,
# since a value such as "1h 30m" would yield 1.
# expand=False makes str.extract return a Series instead of a one-column DataFrame,
# so the assignment targets a single column unambiguously. Rows without any digits
# (including missing values) become NaN.
df['runtime'] = df['runtime'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)

# Show the column that was just cleaned (this cell previously displayed votes by mistake).
df[['runtime']].head()

Unnamed: 0,votes
0,120
1,8
2,17
3,27
4,33


## 7. Menghapus Data Duplikat

In [None]:
# Remove duplicate movies, where a duplicate is a row repeating an earlier (title, year) pair.
# The counts use the same key as the drop, so the "before" figure is the number of rows
# that will actually be removed and the "after" figure confirms that none remain.
dedup_keys = ['title', 'year']
print("\nTotal duplikat sebelum dihapus:", df.duplicated(subset=dedup_keys).sum())
df = df.drop_duplicates(subset=dedup_keys)
print("Total duplikat setelah dihapus:", df.duplicated(subset=dedup_keys).sum())


Total duplikat sebelum dihapus: 0
Total duplikat setelah dihapus: 0


## 8. Data Enrichment
Menambahkan informasi baru yang berguna.

In [None]:
# Add the decade as the integer first year of that decade (e.g. 2020 for 2020-2029).
df['decade'] = (df['year'] // 10) * 10

# Simple popularity score: votes * users_rating.
# Missing votes are counted as 0; a missing users_rating still yields NaN.
df['popularity'] = df['votes'].fillna(0) * df['users_rating']

# Fixed seed so the previewed rows are the same on every run.
print(df.sample(5, random_state=42))

      movie_id                 title  year  \
1074    101075               Taksi 2  1991   
1063    101064        Double Crosser  1991   
543     100544               Keumala  2012   
752     100753      Garuda di Dadaku  2009   
1039    101040  Ranjang yang Ternoda  1994   

                                            description   genre rating  \
1074                                                NaN   Drama    NaN   
1063                                                NaN  Action    NaN   
543                                                 NaN   Drama    NaN   
752   Talented 12-year-old boy dreams of becoming a ...   Drama     SU   
1039                                                NaN   Drama    NaN   

      users_rating  votes   languages           directors  \
1074           5.4      9  Indonesian    Ismail Soebardjo   
1063           6.8     31  Indonesian              Arizal   
543            7.9     10  Indonesian  Andi Pulung Waluyo   
752            6.6    236  Indones

## 9. Menormalisasi Kolom Rating


Menormalisasi kolom rating agar lebih rapi dan lebih mudah dikelola.

In [None]:
# Rating normalisation helper
def normalize_rating(x, valid_ratings=("13+", "17+", "21+", "Not Rated"), default="Not Rated"):
    """Map a raw rating value onto a small, fixed set of labels.

    Parameters
    ----------
    x : object
        Raw rating value. Missing values (as detected by ``pd.isna``) are accepted.
    valid_ratings : collection of str, optional
        Labels that are kept as-is. Defaults to ``("13+", "17+", "21+", "Not Rated")``.
    default : str, optional
        Label returned for missing or unrecognised values. Defaults to ``"Not Rated"``.

    Returns
    -------
    str
        The whitespace-trimmed value if it is one of ``valid_ratings``,
        otherwise ``default``.
    """
    # NOTE(review): the raw data contains other labels (e.g. "SU" is visible in the
    # sample output of the enrichment step); with the default valid_ratings these
    # are collapsed into "Not Rated" — confirm this is intended.
    if pd.isna(x):  # missing value -> fallback label
        return default
    label = str(x).strip()
    return label if label in valid_ratings else default


# Apply the function to the rating column
rating_normalized = df['rating'].apply(normalize_rating)
df['rating'] = rating_normalized

# Show how many rows fall under each normalised rating label
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
Not Rated,1010
13+,161
17+,93
21+,8


## 10. Mengunduh Data yang Sudah Dibersihkan
Serangkaian langkah untuk menyimpan dan mengunduh hasil pembersihan.

In [None]:
# Write the cleaned DataFrame to a CSV in the current working directory (row index omitted).
cleaned_csv_name = "indonesian_movies_cleaned.csv"
df.to_csv(cleaned_csv_name, index=False)

# Download that file to the local computer
from google.colab import files
files.download(cleaned_csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>