In [4]:
import pandas as pd

# 1. Load the combined dataset before cleaning.
# NOTE: the path is relative, so it resolves against the kernel's working directory;
# the recorded run of this cell failed with FileNotFoundError on this exact path.
df = pd.read_csv('../data/clean/combined_before_title_cleaning.csv')

# 2. Normalize column names to lowercase
df.columns = df.columns.str.lower()

# 3. Normalize the 'name' column.
# Whitespace is collapsed and stripped *after* special characters are removed:
# dropping punctuation can expose new leading/trailing spaces
# (e.g. "mario !" -> "mario "), which would make the same title look like
# two different names and defeat the de-duplication below.
df['name'] = (
    df['name']
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", "", regex=True)   # remove special characters
    .str.replace(r"\s+", " ", regex=True)         # collapse runs of whitespace
    .str.strip()                                  # trim what is left at the edges
)

# 4. Count how many games appear on multiple platforms
name_platform_counts = df.groupby('name')['platform'].nunique().sort_values(ascending=False)
multi_platform_titles = name_platform_counts[name_platform_counts > 1]

print(f"Multiplatform titles: {len(multi_platform_titles)}")

# 5. Convert year_of_release to integer (only after dropping nulls).
# dropna/astype each return a new frame, so no column is assigned on a
# filtered slice (the pattern that triggers SettingWithCopyWarning).
df = (
    df
    .dropna(subset=['year_of_release'])
    .astype({'year_of_release': int})
)

# 6. Drop duplicates (same game, platform, and release year)
dedup_keys = ['name', 'platform', 'year_of_release']
df_dedup = df.drop_duplicates(subset=dedup_keys, keep='first').copy()

print(f"Dataset after eliminating duplicates: {df_dedup.shape}")

# 7. Sanity check: no (name, platform, year) combination may remain twice
assert df_dedup.duplicated(subset=dedup_keys).sum() == 0, (
    "duplicate (name, platform, year_of_release) rows remain after drop_duplicates"
)


FileNotFoundError: [Errno 2] No such file or directory: '../data/clean/combined_before_title_cleaning.csv'

In [None]:
# Diagnostic: print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)


In [None]:
# Null check: number of missing values in each column of the de-duplicated frame
missing_per_column = df_dedup.isna().sum()
print("Missing values:\n", missing_per_column)

# Duplicate check: flag rows that repeat an earlier (name, platform, year) combination
dup_check = df_dedup.duplicated(
    subset=['name', 'platform', 'year_of_release'],
)
duplicate_total = dup_check.sum()
print(f"\nDuplicates: {duplicate_total}")


In [None]:
# Genre labels that are renamed to a preferred spelling.
# NOTE(review): these keys are lowercase, but the visible cells only lowercase the
# column names and the 'name' column -- confirm genre values are lowercased upstream,
# otherwise these replacements match nothing.
genre_renames = {
    'platform': 'platformer',
    'role-playing': 'rpg'
}

# Genres folded into a single 'other' bucket.
rare_genres = ['party', 'mmo', 'board game', 'education', 'sandbox']


def _normalize_genre(genre: pd.Series) -> pd.Series:
    """Return `genre` with renamed labels applied and rare genres mapped to 'other'."""
    return genre.replace(genre_renames).replace(rare_genres, 'other')


df['genre'] = _normalize_genre(df['genre'])

# df_dedup is an independent copy made in the de-duplication step and is the frame
# that gets written to disk, so it must be normalized as well; updating only `df`
# would leave the saved file with the raw genre labels.
df_dedup['genre'] = _normalize_genre(df_dedup['genre'])

In [None]:
# Lookup from a rating code to the audience group used in the analysis.
# (The codes look like ESRB ratings -- confirm against the data source.)
# Built from ordered (code, audience) pairs; 'Unrated' maps to itself.
rating_map = dict([
    ('E', 'Children'),
    ('EC', 'Children'),
    ('E10+', 'Pre-teens'),
    ('T', 'Teens'),
    ('M', 'Adults'),
    ('AO', 'Adults'),
    ('RP', 'Unrated'),
    ('K-A', 'Children'),
    ('Unrated', 'Unrated'),
])


In [None]:
# Collapse rating codes in `df` into audience groups; unknown or missing codes
# become 'Unrated'. Values that are already audience labels are kept unchanged,
# so re-running this cell does not map them to NaN and overwrite the whole
# column with 'Unrated'.
df['rating'] = (
    df['rating']
    .where(df['rating'].isin(set(rating_map.values())), df['rating'].map(rating_map))
    .fillna('Unrated')
)


In [None]:
# Collapse rating codes in `df_dedup` into audience groups; unknown or missing
# codes become 'Unrated'. Values that are already audience labels are kept
# unchanged, so re-running this cell does not map them to NaN and overwrite the
# whole column with 'Unrated'.
df_dedup['rating'] = (
    df_dedup['rating']
    .where(
        df_dedup['rating'].isin(set(rating_map.values())),
        df_dedup['rating'].map(rating_map),
    )
    .fillna('Unrated')
)

In [None]:
# Write the de-duplicated frame to disk; index=False leaves the row index out of the CSV.
# The path is relative to the kernel's working directory.
final_csv_path = '../data/clean/video_game_sales_final_cleaned.csv'
df_dedup.to_csv(final_csv_path, index=False)


NameError: name 'df_dedup' is not defined

In [None]:
# Display the column labels of `df` as the cell output.
df.columns