In [None]:
import pandas as pd

# 1. Read the combined (pre title-cleaning) dataset from disk
raw_csv_path = 'C:/Users/PcVIP/Desktop/Bootcamp/Week7/Weekly_project_ml/video_game_sales/data/clean/combined_before_title_cleaning.csv'
df = pd.read_csv(raw_csv_path)

# 2. Lowercase every column label; the rest of the notebook refers to
#    columns by their lowercase names ('name', 'platform', ...)
df = df.rename(columns=str.lower)

# 3. Normalize the 'name' column into a comparison key, 'name_clean'.
#    Order matters: trimming must happen LAST. Removing special characters
#    can expose new leading/trailing spaces (e.g. "foo !" -> "foo "), and the
#    whitespace pass only collapses runs of spaces, it does not trim them.
#    Missing names stay NaN because the .str accessor propagates NaN.
df['name_clean'] = (
    df['name']
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", "", regex=True)   # remove special characters
    .str.replace(r"\s+", " ", regex=True)         # collapse repeated whitespace
    .str.strip()                                  # trim what the steps above left behind
)

# 4. For each normalized title, count the distinct platforms it appears on
#    (largest counts first)
name_platform_counts = (
    df.groupby('name_clean')['platform']
    .nunique()
    .sort_values(ascending=False)
)

# Titles present on at least two platforms
multi_platform_titles = name_platform_counts.loc[name_platform_counts.gt(1)]

print(f"Multiplatform titles: {multi_platform_titles.shape[0]}")

# 5. Convert year_of_release to integer.
#    Rows without a release year are dropped first because NaN cannot be
#    cast to int. dropna() and astype() each return a new frame, so no column
#    is ever assigned into a boolean-filtered slice (the pattern that raises
#    pandas' SettingWithCopyWarning).
df = (
    df.dropna(subset=['year_of_release'])
    .astype({'year_of_release': int})
)

# 6. A row is a duplicate when it repeats the same normalized title,
#    platform and release year; the first occurrence is the one kept.
dedup_keys = ['name_clean', 'platform', 'year_of_release']
df_dedup = df.drop_duplicates(subset=dedup_keys, keep='first').copy()

print(f"Dataset after eliminating duplicates: {df_dedup.shape}")

# 7. Sanity check: no key combination may occur more than once
remaining_duplicates = df_dedup.duplicated(subset=dedup_keys).sum()
assert remaining_duplicates == 0


In [None]:
import sys

# Show which Python interpreter this kernel is running on
kernel_python = sys.executable
print(kernel_python)


In [None]:
!pip install fuzzywuzzy python-Levenshtein


In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Example: the 30 closest normalized titles to a single query string
unique_titles = df_dedup['name_clean'].unique()
process.extract("pokemon", unique_titles, limit=30)


In [None]:
from fuzzywuzzy import process

# Franchise keywords to look up among the normalized game titles
franchises = ['pokemon', 'fifa', 'call of duty', 'mario', 'zelda', 'nba',
              'lego', 'assassins creed', 'need for speed', 'resident evil']

# The candidate list is identical for every keyword, so build it once instead
# of recomputing unique() on each iteration. Rows with a missing title are
# excluded: NaN is not a meaningful candidate for string matching.
candidate_titles = df_dedup['name_clean'].dropna().unique()

# franchise keyword -> up to 30 best fuzzy matches returned by process.extract
matches = {
    keyword: process.extract(keyword, candidate_titles, limit=30)
    for keyword in franchises
}


In [None]:
# Print every franchise keyword followed by its matched titles and scores
for franchise in matches:
    header = f"\nTop matches for: {franchise.upper()}"
    print(header)
    for match in matches[franchise]:
        title, score = match
        print(f"- {title} ({score})")


In [None]:
# Missing values per column in the deduplicated frame
missing_per_column = df_dedup.isna().sum()
print("Missing values:\n", missing_per_column)

# Rows that repeat an earlier (title, platform, year) combination
dup_check = df_dedup.duplicated(subset=['name_clean', 'platform', 'year_of_release'])
duplicate_rows = dup_check.sum()
print(f"\nDuplicates: {duplicate_rows}")


In [None]:
# Discard rows where any of the required fields is missing
required_columns = ['name', 'genre', 'name_clean']
df_dedup = df_dedup.dropna(subset=required_columns, how='any')


In [None]:
# Consolidate genre labels on df_dedup, the frame that is exported below.
# Applying these replacements to the pre-dedup `df` would leave the exported
# data unchanged.
# NOTE(review): the keys below are lowercase; this assumes genre values were
# already lowercased upstream — TODO confirm, otherwise nothing is replaced.
genre_aliases = {
    'platform': 'platformer',
    'role-playing': 'rpg'
}

# Low-frequency genres are folded into a single 'other' bucket
rare_genres = ['party', 'mmo', 'board game', 'education', 'sandbox']

# assign() builds a new frame, so nothing is written into a filtered slice
# of an earlier DataFrame (avoids SettingWithCopyWarning).
df_dedup = df_dedup.assign(
    genre=df_dedup['genre'].replace(genre_aliases).replace(rare_genres, 'other')
)

In [None]:
# Write the cleaned, deduplicated dataset to disk without the index column
final_csv_path = 'C:/Users/PcVIP/Desktop/Bootcamp/Week7/Weekly_project_ml/video_game_sales/data/clean/video_game_sales_final_cleaned.csv'
df_dedup.to_csv(final_csv_path, index=False)
