In [None]:
import pandas as pd

# Load the raw sales CSV (snapshot dated 22 Dec 2016 according to its file name)
# for a first look at the data.
df = pd.read_csv('../raw/Video_Games_Sales_as_at_22_Dec_2016.csv')
df.info()  # prints column dtypes and non-null counts
df.head()  # last expression, so it is rendered as the cell output

In [None]:
# Missing values per column, columns with the most gaps listed first.
df.isna().sum().sort_values(ascending=False)


In [None]:
# Working copy of the 2016 data without the review-score and developer columns.
# It is derived from `df` (loaded in the first cell from the same CSV) instead of
# reading the file from disk a second time, and uses a non-mutating drop() so
# that `df` stays intact and this cell can be re-run safely.
df_old = df.drop(columns=[
    'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer'
])


In [None]:
# Preview the first rows after the column drop.
df_old.head()


In [None]:
# Column labels remaining in the working frame.
df_old.columns


In [None]:
# Dtypes and non-null counts of the remaining columns.
df_old.info()


In [None]:
# Replace missing publisher / rating labels with explicit placeholder values.
df_old = df_old.assign(
    Publisher=df_old['Publisher'].fillna('Unknown'),
    Rating=df_old['Rating'].fillna('Unrated'),
)


In [None]:
# Normalise the text columns: trim surrounding whitespace everywhere,
# lower-case genre and title, upper-case the platform code.
df_old = df_old.assign(
    Genre=df_old['Genre'].str.lower().str.strip(),
    Platform=df_old['Platform'].str.upper().str.strip(),
    Publisher=df_old['Publisher'].str.strip(),
    Rating=df_old['Rating'].str.strip(),
    Name=df_old['Name'].str.lower().str.strip(),
)


In [None]:
# Number of rows that have no release year.
df_old['Year_of_Release'].isna().sum()


In [None]:
# Keep only rows with a known release year.
# .copy() detaches the filtered result from the original frame so that the
# column assignments made in later cells do not raise SettingWithCopyWarning
# (the same pattern is used when filtering df_new on Release_Date).
df_old = df_old[df_old['Year_of_Release'].notna()].copy()


In [None]:
# Rows without a year were removed above, so the column can be cast to integers.
df_old = df_old.astype({'Year_of_Release': int})


In [None]:
# Spot-check the normalised columns.
# A notebook cell renders only its last bare expression, so every preview is
# passed to display() (provided by the IPython kernel) to make all of them visible.
display(
    df_old['Genre'].unique()[:10],
    df_old['Platform'].unique()[:10],
    df_old['Publisher'].unique()[:10],
    df_old['Rating'].unique(),
    df_old['Name'].head(10),
)


In [None]:
# Alphabetical list of the distinct genre, platform and rating labels.
# display() is used because a cell renders only its last bare expression;
# without it the genre and platform lists would be computed and discarded.
display(
    sorted(df_old['Genre'].dropna().unique()),
    sorted(df_old['Platform'].dropna().unique()),
    sorted(df_old['Rating'].dropna().unique()),
)


In [None]:
# Frequency of the most common genres / platforms and of every rating label.
# display() is used because a cell renders only its last bare expression;
# without it the genre and platform counts would never be shown.
display(
    df_old['Genre'].value_counts().head(10),
    df_old['Platform'].value_counts().head(10),
    df_old['Rating'].value_counts(),
)


In [None]:
# Collapse the ESRB rating codes into broad audience groups.
rating_map = {
    'E': 'Children',
    'EC': 'Children',
    # K-A ("Kids to Adults") is the pre-1998 ESRB name of the E rating, so it
    # belongs in the same group as 'E' rather than with the unrated titles.
    'K-A': 'Children',
    'E10+': 'Pre-teens',
    'T': 'Teens',
    'M': 'Adults',
    'AO': 'Adults',
    'RP': 'Unrated',  # "Rating Pending": no final rating assigned
    'Unrated': 'Unrated'
}

# Any code that is not a key of rating_map maps to NaN and is then labelled 'Unrated'.
df_old['Rating_Grouped'] = df_old['Rating'].map(rating_map).fillna('Unrated')


In [None]:
# Sanity-check the grouping: a few raw/grouped pairs plus the set of group labels.
# display() is used because a cell renders only its last bare expression;
# without it the side-by-side preview would never be shown.
display(
    df_old[['Rating', 'Rating_Grouped']].head(5),
    df_old['Rating_Grouped'].unique(),
)


In [None]:
# Full list of distinct platform codes in the 2016 data.
df_old['Platform'].unique()


In [None]:
# Full list of distinct genre labels in the 2016 data.
df_old['Genre'].unique()


In [None]:
# Load the second sales file (named for 2024) that will be merged with the 2016 data.
# The path previously contained a stray double slash ('..//raw').
df_new = pd.read_csv('../raw/2024_videogames_sales.csv')
df_new.info()  # prints column dtypes and non-null counts
# head() must come last: only the final expression of a cell is rendered, so in
# the previous order (head() before info()) the preview was never displayed.
df_new.head()


In [None]:
# Align the column names of the new file with those used in df_old.
df_new = df_new.rename(
    columns={
        'title': 'Name',
        'console': 'Platform',
        'genre': 'Genre',
        'publisher': 'Publisher',
        'total_sales': 'Global_Sales',
        'na_sales': 'NA_Sales',
        'jp_sales': 'JP_Sales',
        'pal_sales': 'EU_Sales',
        'other_sales': 'Other_Sales',
        'release_date': 'Release_Date',
    }
)

In [None]:
# Parse the release date (unparseable values become NaT), discard rows without
# a valid date, then derive the integer release year from what is left.
df_new = (
    df_new
    .assign(Release_Date=lambda frame: pd.to_datetime(frame['Release_Date'], errors='coerce'))
    .dropna(subset=['Release_Date'])
    .assign(Year_of_Release=lambda frame: frame['Release_Date'].dt.year.astype(int))
)



In [None]:
# Same text normalisation as applied to df_old: trimmed strings, lower-case
# genre and title, upper-case platform code, 'Unknown' for a missing publisher.
df_new = df_new.assign(
    Genre=df_new['Genre'].str.lower().str.strip(),
    Platform=df_new['Platform'].str.upper().str.strip(),
    Publisher=df_new['Publisher'].fillna('Unknown').str.strip(),
    Name=df_new['Name'].str.lower().str.strip(),
)

In [None]:
# Keep only the identifying columns and the five sales figures, in this order.
df_new = df_new.loc[:, [
    'Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales',
]]

In [None]:
# Distribution of release years: summary statistics and number of rows per year.
# display() is used because a cell renders only its last bare expression;
# without it the describe() summary would never be shown.
display(
    df_new['Year_of_Release'].describe(),
    df_new['Year_of_Release'].value_counts().sort_index(),
)


In [None]:
# Restrict to release years 1980-2024 (between() includes both endpoints).
# .copy() detaches the filtered result so that the column assignments made in
# later cells do not raise SettingWithCopyWarning.
df_new = df_new[df_new['Year_of_Release'].between(1980, 2024)].copy()


In [None]:
# Missing values per column in the new data, largest count first.
df_new.isna().sum().sort_values(ascending=False)


In [None]:
# Compare the genre vocabularies of the two sources.
# display() is used because a cell renders only its last bare expression;
# without it the df_new list would be computed and discarded.
display(
    sorted(df_new['Genre'].dropna().unique()),
    sorted(df_old['Genre'].dropna().unique()),
)


In [None]:
# Shared genre vocabulary for the combined dataset. Labels that are not keys of
# this map are left unchanged by replace().
genre_map = {
    'action-adventure': 'action',
    'fighting': 'action',
    'misc': 'other',
    'platform': 'platformer',
    'shooter': 'shooter',
    'simulation': 'simulation',
    'sports': 'sports',
    'racing': 'racing',
    'role-playing': 'rpg',
    'puzzle': 'puzzle',
    'strategy': 'strategy',
}

# Apply the map to BOTH sources. Mapping only df_new would leave df_old with its
# original labels, so the concatenated frame would hold two names for the same
# genre (whenever df_old contains any of the keys above, e.g. 'platform' next to
# 'platformer'). assign() returns fresh frames, so this step is safe to re-run
# and does not raise SettingWithCopyWarning.
df_new = df_new.assign(Genre=df_new['Genre'].replace(genre_map))
df_old = df_old.assign(Genre=df_old['Genre'].replace(genre_map))


In [None]:
# Count rows that repeat an earlier (Name, Platform, Year_of_Release) combination.
df_new.duplicated(subset=['Name', 'Platform', 'Year_of_Release']).sum()


In [None]:
# Treat a missing sales figure as zero in every sales column.
sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
df_new = df_new.fillna(dict.fromkeys(sales_cols, 0))


In [None]:
# Render the cleaned frame as the cell output for a final visual check before merging.
df_new

In [None]:
# Stack the two sources; df_old rows come first.
# df_new carries no Rating / Rating_Grouped columns, so concat leaves NaN in them
# for every df_new row. They get the same 'Unrated' placeholder that was applied
# to df_old, so the combined frame has no missing values in either column.
df_combined = (
    pd.concat([df_old, df_new], ignore_index=True)
    .fillna({'Rating': 'Unrated', 'Rating_Grouped': 'Unrated'})
)


In [None]:
# One row per (Name, Platform, Year_of_Release); the first occurrence is kept,
# which is the df_old row when a title appears in both sources.
df_combined = df_combined.drop_duplicates(
    subset=['Name', 'Platform', 'Year_of_Release']
)


In [None]:
# Report the (rows, columns) size of the merged frame and preview its first rows.
print(f"Combined dataset shape: {df_combined.shape}")
df_combined.head()


In [None]:
# Persist the merged frame without the index column.
# NOTE(review): to_csv does not create directories — assumes '../clean/' already exists.
df_combined.to_csv('../clean/combined_before_title_cleaning.csv', index=False)
