In [4]:
# Import required libraries
import pandas as pd
from google.colab import drive

# 1. Attach Google Drive to the Colab runtime.
#    force_remount=True is passed so the mount is requested again even when
#    this cell is re-run.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

# -----------------------------------
# MERGING THE DATASETS
# -----------------------------------

# 2. Read the two source CSV files from the project folder on Drive
steam_csv_path = "/content/drive/My Drive/DSA210/project/steam_store_games.csv"
hltb_csv_path = "/content/drive/My Drive/DSA210/project/howlongtobeat_data.csv"

steam_games = pd.read_csv(steam_csv_path)  # Steam dataset
howlongtobeat_data = pd.read_csv(hltb_csv_path)  # HLTB dataset

# 3. Preview the first rows of each dataset as a quick sanity check
steam_preview = steam_games.head()
print("First 5 rows of the Steam dataset:")
print(steam_preview)

hltb_preview = howlongtobeat_data.head()
print("\nFirst 5 rows of the HLTB dataset:")
print(hltb_preview)

# 4. Inner-join the two datasets: Steam's 'name' column is matched against
#    HLTB's 'title' column, so only games present in both frames are kept.
#    NOTE(review): the join compares the raw strings exactly; differences in
#    case or whitespace between the two sources would prevent a match.
merged_df = steam_games.merge(
    howlongtobeat_data,
    how='inner',
    left_on='name',
    right_on='title',
)

# 5. Preview the joined result
merged_preview = merged_df.head()
print("\nFirst 5 rows of the merged dataset:")
print(merged_preview)

# 6. Persist the raw merge (without the index column) next to the inputs
merged_csv_path = '/content/drive/My Drive/DSA210/project/merged_data.csv'
merged_df.to_csv(merged_csv_path, index=False)
print("\nMerged dataset saved as 'merged_data.csv'.")

# -----------------------------------
# CLEANING THE MERGED DATASET
# -----------------------------------

# 7. Columns to discard from the merged frame.
#    Presumably the *_x / *_y names are the suffixes pandas gave to columns
#    that exist in both sources -- confirm against the merged header.
columns_to_drop = [
    'appid',
    'english',
    'developer',
    'publisher',
    'genres_x',
    'achievements',
    'id',
    'title',
    'type',
    'genres_y',
    'release_na',
    'release_eu',
    'release_jp',
    'categories',
    'coop',
    'versus',
    'publishers',
    'platforms_x',
    'platforms_y',
]

# 8. Drop the unwanted columns, then expose steamspy_tags under the name
#    'genres'. errors='ignore' makes the drop skip any listed column that is
#    not present instead of raising.
merged_df = (
    merged_df
    .drop(columns=columns_to_drop, errors='ignore')
    .rename(columns={'steamspy_tags': 'genres'})
)

# 9. Coerce every time-related column to a numeric dtype.
#    errors='coerce' turns values that cannot be parsed into NaN.
time_columns = [
    'average_playtime',
    'median_playtime',
    'main_story',
    'main_plus_extras',
    'completionist',
    'all_styles',
]
for col in time_columns:
    numeric_values = pd.to_numeric(merged_df[col], errors='coerce')
    merged_df[col] = numeric_values

# 10. Steam playtime columns: divide by 60 (minutes -> hours) and keep
#     2 decimal places
for col in ('average_playtime', 'median_playtime'):
    playtime_hours = merged_df[col] / 60
    merged_df[col] = playtime_hours.round(2)

# 11. HLTB columns are only rounded to 2 decimal places; no unit conversion
#     is applied to them here
hltb_time_columns = [
    'main_story',
    'main_plus_extras',
    'completionist',
    'all_styles',
]
for col in hltb_time_columns:
    rounded_values = merged_df[col].round(2)
    merged_df[col] = rounded_values

# 12. Print a sample of the time-related columns to check units and rounding
def _format_two_decimals(value):
    """Return *value* as text with 2 decimals, or "NaN" when it is missing."""
    if pd.notnull(value):
        return format(value, ".2f")
    return "NaN"


print("\nSample Time-Related Columns (to verify units and rounding):")
# The string formatting is applied to a copy, for display purposes only;
# merged_df itself keeps its numeric values.
formatted_df = merged_df.copy()
for col in time_columns:
    formatted_df[col] = formatted_df[col].apply(_format_two_decimals)
time_sample = formatted_df[time_columns].head()
print(time_sample)

# 13. Show the cleaned dataset: first rows, overall shape and column names
print("\nUpdated Dataset - First 5 Rows:")
print(formatted_df.head())
dataset_shape = merged_df.shape
print("\nDataset Shape:", dataset_shape)
column_names = merged_df.columns.tolist()
print("\nColumns in the Dataset:", column_names)

# 14. Write the cleaned dataset to its own CSV file (index column omitted).
#     float_format='%.2f' is passed so floating-point values are written
#     with 2 decimal places.
clean_csv_path = '/content/drive/My Drive/DSA210/project/clean_merged_data.csv'
merged_df.to_csv(
    clean_csv_path,
    index=False,
    float_format='%.2f',
)
print("\nUpdated dataset saved as 'clean_merged_data.csv'.")

Mounted at /content/drive
First 5 rows of the Steam dataset:
   appid                       name release_date  english         developer  \
0     10             Counter-Strike   2000-11-01        1             Valve   
1     20      Team Fortress Classic   1999-04-01        1             Valve   
2     30              Day of Defeat   2003-05-01        1             Valve   
3     40         Deathmatch Classic   2001-06-01        1             Valve   
4     50  Half-Life: Opposing Force   1999-11-01        1  Gearbox Software   

  publisher          platforms  required_age  \
0     Valve  windows;mac;linux             0   
1     Valve  windows;mac;linux             0   
2     Valve  windows;mac;linux             0   
3     Valve  windows;mac;linux             0   
4     Valve  windows;mac;linux             0   

                                          categories  genres  \
0  Multi-player;Online Multi-Player;Local Multi-P...  Action   
1  Multi-player;Online Multi-Player;Local Multi