In [186]:
import pandas as pd
import ast  # For safely converting string representations of objects
import re   # For regular expressions

# --- 1. LOAD THE PREVIOUSLY CLEANED DATA ---
print("Step 1: Loading the cleaned and exploded dataset...")
# Read the CSV into the working DataFrame used by every step below.
df = pd.read_csv('steam_data_cleaned_exploded.csv')


# --- 2. APPLYING FINAL TWEAKS & TRANSFORMATIONS ---

print("Step 2: Applying final cleaning and feature engineering...")

# --- Task: Clean 'controller_support' to boolean ---
# Only the exact value 'full' becomes True; every other value (NaN included)
# compares unequal and therefore becomes False.
df['controller_support'] = df['controller_support'].eq('full')


# --- Task: Clean and Engineer Language Features ---
# Merge the two language columns into one: keep 'supported_languages' where it
# is present and fall back to 'languages' only for rows where it is missing.
df['languages_combined'] = df['supported_languages'].fillna(
    df['languages']
)

# Helper function to strip HTML tags
def clean_html(text):
    """Return *text* with every ``<...>`` tag removed.

    Anything that is not a ``str`` (for example a NaN float coming from a
    missing cell) yields an empty string instead.
    """
    if not isinstance(text, str):
        return ''  # Missing / non-text values collapse to an empty string
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

# Strip markup from the combined language text (missing values become '').
df['languages_clean'] = df['languages_combined'].map(clean_html)

# Create a 'language_count' column: number of comma-separated entries,
# or 0 when the cleaned text is empty.
df['language_count'] = df['languages_clean'].map(
    lambda langs: (langs.count(',') + 1) if langs else 0
)

# Create an 'is_english' boolean column: case-insensitive search for
# 'English' anywhere in the cleaned text.
df['is_english'] = df['languages_clean'].str.contains(
    'English', na=False, case=False
)


# --- Task: Extract Achievement Count ---
# Helper function to safely parse the dictionary-like string and get the total
def get_achievement_total(dict_str):
    """Return the integer ``'total'`` from an achievements record.

    Args:
        dict_str: Usually the string representation of a Python dict
            (e.g. ``"{'total': 12, 'highlighted': [...]}"``). An
            already-parsed ``dict`` is accepted as well. Any other value
            (NaN, ``None``, malformed text, a non-dict literal) is treated
            as "no achievements".

    Returns:
        The value stored under ``'total'`` converted to ``int``, or ``0``
        when the input cannot be parsed, is not a dict, has no ``'total'``
        key, or holds a ``'total'`` that is not convertible to an integer.
    """
    if isinstance(dict_str, dict):
        ach_dict = dict_str
    else:
        try:
            # literal_eval only evaluates Python literals, so it is safe to
            # run on text read from a data file.
            ach_dict = ast.literal_eval(dict_str)
        except (ValueError, SyntaxError, TypeError, NameError,
                MemoryError, RecursionError):
            return 0  # NaN, malformed, or pathologically nested input
        if not isinstance(ach_dict, dict):
            return 0

    total = ach_dict.get('total', 0)
    try:
        # Coerce so the resulting column stays numeric even when the record
        # stores the total as None, a string, or a float.
        return int(total)
    except (TypeError, ValueError, OverflowError):
        return 0

# Derive the numeric achievement total for each row of 'achievements'.
df['achievement_count'] = df['achievements'].map(get_achievement_total)


# --- 3. FINAL COLUMN DROPS ---
print("Step 3: Dropping all unnecessary and old columns...")
columns_to_drop = [
    # Columns you asked to remove
    'dlc',
    'fullgame',
    'demos',
    # Original messy achievements column
    'achievements',
    # Original messy language columns
    'supported_languages',
    'languages',
    # Helper columns
    'languages_combined',
    'languages_clean',
]
# errors='ignore' skips any listed column that is not present in the frame.
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)


print("\n--- FINAL CLEANING COMPLETE ---")

# --- 4. FINAL DATAFRAME PREVIEW ---
print("\nFinal DataFrame Information:")
df.info()

print("\nFinal DataFrame Head:")
# Lift the column display limit so the preview shows every column.
pd.set_option('display.max_columns', None)
display(df.head(n=5))


# --- 5. SAVE THE FINAL CLEAN DATA ---
print("\nStep 5: Saving the final clean data to 'steam_data_final_version.csv'...")
# index=False keeps the row index out of the written file.
df.to_csv('steam_data_final_version.csv', index=False)
print("Done! This is the final file for your project.")

Step 1: Loading the cleaned and exploded dataset...
Step 2: Applying final cleaning and feature engineering...
Step 3: Dropping all unnecessary and old columns...

--- FINAL CLEANING COMPLETE ---

Final DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81311 entries, 0 to 81310
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   appid                 81311 non-null  int64  
 1   required_age          81311 non-null  float64
 2   is_free               81311 non-null  bool   
 3   controller_support    81311 non-null  bool   
 4   developer             81311 non-null  object 
 5   publisher             81311 non-null  object 
 6   name                  81302 non-null  object 
 7   positive              81311 non-null  int64  
 8   negative              81311 non-null  int64  
 9   average_forever       81311 non-null  int64  
 10  average_2weeks        81311 non-null  int64  
 1

Unnamed: 0,appid,required_age,is_free,controller_support,developer,publisher,name,positive,negative,average_forever,average_2weeks,median_forever,median_2weeks,genre,ccu,genre.1,owners_estimated_avg,price_usd,language_count,is_english,achievement_count
0,10,0.0,False,False,['Valve'],['Valve'],Counter-Strike,124534,3339,17612,709,317,26,Action,14923,Action,15000000,9.99,8,True,0
1,20,0.0,False,False,['Valve'],['Valve'],Team Fortress Classic,3318,633,277,15,62,15,Action,87,Action,7500000,4.99,9,True,0
2,30,0.0,False,False,['Valve'],['Valve'],Day of Defeat,3416,398,187,0,34,0,Action,130,Action,7500000,4.99,5,True,0
3,40,0.0,False,False,['Valve'],['Valve'],Deathmatch Classic,1273,267,258,0,184,0,Action,4,Action,7500000,4.99,9,True,0
4,50,0.0,False,False,['Gearbox Software'],['Valve'],Half-Life: Opposing Force,5250,288,624,0,415,0,Action,71,Action,7500000,4.99,4,True,0



Step 5: Saving the final clean data to 'steam_data_final_version.csv'...
Done! This is the final file for your project.
