In [1]:
import pandas as pd
import numpy as np
import re
import html
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import re
import html
import json
import os
from bs4 import BeautifulSoup
import time
import warnings
warnings.filterwarnings('ignore')

# Selenium imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException

# For language detection
try:
    from langdetect import detect, DetectorFactory
    # Set seed for consistent language detection
    DetectorFactory.seed = 42
    LANGDETECT_AVAILABLE = True
except ImportError:
    LANGDETECT_AVAILABLE = False
    print("langdetect not installed. Install with: pip install langdetect")

def setup_selenium_driver():
    """Setup and return a Selenium WebDriver"""
    try:
        # Try Chrome first
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # Run in headless mode
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)
        return driver
    except Exception as e:
        print(f"Chrome driver failed: {e}")
        try:
            # Try Firefox as fallback
            options = webdriver.FirefoxOptions()
            options.add_argument('--headless')
            driver = webdriver.Firefox(options=options)
            return driver
        except Exception as e:
            print(f"Firefox driver failed: {e}")
            return None

def translate_with_selenium(text, driver, translation_mapper=None, source_lang='auto', target_lang='en', max_retries=3):
    """
    Translate text using Google Translate via Selenium
    
    Parameters:
    ----------
    text : str
        Text to translate
    driver : WebDriver
        Selenium WebDriver instance
    translation_mapper : dict, optional
        Dictionary mapping source text to translations
    source_lang : str
        Source language code (default: 'auto' for auto-detection)
    target_lang : str
        Target language code (default: 'en' for English)
    max_retries : int
        Maximum number of retry attempts
    
    Returns:
    -------
    str
        Translated text
    """
    if not text or not isinstance(text, str) or len(text.strip()) == 0:
        return text
    
    # Check if we already have a translation in the mapper
    if translation_mapper is not None and text in translation_mapper:
        return translation_mapper[text]
    
    # Truncate very long text to avoid issues
    if len(text) > 5000:
        text = text[:5000] + "..."
    
    # URL with language parameters
    url = f"https://translate.google.com/?sl={source_lang}&tl={target_lang}&op=translate"
    
    for attempt in range(max_retries):
        try:
            # Navigate to Google Translate
            driver.get(url)
            time.sleep(2)  # Give page time to load
            
            # Find and fill the source text area
            source_textarea = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "textarea"))
            )
            source_textarea.clear()
            source_textarea.send_keys(text)
            
            # Wait for translation to appear
            time.sleep(3)  # Initial wait for processing
            
            # Try different possible selectors for translated text
            selectors = [
                "div[class*='result']", 
                "span[jsname='W297wb']",
                "div[jsname='W297wb']",
                "div[class*='translation']"
            ]
            
            translated_text = None
            for selector in selectors:
                try:
                    # Wait for the element to be visible
                    element = WebDriverWait(driver, 5).until(
                        EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
                    )
                    translated_text = element.text
                    if translated_text and len(translated_text.strip()) > 0:
                        break
                except:
                    continue
            
            if translated_text and len(translated_text.strip()) > 0:
                # Add to translation mapper if provided
                if translation_mapper is not None and text != translated_text:
                    translation_mapper[text] = translated_text
                return translated_text
            
            # If we couldn't find the translation, wait longer and retry
            time.sleep(2)
            
        except Exception as e:
            print(f"Translation error (attempt {attempt+1}/{max_retries}): {e}")
            time.sleep(2)  # Wait before retrying
    
    # If all attempts failed, return original text
    print(f"Failed to translate text after {max_retries} attempts")
    return text

def extract_list_items(text):
    """Extract items from a string representation of a list"""
    if not isinstance(text, str):
        return []
    
    # Use regex to extract items between quotes within brackets
    pattern = r"'([^']+)'|\"([^\"]+)\""
    matches = re.findall(pattern, text)
    
    # Flatten and clean matches
    items = []
    for match in matches:
        items.extend([m for m in match if m])
    
    return items

def rebuild_list_string(items):
    """Convert a list of items back to a string representation"""
    if not items:
        return "[]"
    
    items_str = ", ".join([f"'{item}'" for item in items])
    return f"[{items_str}]"

def clean_html(text):
    """Remove HTML tags and entities from text"""
    if not isinstance(text, str) or not text.strip():
        return ""
    
    # Remove HTML tags
    text = BeautifulSoup(str(text), 'html.parser').get_text()
    
    # Decode HTML entities
    text = html.unescape(text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def detect_language_safe(text, default='unknown'):
    """Safely detect language of text"""
    if not LANGDETECT_AVAILABLE:
        return default
        
    if not isinstance(text, str) or len(text.strip()) < 20:
        return default
        
    try:
        return detect(text)
    except:
        return default

def load_translation_mapper(filepath='translation_mapper.json'):
    """Load translation mapper from file if it exists"""
    if os.path.exists(filepath):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                mapper = json.load(f)
            print(f"Loaded {len(mapper)} translations from {filepath}")
            return mapper
        except Exception as e:
            print(f"Error loading translation mapper: {e}")
    
    # Return empty mapper if file doesn't exist or there was an error
    return {}

def save_translation_mapper(mapper, filepath='translation_mapper.json'):
    """Save translation mapper to file"""
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(mapper, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(mapper)} translations to {filepath}")
    except Exception as e:
        print(f"Error saving translation mapper: {e}")

def normalize_steam_data(file_path, output_path=None, target_lang='en', max_rows=None, 
                         mapper_path='translation_mapper.json'):
    """
    Normalize language in Steam game dataset to English
    with special handling for categories and genres
    
    Parameters:
    ----------
    file_path : str
        Path to the CSV file containing Steam game data
    output_path : str, optional
        Path to save the normalized data, if None, returns the DataFrame
    target_lang : str
        Target language code (default: 'en' for English)
    max_rows : int, optional
        Maximum number of rows to process (for testing)
    mapper_path : str
        Path to save/load the translation mapper
    
    Returns:
    -------
    pd.DataFrame or None
        Normalized dataframe if output_path is None, otherwise None
    """
    # Check for required libraries
    if not LANGDETECT_AVAILABLE:
        print("Error: langdetect is required for language detection")
        print("Install with: pip install langdetect")
        return None
    
    # Load existing translation mapper if available
    translation_mapper = load_translation_mapper(mapper_path)
    
    # Setup Selenium WebDriver
    driver = setup_selenium_driver()
    if driver is None:
        print("Error: Failed to initialize Selenium WebDriver")
        return None
    
    try:
        print(f"Loading data from {file_path}...")
        
        # Load data
        df = pd.read_csv(file_path)
        
        # Limit rows if specified (for testing)
        if max_rows is not None and max_rows > 0:
            df = df.head(max_rows)
            print(f"Limited to {max_rows} rows for testing")
        
        print(f"Processing {len(df)} rows with {len(df.columns)} columns")
        
        # Make a copy to avoid modifying the original
        normalized_df = df.copy()
        
        # 1. Process categories and genres columns first
        list_columns = ['categories', 'genres']
        
        # Check which ones actually exist in the dataframe
        existing_list_columns = [col for col in list_columns if col in normalized_df.columns]
        
        if existing_list_columns:
            print(f"Processing list columns: {existing_list_columns}")
            
            for col in existing_list_columns:
                print(f"Processing column: {col}")
                
                # Extract items from string representations
                all_items = []
                for idx, row in normalized_df.iterrows():
                    items = extract_list_items(row[col])
                    all_items.extend(items)
                
                # Get unique items
                unique_items = list(set(all_items))
                print(f"Found {len(unique_items)} unique items in {col}")
                
                # Detect language of each item and translate non-English items
                for item in unique_items:
                    if item not in translation_mapper:
                        # For categories and genres, we need to detect the language
                        # But since they are usually short, we'll translate all non-English looking ones
                        try:
                            # Look for non-ASCII characters as a simple heuristic
                            if any(ord(c) > 127 for c in item):
                                print(f"Translating: {item}")
                                translation = translate_with_selenium(
                                    item, driver, 
                                    translation_mapper=translation_mapper,
                                    source_lang='auto',
                                    target_lang=target_lang
                                )
                                print(f"  -> {translation}")
                                time.sleep(1)  # Brief pause to avoid rate limiting
                        except Exception as e:
                            print(f"Error translating '{item}': {e}")
                
                # Periodically save the mapper
                save_translation_mapper(translation_mapper, mapper_path)
                
                # Now apply translations to each row
                for idx, row in normalized_df.iterrows():
                    items = extract_list_items(row[col])
                    if items:
                        # Translate each item using the mapper
                        translated_items = [translation_mapper.get(item, item) for item in items]
                        # Create a new column with the translated items
                        new_col_name = f"{col}_en"
                        normalized_df.loc[idx, new_col_name] = rebuild_list_string(translated_items)
                        
                        # Print progress periodically
                        if idx % 100 == 0 or idx == 0 or idx == len(normalized_df) - 1:
                            print(f"Processed row {idx+1}/{len(normalized_df)} for {col}")
        
        # 2. Process text columns (with potential HTML)
        text_columns = []
        for col in normalized_df.columns:
            # Skip columns we just created
            if col.endswith('_en'):
                continue
                
            # Check column name for text indicators
            if any(marker in col.lower() for marker in ['description']):
                text_columns.append(col)
        
        if text_columns:
            print(f"\nProcessing text columns: {text_columns}")
            
            for col in text_columns:
                print(f"Cleaning HTML in '{col}'...")
                
                # Clean HTML
                normalized_df[f"{col}_cleaned"] = normalized_df[col].apply(clean_html)
                
                # Detect language for each row
                print(f"Detecting language for {col}...")
                
                normalized_df[f"{col}_language"] = normalized_df[f"{col}_cleaned"].apply(detect_language_safe)
                
                # Count languages
                lang_counts = normalized_df[f"{col}_language"].value_counts()
                print(f"Detected languages: {dict(lang_counts)}")
                
                # Translate non-English text
                non_english_rows = normalized_df[normalized_df[f"{col}_language"] != 'en']
                print(f"Found {len(non_english_rows)} rows not in English for {col}")
                
                if len(non_english_rows) > 0:
                    # Create a new column for translated text
                    normalized_df[f"{col}_en"] = normalized_df[f"{col}_cleaned"]
                    
                    # Translate non-English rows
                    for idx in non_english_rows.index:
                        source_lang = normalized_df.loc[idx, f"{col}_language"]
                        if source_lang not in ['en', 'unknown']:
                            text = normalized_df.loc[idx, f"{col}_cleaned"]
                            
                            # Check if this exact text is already in our mapper
                            if text in translation_mapper:
                                translated_text = translation_mapper[text]
                            else:
                                # Otherwise translate it
                                translated_text = translate_with_selenium(
                                    text, driver, 
                                    translation_mapper=translation_mapper,
                                    source_lang=source_lang, 
                                    target_lang=target_lang
                                )
                            
                            # Store the translation
                            normalized_df.loc[idx, f"{col}_en"] = translated_text
                            
                            # Print progress periodically
                            if idx % 10 == 0 or idx == non_english_rows.index[0] or idx == non_english_rows.index[-1]:
                                print(f"Translated row {idx}")
                                # Periodically save the mapper
                                save_translation_mapper(translation_mapper, mapper_path)
                            
                            # Brief pause to avoid rate limiting
                            time.sleep(1)
        
        # Save the final translation mapper
        save_translation_mapper(translation_mapper, mapper_path)
        print(f"Final translation mapper contains {len(translation_mapper)} entries")
        
        # Save or return the normalized data
        if output_path:
            normalized_df.to_csv(output_path, index=False)
            print(f"Saved normalized data to {output_path}")
            return None
        else:
            return normalized_df
            
    finally:
        # Always close the driver
        if driver:
            driver.quit()
            print("Selenium WebDriver closed")

In [13]:
for i, file in enumerate(os.listdir('tags')):
    # Normalize language to English using Selenium with dynamic mapper
    input_filepath = f"tags/{file}"
    output_filepath = f"translated_tags/english_steam_games_{i}.csv"
    
    # Check if output file already exists, skip if it does
    if os.path.exists(output_filepath):
        print(f"Skipping {input_filepath}, output file {output_filepath} already exists")
        continue
        
    normalize_steam_data(
        file_path=input_filepath, 
        output_path=output_filepath,
        mapper_path="translation_mapper.json",
        max_rows=None  # Set a value for testing
    )

Skipping tags/steamspy_page_29_tags_final.csv, output file translated_tags/english_steam_games_0.csv already exists
Skipping tags/steamspy_page_45_tags_final.csv, output file translated_tags/english_steam_games_1.csv already exists
Skipping tags/steamspy_page_50_tags_final.csv, output file translated_tags/english_steam_games_2.csv already exists
Skipping tags/steamspy_page_7_tags_final.csv, output file translated_tags/english_steam_games_3.csv already exists
Skipping tags/steamspy_page_21_tags_final.csv, output file translated_tags/english_steam_games_4.csv already exists
Skipping tags/steamspy_page_34_tags_final.csv, output file translated_tags/english_steam_games_5.csv already exists
Skipping tags/steamspy_page_8_tags_final.csv, output file translated_tags/english_steam_games_6.csv already exists
Skipping tags/steamspy_page_0_tags_final.csv, output file translated_tags/english_steam_games_7.csv already exists
Skipping tags/steamspy_page_42_tags_final.csv, output file translated_tags/

In [16]:
translated_tags = []
for file in os.listdir('translated_tags'):
    input_filepath = f"translated_tags/{file}"
    translated_tag = pd.read_csv(input_filepath)
    translated_tags.append(translated_tag)
translated_tags_df = pd.concat(translated_tags, ignore_index=True)

In [21]:
unique_data = translated_tags_df.drop_duplicates(subset=['appid'], keep='first', ignore_index=True)

In [23]:
unique_data.to_csv('consolidated_tags.csv')

In [24]:
unique_data

Unnamed: 0,appid,categories,genres,short_description,detailed_description,categories_en,genres_en,short_description_cleaned,short_description_language,short_description_en,detailed_description_cleaned,detailed_description_language,detailed_description_en
0,8210,"['Single-player', 'Family Sharing']","['Action', 'Adventure']",Sam &amp; Max: Episode 2 - Situation: Comedy -...,<strong>Sam &amp; Max: Episode 2 - Situation: ...,"['Single-player', 'Family Sharing']","['Action', 'Adventure']",Sam & Max: Episode 2 - Situation: Comedy - Tal...,en,Sam & Max: Episode 2 - Situation: Comedy - Tal...,Sam & Max: Episode 2 - Situation: Comedy - Tal...,en,Sam & Max: Episode 2 - Situation: Comedy - Tal...
1,37800,"['Single-player', 'Steam Achievements', 'Steam...","['Indie', 'Casual']",Experience a new dimension of puzzle action ga...,\n\t\t\t\t\t<p>Experience a new dimension of p...,"['Single-player', 'Steam Achievements', 'Steam...","['Indie', 'Casual']",Experience a new dimension of puzzle action ga...,en,Experience a new dimension of puzzle action ga...,Experience a new dimension of puzzle action ga...,en,Experience a new dimension of puzzle action ga...
2,98900,"['Single-player', 'Steam Achievements', 'Famil...","['Action', 'Casual', 'Indie', 'Strategy']",Players measure stamina with a never-ending st...,<strong>Alien Hallway</strong> is a totally ne...,"['Single-player', 'Steam Achievements', 'Famil...","['Action', 'Casual', 'Indie', 'Strategy']",Players measure stamina with a never-ending st...,en,Players measure stamina with a never-ending st...,Alien Hallway is a totally new action- strateg...,en,Alien Hallway is a totally new action- strateg...
3,262610,['Single-player'],['Audio Production'],Liquid Rhythm is a wildly innovative beat gene...,<h1>Liquid Rhythm 1.4.5</h1><p>We're excited t...,['Single-player'],['Audio Production'],Liquid Rhythm is a wildly innovative beat gene...,en,Liquid Rhythm is a wildly innovative beat gene...,Liquid Rhythm 1.4.5We're excited to announce t...,en,Liquid Rhythm 1.4.5We're excited to announce t...
4,264020,"['Single-player', 'Steam Achievements', 'Steam...",['Utilities'],Geekbench is a cross-platform processor benchm...,<strong>Find out how fast your processor is wi...,"['Single-player', 'Steam Achievements', 'Steam...",['Utilities'],Geekbench is a cross-platform processor benchm...,en,Geekbench is a cross-platform processor benchm...,Find out how fast your processor is with Geekb...,en,Find out how fast your processor is with Geekb...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47658,3204650,"['Single-player', 'Steam Achievements', 'Steam...","['Adventure', 'Indie']",HANGAR 8 is a psychological horror set within ...,"<p class=""bb_paragraph"">You wake up gasping fo...","['Single-player', 'Steam Achievements', 'Steam...","['Adventure', 'Indie']",HANGAR 8 is a psychological horror set within ...,en,HANGAR 8 is a psychological horror set within ...,"You wake up gasping for air, but your lungs bu...",en,"You wake up gasping for air, but your lungs bu..."
47659,3221490,"['Single-player', 'Family Sharing']",['Simulation'],“Can We Be Three?” is a playful and heartfelt ...,Yolanda Rogers' career just hit rock bottom. H...,"['Single-player', 'Family Sharing']",['Simulation'],“Can We Be Three?” is a playful and heartfelt ...,en,“Can We Be Three?” is a playful and heartfelt ...,Yolanda Rogers' career just hit rock bottom. H...,en,Yolanda Rogers' career just hit rock bottom. H...
47660,3261630,"['Single-player', 'Steam Achievements', 'Steam...","['Adventure', 'Casual', 'RPG']",Dark Humor &amp; Satire: Experience a story th...,"<p class=""bb_paragraph"">In HOOD rpg, join Leon...","['Single-player', 'Steam Achievements', 'Steam...","['Adventure', 'Casual', 'RPG']",Dark Humor & Satire: Experience a story that b...,en,Dark Humor & Satire: Experience a story that b...,"In HOOD rpg, join Leonard and Lyzol on their c...",en,"In HOOD rpg, join Leonard and Lyzol on their c..."
47661,3286030,"['Single-player', 'Family Sharing']","['Indie', 'Strategy']",An occult themed turn based strategy game wher...,"<p class=""bb_paragraph""><img class=""bb_img"" sr...","['Single-player', 'Family Sharing']","['Indie', 'Strategy']",An occult themed turn based strategy game wher...,en,An occult themed turn based strategy game wher...,Boardguard is a turn-based strategy roguelike ...,en,Boardguard is a turn-based strategy roguelike ...
