In [9]:
import pyspark
import pandas

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import findspark
import psutil

# 1. Properly initialize Spark environment
findspark.init()

# 2. Check available memory before starting
mem = psutil.virtual_memory()
print(f"Available memory: {mem.available/1024**3:.1f} GB")

# 3. Configure Spark with memory settings
spark = SparkSession.builder \
    .appName("Combine Player Datasets") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.sql.shuffle.partitions", "100") \
    .config("spark.network.timeout", "600s") \
    .config("spark.driver.port", "7078") \  # Explicit port
    .getOrCreate()

# 4. Verify successful connection
try:
    print(spark.version)
    print("Spark connection successful!")
except Exception as e:
    print(f"Spark connection failed: {str(e)}")
    exit(1)

# Rest of your data processing code...
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Combine Player Datasets") \
    .getOrCreate()

# Load the three datasets
# Assuming the datasets are in CSV format with headers
# Replace the file paths with your actual data sources
player_valuations = spark.read.csv("data/raw/kaggle/player-scores/player_valuations.csv", header=True, inferSchema=True)
players_stats = spark.read.csv("data/raw/kaggle/player-scores/players.csv", header=True, inferSchema=True)
transfers = spark.read.csv("data/raw/kaggle/player-scores/transfers.csv", header=True, inferSchema=True)

# Display the schema of each dataset to understand their structure
print("Player Valuations Schema:")
player_valuations.printSchema()

print("Players Stats Schema:")
players_stats.printSchema()

print("Transfers Schema:")
transfers.printSchema()

# Get column names from each dataset to identify duplicates
valuation_cols = set(player_valuations.columns)
stats_cols = set(players_stats.columns)
transfers_cols = set(transfers.columns)

# Identify duplicate columns (excluding player_id which we use for joining)
duplicate_cols_stats = [col for col in stats_cols if col in valuation_cols and col != "player_id"]
duplicate_cols_transfers = [col for col in transfers_cols if (col in valuation_cols or col in stats_cols) and col != "player_id"]

print(f"Duplicate columns in players_stats: {duplicate_cols_stats}")
print(f"Duplicate columns in transfers: {duplicate_cols_transfers}")

# Handle duplicate columns by renaming them in the secondary datasets
players_stats_renamed = players_stats
transfers_renamed = transfers

# Rename duplicate columns in players_stats to avoid conflicts
for dup_col in duplicate_cols_stats:
    players_stats_renamed = players_stats_renamed.withColumnRenamed(dup_col, f"stats_{dup_col}")

# Rename duplicate columns in transfers to avoid conflicts
for dup_col in duplicate_cols_transfers:
    transfers_renamed = transfers_renamed.withColumnRenamed(dup_col, f"transfers_{dup_col}")

# Perform joins with renamed columns
combined_data = player_valuations.join(
    players_stats_renamed,
    player_valuations["player_id"] == players_stats_renamed["player_id"],
    "inner"
).join(
    transfers_renamed,
    player_valuations["player_id"] == transfers_renamed["player_id"],
    "inner"
)

# Drop duplicate player_id columns
combined_data = combined_data.drop(players_stats_renamed["player_id"], transfers_renamed["player_id"])

# Show a preview of the combined dataset
print("Combined Dataset Preview:")
combined_data.show(5)

# Count the number of players in each dataset and in the combined dataset
print(f"Number of players in valuations dataset: {player_valuations.count()}")
print(f"Number of players in stats dataset: {players_stats.count()}")
print(f"Number of players in transfers dataset: {transfers.count()}")
print(f"Number of players in combined dataset: {combined_data.count()}")

# Use this instead:
combined_data.coalesce(1).write.option("header", True).mode("overwrite").csv("data/processed/combined_players")

# Or even better, to get a specific filename:
combined_data.toPandas().to_csv("data/processed/combined_kaggle.csv", index=False)

# Stop the Spark session
spark.stop()

SyntaxError: unexpected character after line continuation character (3964876134.py, line 23)

In [8]:
# Revised code with memory optimizations
from pyspark.sql import Window
from pyspark.sql.functions import (
    collect_list, col, concat_ws, when, lit, first,
    isnan, isnull, row_number
)
from pyspark.sql.types import NumericType

# 1. Tune Spark configuration first (add these at Spark session initialization)
# spark = SparkSession.builder \
#     .config("spark.driver.memory", "8g") \
#     .config("spark.executor.memory", "4g") \
#     .getOrCreate()

# 2. Clean columns with memory-efficient operations
def clean_field(df, column_name):
    if column_name not in df.columns:
        return df
    
    data_type = df.schema[column_name].dataType
    is_numeric = isinstance(data_type, NumericType)
    
    condition = col(column_name).isNull() | (col(column_name) == "")
    if is_numeric:
        condition = condition | isnan(col(column_name))
    
    return df.withColumn(
        column_name,
        when(condition, lit("N/A")).otherwise(col(column_name))
    )

print("Cleaning data...")
for column in combined_data.columns:
    combined_data = clean_field(combined_data, column)

# 3. Optimize grouping and joining operations
compressed_data_temp = combined_data.select("player_id").distinct()

# Use window functions instead of multiple joins for static columns
window_spec = Window.partitionBy("player_id").orderBy(col("date").desc())

static_columns_aggregated = combined_data.withColumn(
    "row_num", row_number().over(window_spec)
).filter("row_num = 1").drop("row_num")

compressed_data_temp = compressed_data_temp.join(
    static_columns_aggregated.select(
        "player_id", *static_columns
    ),
    on="player_id",
    how="left"
)

# 4. Streamline history collection with limit
MAX_HISTORY_ENTRIES = 5  # Adjust based on memory constraints
for column in change_tracking_columns:
    if column in combined_data.columns:
        history_window = Window.partitionBy("player_id").orderBy(col("date").desc())
        limited_history = combined_data.withColumn(
            "rank", row_number().over(history_window)
        ).filter(f"rank <= {MAX_HISTORY_ENTRIES}").drop("rank")
        
        value_history = limited_history.groupBy("player_id").agg(
            concat_ws(", ", collect_list(col(column))).alias(f"{column}_history")
        )
        
        compressed_data_temp = compressed_data_temp.join(
            value_history,
            on="player_id",
            how="left"
        )

# 5. Optimized data quality check
print("\nRunning efficient data quality checks...")
compressed_data_temp.cache()  # Cache for multiple actions
total_count = compressed_data_temp.count()

# Batch null checks using single computation
null_counts = {}
for column in compressed_data_temp.columns:
    null_counts[column] = compressed_data_temp.filter(
        col(column).isNull() | (col(column) == "")
    ).count()

print("\nData Quality Report:")
for col_name, count in null_counts.items():
    if count > 0:
        print(f"Column '{col_name}' has {count} null/empty values ({count/total_count*100:.2f}%)")

# 6. Safe write operation
print("Saving data...")
try:
    # Write with multiple partitions
    (compressed_data_temp
     .repartition(10)  # Adjust based on dataset size
     .write
     .option("header", True)
     .mode("overwrite")
     .csv("data/processed/compressed_players")
    )
    print("Successfully wrote partitioned data")
except Exception as e:
    print(f"Write failed: {str(e)}")
    print("Consider using Parquet format for better compression")

compressed_data_temp.unpersist()
print("Operation complete")

Cleaning data...


AssertionError: Undefined error message parameter for error class: CANNOT_PARSE_DATATYPE. Parameters: {'error': '[WinError 10061] No connection could be made because the target machine actively refused it'}