### Spark 객체 생성

In [30]:
from pyspark.sql import SparkSession

In [31]:
# Memory limit applied to both the executor and the driver, set to guard
# against OutOfMemory errors.
MAX_MEMORY = "5g"
spark = (
    SparkSession.builder
    .appName("soccer_price_prediction")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

### 데이터 파싱

In [32]:
# Local filesystem path of the player dataset CSV.
TRIP_FILES = "/Users/dongwoo/ml-soccer-player-prediction/data/reg_data.csv"
# TRIP_FILES is already absolute (leading "/"), so only "file://" is prefixed:
# this yields the canonical "file:///Users/..." URI instead of one with four
# slashes that only works because the path gets normalized downstream.
# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer each column's type from the data.
player_df = spark.read.csv(f"file://{TRIP_FILES}", inferSchema=True, header=True)
# Print the inferred schema so the column names and types can be checked.
player_df.printSchema()

root
 |-- preferred_foot: string (nullable = true)
 |-- work_rate_Attaking: integer (nullable = true)
 |-- work_rate_defensive: integer (nullable = true)
 |-- body_type: string (nullable = true)
 |-- international_reputation: integer (nullable = true)
 |-- position: string (nullable = true)
 |-- pace: double (nullable = true)
 |-- shooting: double (nullable = true)
 |-- passing: double (nullable = true)
 |-- dribbling: double (nullable = true)
 |-- defending: double (nullable = true)
 |-- physic: double (nullable = true)
 |-- attacking_crossing: integer (nullable = true)
 |-- attacking_finishing: integer (nullable = true)
 |-- attacking_heading_accuracy: integer (nullable = true)
 |-- attacking_short_passing: integer (nullable = true)
 |-- attacking_volleys: integer (nullable = true)
 |-- skill_dribbling: integer (nullable = true)
 |-- skill_curve: integer (nullable = true)
 |-- skill_fk_accuracy: integer (nullable = true)
 |-- skill_long_passing: integer (nullable = true)
 |-- skill_b

In [33]:
# Register the DataFrame as a temporary view named "players" so it can be
# referenced from Spark SQL statements.
player_df.createOrReplaceTempView("players")

In [39]:
# Spark SQL statement that selects every column from the "players" view,
# with no filtering or projection.
query = """
SELECT 
    *
FROM
    players
"""

In [40]:
# Run the SQL statement held in `query` and keep the resulting DataFrame.
data_df = spark.sql(query)

In [41]:
# Echo the DataFrame as the cell output (shows its column names and types).
data_df

DataFrame[preferred_foot: string, work_rate_Attaking: int, work_rate_defensive: int, body_type: string, international_reputation: int, position: string, pace: double, shooting: double, passing: double, dribbling: double, defending: double, physic: double, attacking_crossing: int, attacking_finishing: int, attacking_heading_accuracy: int, attacking_short_passing: int, attacking_volleys: int, skill_dribbling: int, skill_curve: int, skill_fk_accuracy: int, skill_long_passing: int, skill_ball_control: int, movement_acceleration: int, movement_sprint_speed: int, movement_agility: int, movement_reactions: int, movement_balance: int, power_shot_power: int, power_jumping: int, power_stamina: int, power_strength: int, power_long_shots: int, mentality_aggression: int, mentality_interceptions: int, mentality_positioning: int, mentality_vision: int, mentality_penalties: int, mentality_composure: int, defending_standing_tackle: int, defending_sliding_tackle: int, goalkeeping_diving: int, goalkeepin

### 데이터 분리

In [42]:
# Randomly split the data into train and test sets with weights 0.8 / 0.2;
# the seed is fixed at 1.
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=1)

In [43]:
# Base data directory for this project. Note that the value already ends with
# a trailing "/".
DATA_DIR = "/Users/dongwoo/ml-soccer-player-prediction/data/"

In [44]:
# Persist both splits as Parquet, replacing any previous output at the target.
# DATA_DIR may end with "/", so strip it before joining; otherwise the target
# becomes ".../data//train/" with a doubled separator.
output_root = DATA_DIR.rstrip("/")
train_df.write.format("parquet").mode("overwrite").save(f"{output_root}/train/")
test_df.write.format("parquet").mode("overwrite").save(f"{output_root}/test/")

                                                                                