# Explore Baseball Pitching
Explores the pitching dataset.  The notebook is structured to mimic a homework assignment so that students get comfortable with this pattern.  A more typical "Explore" script will have many of the same operations, but it won't create and update an answer_df to track the results.  That said, it's not a bad way to track your questions about the dataset and have the answers in a downloadable format.

In [0]:
# Generate data frame for the questions and answers.  
# Since this pattern is typically used for assignments, there is a Points column for each answer.
from pyspark.sql import Row
from pyspark.sql.functions import lit, when, sum

# Each entry is (question number, question text, placeholder answer).
# Count-style questions start at "0" and free-text questions at "Text".
question_specs = [
    (1, "Rows in your ingested dataset: ", "0"),
    (2, "Columns in your ingested dataset: ", "0"),
    (3, "Years in your dataset: ", "0"),
    (4, "Distinct teams in your dataset: ", "0"),
    (5, "ERA normal distribution notes: ", "Text"),
    (6, "Age normal distribution notes: ", "Text"),
    (7, "Count of players that made this list for all years: ", "0"),
]

# Build one Row per question; every question starts with 0.0 points.
rows = [
    Row(Number=number, Question=question, Answer=placeholder, Points=0.0)
    for number, question, placeholder in question_specs
]

# Turn the list of rows into a Spark data frame and show it.
answer_df = spark.createDataFrame(rows)

display(answer_df)

In [0]:
# Load the pitching table into a data frame and preview it.
pitching_table_name = "bronze_examples.pitching"
pitching_df = spark.read.table(pitching_table_name)

display(pitching_df)

In [0]:
# Question 1 - How many rows are in your ingested dataset?
# Your code
your_answer = pitching_df.count()

# Record the result: replace Answer only on the row whose Number is 1 and
# keep the existing Answer on every other row.  Answer is a string column,
# so the count is stored as text.
is_question_1 = answer_df["Number"] == 1
answer_for_q1 = when(is_question_1, lit(str(your_answer))).otherwise(answer_df["Answer"])
answer_df = answer_df.withColumn("Answer", answer_for_q1)
display(answer_df)

In [0]:
# Question 2 - How many columns are in your ingested dataset?
# Your code
column_names = pitching_df.columns
your_answer = len(column_names)

# Record the result: replace Answer only on the row whose Number is 2 and
# keep the existing Answer on every other row.
is_question_2 = answer_df["Number"] == 2
answer_for_q2 = when(is_question_2, lit(str(your_answer))).otherwise(answer_df["Answer"])
answer_df = answer_df.withColumn("Answer", answer_for_q2)
display(answer_df)

In [0]:
# Question 3 - How many years are in your ingested dataset?
# Your code
# NOTE(review): distinct() keeps a null Season as its own row, so a null
# would be counted as a year - confirm the column has no nulls.
distinct_seasons_df = pitching_df.select("Season").distinct()
season_count = distinct_seasons_df.count()  # reused by the Question 7 cell

# Record the result: replace Answer only on the row whose Number is 3 and
# keep the existing Answer on every other row.
is_question_3 = answer_df["Number"] == 3
answer_for_q3 = when(is_question_3, lit(str(season_count))).otherwise(answer_df["Answer"])
answer_df = answer_df.withColumn("Answer", answer_for_q3)
display(answer_df)

In [0]:
# Question 4 - How many distinct teams are in your dataset?
# Your code
# NOTE(review): every distinct Team value is counted, including a null or
# any placeholder value - confirm the column holds only real team codes.
distinct_teams_df = pitching_df.select("Team").distinct()
your_answer = distinct_teams_df.count()

# Record the result: replace Answer only on the row whose Number is 4 and
# keep the existing Answer on every other row.
is_question_4 = answer_df["Number"] == 4
answer_for_q4 = when(is_question_4, lit(str(your_answer))).otherwise(answer_df["Answer"])
answer_df = answer_df.withColumn("Answer", answer_for_q4)
display(answer_df)

In [0]:
# Question 5 - Is ERA normally distributed?
# Your code

# Normal distribution can be evaluated in a number of ways.  One is to plot the histogram of the data and see if it looks like a bell curve. 
# The call below draws a 20-bin histogram of the ERA column; it is the only
# statement in the cell, so the notebook renders the resulting figure.
pitching_df.plot.hist(column="ERA", bins=20)

# There are also statistical tests and measures that you can use.


In [0]:
# Question 5 - Is ERA normally distributed?
# Your code
# Written conclusion drawn from the ERA histogram.
your_answer = "While their are a few outliers on the right side, the distribution is close to normal with the most common ERAs between 3.36 and 3.79."

# Record the result: replace Answer only on the row whose Number is 5 and
# keep the existing Answer on every other row.
is_question_5 = answer_df["Number"] == 5
answer_for_q5 = when(is_question_5, lit(your_answer)).otherwise(answer_df["Answer"])
answer_df = answer_df.withColumn("Answer", answer_for_q5)
display(answer_df)

In [0]:
# Question 6 - Is Age normally distributed?
# Your code
# We'll plot a histogram here too: a 20-bin histogram of the Age column,
# judged by eye against a bell curve.
pitching_df.plot.hist(column="Age", bins=20)

In [0]:
# Question 6 - Is Age normally distributed?
# Your code
# Written conclusion drawn from the Age histogram.
your_answer = "Age does not look normally distributed.  The histogram does not show a bell curve."

# Record the result: replace Answer only on the row whose Number is 6 and
# keep the existing Answer on every other row.  The answer is already a
# string, so it is passed to lit() as-is.
is_question_6 = answer_df["Number"] == 6
answer_for_q6 = when(is_question_6, lit(your_answer)).otherwise(answer_df["Answer"])
answer_df = answer_df.withColumn("Answer", answer_for_q6)
display(answer_df)

In [0]:
# Question 7 - how many players are on this list for all years?
# Your code
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Count the DISTINCT seasons each player appears in.  A plain row count would
# be thrown off by a player having more than one row in a season (it could
# equal season_count by accident, or exceed it).  The column keeps the name
# 'count' so the resulting frame has the same columns as before.
# NOTE(review): players are identified by Name only, so two different players
# sharing a name would be merged - switch to a player id column if one exists.
player_repeats_df = pitching_df.groupBy(col('Name')).agg(
    F.countDistinct(col('Season')).alias('count')
)
display(player_repeats_df)

# Flag players whose distinct-season count equals the number of seasons in the
# dataset.  season_count is computed in the Question 3 cell, which must have
# been run first.
player_repeats_df = player_repeats_df.withColumn(
    'AllYears', F.when(col('count') == season_count, 1).otherwise(0)
)

# F.sum is used explicitly: the bare name `sum` in this notebook is the Spark
# column function imported at the top, not Python's builtin.
all_years_total = (
    player_repeats_df
    .agg(F.sum(col('AllYears')).alias('AllYearsCount'))
    .collect()[0]['AllYearsCount']
)

# Summing over zero rows yields null; report 0 rather than the text "None".
your_answer = all_years_total if all_years_total is not None else 0

# Add answer to the answer data frame: replace Answer only on the row whose
# Number is 7 and keep the existing Answer on every other row.
answer_df = answer_df.withColumn(
    "Answer",
    F.when(answer_df.Number == 7, F.lit(str(your_answer))).otherwise(answer_df.Answer),
)
display(answer_df)