# Module 03: EDA

In [None]:
# packages
import numpy as np 
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from sklearn.model_selection import train_test_split 
from ISLP import load_data

# Random seed, passed as `random_state` to train_test_split so the train/test partition is reproducible
seed = 2323

### We'll use the _Hitters_ data from ISLP for this activity. The metadata for _Hitters_ can be found [here](https://intro-stat-learning.github.io/ISLP/datasets/Hitters.html).

In [None]:
# Load the Hitters dataset with ISLP's load_data helper
Hitters = load_data('Hitters')

### Determine the number of rows and columns in the dataset by returning its "shape" attribute

In [None]:
# The "shape" attribute gives the dataset's dimensions as (number of rows, number of columns)
Hitters.shape

### Determine whether each feature is numeric or categorical by returning the "dtype" attribute for each column

In [None]:
# Print each column name next to its dtype so numeric and categorical features can be told apart
for name, dtype in Hitters.dtypes.items():
    print(name, dtype)

### Before doing any other analyses, let's create training and test sets.

In [None]:
# Hold out 40% of the rows as the test set. Rows are shuffled before splitting,
# and the fixed seed keeps the partition identical from run to run.
Train, Test = train_test_split(
    Hitters,
    test_size=0.40,
    shuffle=True,
    random_state=seed,
)

### Based on the metadata, what is the difference between the 6 columns starting with 'C' and the 6 related columns that don't?

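The columns that start with 'C' (CAtBat, CHits, CHmRun, CRuns, CRBI, CWalks) are totals over the player's entire career, while the related columns without the 'C' (AtBat, Hits, HmRun, Runs, RBI, Walks) are totals for the 1986 season only.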

### On the training set, create pairwise scatterplots for each of these 6 columns with the 'Salary' variable.

In [None]:
# The six 'C'-prefixed columns to plot against Salary (any that are absent from Train are skipped)
c_cols = [
    name
    for name in ('CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks')
    if name in Train.columns
]

# One scatter panel per column on a 2x3 grid; flatten the grid so panels can be indexed 0..5
fig, axes = subplots(2, 3, figsize=(12, 7))
axes = axes.ravel()

for idx, col in enumerate(c_cols):
    panel = axes[idx]
    panel.scatter(Train[col], Train['Salary'], alpha=0.6)
    panel.set_xlabel(col)
    panel.set_ylabel('Salary')

plt.tight_layout()
plt.show()

### Use the "describe" method to determine the mean, standard deviation, and 5 number summary of all numeric variables in the training subset of _Hitters_.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training set's numeric columns
Train.describe()

### It looks like the mean and median of 'AtBat' are nearly equal. This _might_ suggest that this variable is normally distributed. Create a histogram of 'AtBat' to check this hypothesis.

In [None]:
# Histogram of AtBat on the training set (20 bins) to eyeball whether it looks normal
hist_fig, hist_ax = subplots()
hist_ax.hist(Train['AtBat'].dropna(), bins=20)
hist_ax.set_title('Histogram of AtBat (Training Set)')
hist_ax.set_xlabel('AtBat')
hist_ax.set_ylabel('Frequency')
plt.show()

### Let's standardize the AtBat feature (i.e., normalize by z-scores). We'll create a new column in the training data called 'AtBat_st' to represent this.

In [None]:
# z-score AtBat using the training-set mean and the population standard deviation (ddof=0)
atbat = Train['AtBat']
mu, sigma = atbat.mean(), atbat.std(ddof=0)
Train['AtBat_st'] = atbat.sub(mu).div(sigma)

# Show the raw and standardized values side by side
Train[['AtBat', 'AtBat_st']].head()

### How many rows have an 'AtBat' value within the first standard deviation?

Hint: the built-in `len` function returns the number of rows of a DataFrame.

In [None]:
# Rows whose standardized AtBat lies in [-1, 1], i.e. within one standard deviation of the mean
len(Train[Train['AtBat_st'].between(-1, 1)])

### Going back to the results of the 'describe' method, how can you tell that the 'Salary' variable has missing values?

The `describe` output shows a lower count for Salary than for the other columns, which means some Salary values are missing.

### Describe a situation where a variable could have missing values but this would not be reflected in the results of the 'describe' method.

If missing values are encoded with placeholder numbers (for example -1 or 999) instead of NaN, `describe` counts them as ordinary values and will not show them as missing.

### On the training data, create separate boxplots of the 'AtBat' variable for when 'Salary' is populated or missing.

In [None]:
# Compare the AtBat distribution for players whose Salary is populated vs. missing
mask_missing = Train['Salary'].isna()
atbat_groups = [
    Train.loc[~mask_missing, 'AtBat'].dropna(),
    Train.loc[mask_missing, 'AtBat'].dropna(),
]
group_names = ['Salary present', 'Salary missing']

# Name the boxes through the x-ticks instead of boxplot's `labels` keyword:
# Matplotlib 3.9 deprecated `labels` in favour of `tick_labels`, while older
# releases only know `labels`. Boxes are drawn at positions 1..n by default,
# so setting the ticks explicitly works on both.
plt.boxplot(atbat_groups)
plt.xticks(range(1, len(group_names) + 1), group_names)
plt.title('AtBat by Salary Missingness (Training Set)')
plt.ylabel('AtBat')
plt.show()

### Create a correlation matrix for all numeric features in the training set

In [None]:
# Pairwise correlations between the numeric columns of the training set
Train.select_dtypes(include='number').corr()

### Propose two different ways of imputing the missing values of Salary while taking advantage of the information given in the boxplots or the correlation matrix.

Impute the missing salaries with the median salary, as informed by the boxplots, or use regression imputation based on the variables that are highly correlated with Salary in the correlation matrix.

### For our last exercise, we'll explore Hits and Walks relative to AtBat totals. 
- Use the sum function to calculate the totals of each of these three variables for the 1986 season (on the training set). 
- Create a pie chart which shows total hits, total walks, and remaining total (neither) as percents of the At Bats total (on the training set). 

In [None]:
# Totals across all players in the training set
TotHits, TotWalks, TotAtBat = (Train[name].sum() for name in ('Hits', 'Walks', 'AtBat'))

# Pie-chart inputs: whatever remains of the at-bat total after hits and walks is "Neither"
Labels = ['Hits', 'Walks', 'Neither']
TotNeither = TotAtBat - TotHits - TotWalks
Totals = [TotHits, TotWalks, TotNeither]

In [None]:
# Pie chart of the three totals, each wedge annotated with its percentage
pie_fig, pie_ax = subplots()
pie_ax.pie(Totals, labels=Labels, autopct='%1.1f%%')
pie_ax.set_title('Totals: Hits vs Walks vs Neither (Training Set)')
pie_ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### The previous two cells gave us totals across all players. For each player in the training set, calculate the Hits as a percent of AtBat and store it in a new variable called 'AVG'

In [None]:
# Per-player hits as a fraction of at-bats, stored on the training frame as 'AVG'
Train['AVG'] = Train['Hits'].div(Train['AtBat'])
Train['AVG'].head()

### Using 0.25 and 0.31 as the split points, create a new variable with three bins: high, medium, and low. 

In [None]:
# Three bins split at 0.25 and 0.31: below 0.25 is 'low', 0.31 and above is 'high',
# and every other row falls through to the 'medium' default.
avg = Train['AVG']
Train['AVG_bin'] = np.select(
    [avg < 0.25, avg >= 0.31],
    ['low', 'high'],
    default='medium',
)
Train[['AVG', 'AVG_bin']].head()

### Create a bar chart that displays the number of players in each of the low, medium, and high categories (for the training data).

In [None]:
# Number of training-set players in each AVG bin
Train['AVG_bin'].value_counts()

Notice that the order of the bars will be medium, low, high. That's counterintuitive. We can reorder these quickly. 

In [None]:
# Desired left-to-right order of the bars
indexMap = ['low', 'medium', 'high']

# Count the bins once, then put the counts in display order. reindex fills in 0
# for a bin that has no players rather than raising KeyError on the lookup.
reordered_list = (
    Train['AVG_bin']
    .value_counts()
    .reindex(indexMap, fill_value=0)
    .tolist()
)

In [None]:
# Bar chart of player counts per AVG bin, drawn in low -> medium -> high order
positions = range(len(indexMap))

bar_fig, bar_ax = subplots()
bar_ax.bar(positions, reordered_list)
bar_ax.set_title('1986 AVG (Training Set)')
bar_ax.set_ylabel('Number of Players')

# Replace the numeric bar positions with the bin names
bar_ax.set_xticks(positions)
bar_ax.set_xticklabels(indexMap)

plt.show()

### Did we use the depth method or width method for creating these bins? Explain.

Used the equal-depth method because each bin has about the same number of observations, instead of each bin just covering the same numeric range.