# WSA Pandas Demo

### Imports
* The `pandas` library should already be installed as part of the Anaconda distribution
* Typically abbreviated as `pd`

In [None]:
import pandas as pd

### Reading in a DataFrame
* We will start by creating a dataframe called `df` containing the Michigan Football GameLog data we have stored in our SQL database.
* This data will be exported and read into Jupyter notebook as a `.csv` file.
* Make sure your `.csv` file is stored in the same folder as this notebook.

In [None]:
# .read_csv()
# drop unneeded id column with .drop()

In [None]:
# .head() shows first 5 rows


In [None]:
# .tail() shows last 5 rows


In [None]:
# .head(2) shows first 2 rows

### Data Preprocessing

In [None]:
# .info() provides essential details for your dataset

In [None]:
# .shape shows dimensions of df (num_rows, num_cols)

#### Null and Duplicate Values

In [None]:
# We can append dataframes to each other using pd.concat()
# It is good practice to never change or update the original df directly
# Always create copies!

In [None]:
# We can also remove duplicate values with .drop_duplicates()

In [None]:
# .isnull() checks for NULL or empty datapoints 
# returns True for any values that are NULL


In [None]:
# Chaining .sum() onto the .isnull() result above counts the null values in each column


In [None]:
# If we did have null values, we could use the .dropna() function to remove all rows with null values


#### Modifying DataFrame Columns

In [None]:
# check column information with .columns


In [None]:
# .rename() takes a {current_name: new_name} mapping for the columns we want to relabel
# inplace=True applies the new names to df itself instead of returning a renamed copy
column_renames = {
    'home_away': 'location',
    'points_scored': 'pts_scored',
    'points_against': 'pts_against',
    'pass_td': 'pass_tds',
    'pass_1st_down': 'pass_first_downs',
    'rush_td': 'rush_tds',
    'rush_1st_down': 'rush_first_downs',
}
df.rename(columns=column_renames, inplace=True)


In [None]:
# Even though the values of year are numeric, we treat year as a categorical variable
# .astype('object') converts the column to the object dtype
# Bracket notation is used for the assignment: attribute-style access (df.year) stops working
# for column names that contain spaces or clash with existing DataFrame attributes/methods
df['year'] = df['year'].astype('object')
# .info() lets us confirm the new dtype of the year column
df.info()

### Indexing into our DataFrame

To access a column, we can use bracket notation.

In [None]:
# .describe() gives summary statistics (count, mean, std, min, quartiles, max) for each numeric column
# .round() formats the output


In [None]:
# Bracket notation selects the 'pts_scored' column; .describe() then summarizes just that column
df['pts_scored'].describe()

In [None]:
# Single brackets with one column name return a Series
df['opponent']

In [None]:
# Double brackets (a list of column names) return a DataFrame, even when the list has one column
df[['opponent']]

In [None]:
# A list with several column names returns a DataFrame containing just those columns
df[['opponent', 'result']]

In [None]:
# .value_counts() lets us count the number of times a specific value is in the column
# using .head(n) to show the n most frequent opponents

In [None]:
# .corr(numeric_only=True) shows us the pairwise correlation between all numeric columns
# Values closer to 1 or -1 indicate a stronger (positive or negative) correlation; values near 0 indicate little linear relationship


#### Accessing Rows of a DataFrame

There are two ways to access a row using indices:
* `loc[]` uses label-based indexing (the values of the index)
* `iloc[]` uses integer position indexing (0-based)

In [None]:
# .set_index() turns an existing column into the row index
# The result is stored in alt_df, so df keeps its original index
alt_df = df.set_index(keys='date')
alt_df.head()

In [None]:
# .loc[] looks up rows by index label; alt_df is indexed by 'date', so the label here is a date
# A label that matches exactly one row comes back as a Series
# NOTE(review): assumes the 'date' values are strings formatted like '2011-09-10' and are unique -- confirm
row1_loc = alt_df.loc['2011-09-10']
row1_loc

In [None]:
# .iloc[] looks up rows by integer position (0-based), so position 1 is the second row
row1_iloc = alt_df.iloc[1]
row1_iloc

In [None]:
# To access multiple rows, we can slice the df
# The end of an .iloc slice is exclusive, so [:51] keeps positions 0 through 50 (the first 51 rows)
hoke_era = df.iloc[:51]
hoke_era

To access a specific value within a df, we access by row, and then by column.

In [None]:
# .iloc[164] picks the row at integer position 164 (the 165th row);
# ['rush_yrds'] then picks that row's value for the 'rush_yrds' column
# NOTE(review): the variable name suggests position 164 is the national championship game -- confirm against the data
rush_yrds_natty = df.iloc[164]['rush_yrds']
rush_yrds_natty

In [None]:
# Look up a single value in corr_df: the 'rush_yrds' column first, then the 'total_offense' label within it
# NOTE(review): corr_df is not assigned in any visible cell -- presumably it should hold the
# result of the .corr(numeric_only=True) cell earlier in the notebook; confirm before running
rush_off_corr = corr_df['rush_yrds']['total_offense']
rush_off_corr

### Analyzing Data With `groupby()`

In [None]:
# We can use the .groupby() function to form groups within our df based on a certain column


In [None]:
# We can access each of the groups using the .get_group() function


In [None]:
# .groupby() can be paired with aggregation functions to get key calculations per group
# Ex: mean of every numeric column for each W-L result, rounded to 4 decimal places
# sort=False is passed so the groups are not sorted by their key
results = df.groupby(by='result', sort=False)
results.mean(numeric_only=True).round(decimals=4)

In [None]:
# Ex: .mean() with single column
# Takes the numeric-only mean of `years`, then keeps only the 'rush_tds' entry
# NOTE(review): `years` is not assigned in any visible cell -- presumably it is the
# df.groupby(...) object from the .groupby() cell above; confirm before running
years.mean(numeric_only=True)['rush_tds']

In [None]:
# .sum()


In [None]:
# .max()


In [None]:
# .min()


In [None]:
# .median()
