In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import sqlite3

In [None]:
# Load the offense and defense datasets from their csv files
OFFENSE_CSV = "resources/offense.csv"
DEFENSE_CSV = "resources/defense.csv"

offense_df = pd.read_csv(OFFENSE_CSV)
defense_df = pd.read_csv(DEFENSE_CSV)

In [None]:
# Show the leading five rows of the offense dataset
offense_df.head(n=5)

In [None]:
# Show the leading five rows of the defense dataset
defense_df.head(n=5)

In [None]:
# Remove rows in which every column is NaN; rows with at least one value are kept
offense_df = offense_df.dropna(axis='index', how='all')
defense_df = defense_df.dropna(axis='index', how='all')

In [None]:
# Print the column names, non-null counts, and dtypes of the offense dataset
# (the defense dataset is expected to have the same layout, so only one is inspected)
offense_df.info()

In [None]:
# Parse the 'score' column, laid out as "<outcome> (<points_for>-<points_against>)",
# into three new columns: 'outcome', 'points_for', and 'points_against'.
# The extracted values are still strings at this point.
dataframes = [offense_df, defense_df]

# A single pattern captures all three pieces and tolerates stray whitespace.
# Rows whose score does not match (including missing scores) get NaN in every
# new column instead of raising because the split produced an unexpected
# number of pieces.
score_pattern = (
    r'^\s*(?P<outcome>[^\s(]+)\s*'
    r'\(\s*(?P<points_for>\d+)\s*-\s*(?P<points_against>\d+)\s*\)'
)

for dataframe in dataframes:

    parsed = dataframe['score'].str.extract(score_pattern)
    dataframe[['outcome', 'points_for', 'points_against']] = parsed[
        ['outcome', 'points_for', 'points_against']
    ]

# Preview the result rather than rendering the entire frame
defense_df.head()

In [None]:
# Normalise the home/away indicator: missing values become 'home' and
# the '@' marker becomes 'away'
dataframes = [offense_df, defense_df]

for frame in dataframes:

    frame['home_away'] = frame['home_away'].fillna('home').str.replace('@', 'away')

In [None]:
# Investigate the dtypes (offense and defense are expected to be identical)
offense_df.dtypes

In [None]:
# Column groups for dtype conversion: whole-number columns and decimal columns
columns_to_int = [
    'passing_cmp', 'passing_att', 'passing_yds', 'passing_td',
    'rushing_att', 'rushing_yds', 'rushing_td',
    'total_plays', 'total_yds',
    'first_down_pass', 'first_down_rush', 'first_down_pen', 'first_down_total',
    'penalties', 'penalty_yds',
    'fumbles', 'intceptions', 'turnovers',
    'points_for', 'points_against',
]

columns_to_float = ['passing_pct', 'rushing_avg', 'total_avg']

dataframes = [offense_df, defense_df]

for frame in dataframes:

    # Parse both column groups as numbers; anything unparseable becomes NaN
    frame[columns_to_int] = frame[columns_to_int].apply(pd.to_numeric, errors='coerce')
    frame[columns_to_float] = frame[columns_to_float].apply(pd.to_numeric, errors='coerce')

    # Cast the whole-number columns to the nullable Int64 dtype (via float)
    # so that missing values are preserved
    frame[columns_to_int] = frame[columns_to_int].astype(float).astype('Int64')

    # Convert the date column to datetime
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Review the offense dataset's dtypes and non-null counts after the conversions
# to confirm it is ready for loading
offense_df.info()

In [None]:
# Review the defense dataset's dtypes and non-null counts after the conversions
# to confirm it is ready for loading
defense_df.info()

In [None]:
# Discard every row that still has a missing value in any column
offense_df = offense_df.dropna(axis='index', how='any')
defense_df = defense_df.dropna(axis='index', how='any')

In [None]:
# Final confirmation of the data cleaning operations on the offense dataset:
# after the dropna above, every column's non-null count should equal the row count
offense_df.info()

In [None]:
# Final confirmation of the data cleaning operations on the defense dataset:
# after the dropna above, every column's non-null count should equal the row count
defense_df.info()

In [None]:
# User input: which school do these datasets describe?
# Surrounding whitespace is stripped so it does not end up in file or table names.
school = input('What School is this analyzing? ').strip()
if not school:
    raise ValueError('A school name is required to name the output files and tables.')

# Output the cleaned datasets to .csv files
offense_df.to_csv(f'output/{school}_offense.csv', index=False)
defense_df.to_csv(f'output/{school}_defense.csv', index=False)

# Table names used for the SQLite export
off_table = f'{school}_offense'
def_table = f'{school}_defense'

# Connect to the SQLite database. The try/finally guarantees the connection
# is closed even if one of the writes below raises.
conn = sqlite3.connect('output/cfb.db')
try:
    # Write each DataFrame to its own table, replacing a table of the same
    # name if one already exists
    offense_df.to_sql(off_table, conn, if_exists='replace', index=False)
    defense_df.to_sql(def_table, conn, if_exists='replace', index=False)

    # Commit changes
    conn.commit()
finally:
    conn.close()