In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import sqlite3

In [2]:
# Load the raw offense and defense statistics from their CSV files
offense_df, defense_df = (
    pd.read_csv("../resources/all_offense.csv"),
    pd.read_csv("../resources/all_defense.csv"),
)

In [3]:
# Preview the first five rows of the offense dataset
# (head() with no argument returns the first five rows)
offense_df.head()

Unnamed: 0,year,school,games,points,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,...,total_avg,first_down_pass,first_down_rush,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers
0,2000,Boise State,11.0,44.9,20.5,33.8,60.5,321.5,3.4,40.0,...,6.7,13.5,8.9,1.5,23.9,6.2,63.5,0.6,0.7,1.4
1,2000,Miami (FL),11.0,42.6,18.0,30.9,58.2,266.0,2.4,39.5,...,6.5,12.1,7.7,1.4,21.2,8.8,86.5,1.5,0.5,1.9
2,2000,Florida State,12.0,42.4,24.2,39.1,61.8,384.0,3.0,37.9,...,7.1,14.5,8.0,2.8,25.3,10.2,93.9,0.7,1.2,1.8
3,2000,Nebraska,11.0,41.5,7.6,15.6,48.8,110.6,1.0,57.8,...,6.3,5.2,16.8,0.9,22.9,6.5,56.5,0.8,0.7,1.5
4,2000,Virginia Tech,11.0,40.3,10.5,20.5,51.3,155.9,0.8,51.8,...,5.9,6.5,14.7,1.2,22.5,6.4,55.7,1.5,0.6,2.1


In [4]:
# Preview the first five rows of the defense dataset
# (head() with no argument returns the first five rows)
defense_df.head()

Unnamed: 0,year,school,games,points,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,...,total_avg,first_down_pass,first_down_rush,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers
0,2000,TCU,11.0,9.6,13.0,29.4,44.3,160.6,0.9,35.9,...,3.8,6.6,5.8,2.1,14.5,6.4,46.8,1.1,1.4,2.5
1,2000,Florida State,12.0,10.3,18.3,37.3,49.2,203.1,0.6,32.3,...,4.0,9.0,4.9,2.4,16.3,11.1,89.4,1.1,1.6,2.7
2,2000,Toledo,11.0,11.4,16.4,30.7,53.3,187.5,0.5,33.2,...,4.2,8.5,5.1,0.9,14.5,5.5,43.3,1.5,1.4,2.8
3,2000,Western Michigan,12.0,11.6,14.9,28.8,51.9,178.0,0.8,38.2,...,4.2,7.9,6.5,1.1,15.5,5.7,45.4,0.8,0.8,1.5
4,2000,Miami (FL),11.0,15.5,19.6,38.9,50.5,220.6,1.0,35.7,...,4.5,9.8,6.5,2.2,18.5,7.5,55.4,0.9,2.1,3.0


In [5]:
# Inspect the column dtypes and non-null counts of the offense dataset.
# NOTE(review): the defense dataset is assumed to have the same structure,
# but only offense_df is inspected here -- confirm before relying on it.
offense_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3087 entries, 0 to 3086
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              3087 non-null   int64  
 1   school            2835 non-null   object 
 2   games             2835 non-null   float64
 3   points            2835 non-null   float64
 4   passing_cmp       2835 non-null   float64
 5   passing_att       2835 non-null   float64
 6   passing_pct       2835 non-null   float64
 7   passing_yds       2835 non-null   float64
 8   passing_td        2835 non-null   float64
 9   rushing_att       2835 non-null   float64
 10  rushing_yds       2835 non-null   float64
 11  rushing_avg       2835 non-null   float64
 12  rushing_td        2835 non-null   float64
 13  total_plays       2835 non-null   float64
 14  total_yds         2835 non-null   float64
 15  total_avg         2835 non-null   float64
 16  first_down_pass   2835 non-null   float64


In [6]:
# Remove the table line-break rows (a year with no statistics) from both
# datasets. Note that dropna() discards every row with at least one missing value.
offense_df, defense_df = (
    frame.dropna() for frame in (offense_df, defense_df)
)

In [7]:
# Parse each year as a four-digit year (format='%Y') and store only the
# integer year component back, so the column stays numeric, not datetime
for stats_df in (offense_df, defense_df):
    stats_df['year'] = pd.to_datetime(stats_df['year'], format='%Y').dt.year

In [8]:
# Display the offense dataset after the cleaning steps
offense_df

Unnamed: 0,year,school,games,points,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,...,total_avg,first_down_pass,first_down_rush,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers
0,2000,Boise State,11.0,44.9,20.5,33.8,60.5,321.5,3.4,40.0,...,6.7,13.5,8.9,1.5,23.9,6.2,63.5,0.6,0.7,1.4
1,2000,Miami (FL),11.0,42.6,18.0,30.9,58.2,266.0,2.4,39.5,...,6.5,12.1,7.7,1.4,21.2,8.8,86.5,1.5,0.5,1.9
2,2000,Florida State,12.0,42.4,24.2,39.1,61.8,384.0,3.0,37.9,...,7.1,14.5,8.0,2.8,25.3,10.2,93.9,0.7,1.2,1.8
3,2000,Nebraska,11.0,41.5,7.6,15.6,48.8,110.6,1.0,57.8,...,6.3,5.2,16.8,0.9,22.9,6.5,56.5,0.8,0.7,1.5
4,2000,Virginia Tech,11.0,40.3,10.5,20.5,51.3,155.9,0.8,51.8,...,5.9,6.5,14.7,1.2,22.5,6.4,55.7,1.5,0.6,2.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3082,2022,Colorado,12.0,15.4,14.9,30.0,49.7,172.9,0.8,32.3,...,4.5,7.2,7.2,1.6,15.9,5.3,48.3,1.0,0.8,1.8
3083,2022,Northwestern,12.0,13.8,20.4,34.8,58.8,210.3,0.8,39.1,...,4.5,9.5,7.8,1.7,19.0,4.6,40.8,1.2,1.4,2.6
3084,2022,Colorado State,12.0,13.2,16.9,26.8,63.0,196.8,1.0,32.1,...,4.9,7.9,5.5,1.3,14.7,6.9,60.4,0.8,0.9,1.7
3085,2022,New Mexico,12.0,13.1,11.4,21.3,53.7,104.9,0.3,36.6,...,3.9,4.1,7.7,1.6,13.3,6.1,51.2,0.4,0.8,1.2


In [9]:
# Confirm that the offense dataset is ready for loading: each column's
# non-null count should match the number of entries
offense_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2835 entries, 0 to 3086
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              2835 non-null   int64  
 1   school            2835 non-null   object 
 2   games             2835 non-null   float64
 3   points            2835 non-null   float64
 4   passing_cmp       2835 non-null   float64
 5   passing_att       2835 non-null   float64
 6   passing_pct       2835 non-null   float64
 7   passing_yds       2835 non-null   float64
 8   passing_td        2835 non-null   float64
 9   rushing_att       2835 non-null   float64
 10  rushing_yds       2835 non-null   float64
 11  rushing_avg       2835 non-null   float64
 12  rushing_td        2835 non-null   float64
 13  total_plays       2835 non-null   float64
 14  total_yds         2835 non-null   float64
 15  total_avg         2835 non-null   float64
 16  first_down_pass   2835 non-null   float64


In [10]:
# Confirm that the defense dataset is ready for loading: each column's
# non-null count should match the number of entries
defense_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2835 entries, 0 to 3086
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              2835 non-null   int64  
 1   school            2835 non-null   object 
 2   games             2835 non-null   float64
 3   points            2835 non-null   float64
 4   passing_cmp       2835 non-null   float64
 5   passing_att       2835 non-null   float64
 6   passing_pct       2835 non-null   float64
 7   passing_yds       2835 non-null   float64
 8   passing_td        2835 non-null   float64
 9   rushing_att       2835 non-null   float64
 10  rushing_yds       2835 non-null   float64
 11  rushing_avg       2835 non-null   float64
 12  rushing_td        2835 non-null   float64
 13  total_plays       2835 non-null   float64
 14  total_yds         2835 non-null   float64
 15  total_avg         2835 non-null   float64
 16  first_down_pass   2835 non-null   float64


In [11]:
# Output the cleaned datasets to a .csv (the DataFrame index is not written)
offense_df.to_csv('../output/all_offense.csv', index=False)
defense_df.to_csv('../output/all_defense.csv', index=False)

# Connect to SQLite database
conn = sqlite3.connect('../output/cfb.db')
try:
    # Write each DataFrame to its own table, replacing the table if it
    # already exists (the DataFrame index is not written)
    offense_df.to_sql('all_offense', conn, if_exists='replace', index=False)
    defense_df.to_sql('all_defense', conn, if_exists='replace', index=False)

    # Commit changes
    conn.commit()
finally:
    # Always close the connection, even if one of the writes above raised,
    # so the database file handle is not leaked in the running kernel
    conn.close()