In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import sqlite3

In [2]:
# Load the raw standings data from the resources folder
standings_df = pd.read_csv(filepath_or_buffer="../resources/standings.csv")

In [3]:
# Preview the first five rows of the standings dataset
standings_df.head(5)

Unnamed: 0,year,school,conference,wins,losses,winning_pct,conf_wins,conf_losses,conf_winning_pct,ppg_offence,ppg_defence,SRS,SOS,ap_pre,ap_high,ap_post,notes
0,2000,Florida State,ACC,11.0,2.0,0.846,8.0,0.0,1.0,39.3,10.5,23.13,5.59,2.0,1.0,5.0,
1,2000,Clemson,ACC,9.0,3.0,0.75,6.0,2.0,0.75,34.7,21.1,11.54,2.04,17.0,5.0,16.0,
2,2000,Georgia Tech,ACC,9.0,3.0,0.75,6.0,2.0,0.75,32.2,19.8,11.3,1.21,,15.0,17.0,
3,2000,Virginia,ACC,6.0,6.0,0.5,5.0,3.0,0.625,20.2,24.3,0.1,3.18,,,,
4,2000,North Carolina State,ACC,8.0,4.0,0.667,4.0,4.0,0.5,31.6,28.2,3.92,-0.33,,21.0,,


In [4]:
# Summarise the dataset: row count, per-column non-null counts and dtypes
standings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3323 entries, 0 to 3322
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              3323 non-null   int64  
 1   school            2837 non-null   object 
 2   conference        2837 non-null   object 
 3   wins              2837 non-null   float64
 4   losses            2837 non-null   float64
 5   winning_pct       2835 non-null   float64
 6   conf_wins         2724 non-null   float64
 7   conf_losses       2724 non-null   float64
 8   conf_winning_pct  2724 non-null   float64
 9   ppg_offence       2835 non-null   float64
 10  ppg_defence       2835 non-null   float64
 11  SRS               2835 non-null   float64
 12  SOS               2835 non-null   float64
 13  ap_pre            575 non-null    float64
 14  ap_high           1050 non-null   float64
 15  ap_post           575 non-null    float64
 16  notes             32 non-null     object 


In [5]:
# List the column names to identify the poll-related columns that will be dropped
standings_df.columns

Index(['year', 'school', 'conference', 'wins', 'losses', 'winning_pct',
       'conf_wins', 'conf_losses', 'conf_winning_pct', 'ppg_offence',
       'ppg_defence', 'SRS', 'SOS', 'ap_pre', 'ap_high', 'ap_post', 'notes'],
      dtype='object')

In [6]:
# Remove the AP poll ranking columns together with the free-text notes column
columns_to_drop = ['ap_pre', 'ap_high', 'ap_post', 'notes']
standings_df = standings_df.drop(columns=columns_to_drop)

In [7]:
# Re-inspect the dataset after the column drop (non-null counts and dtypes)
standings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3323 entries, 0 to 3322
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              3323 non-null   int64  
 1   school            2837 non-null   object 
 2   conference        2837 non-null   object 
 3   wins              2837 non-null   float64
 4   losses            2837 non-null   float64
 5   winning_pct       2835 non-null   float64
 6   conf_wins         2724 non-null   float64
 7   conf_losses       2724 non-null   float64
 8   conf_winning_pct  2724 non-null   float64
 9   ppg_offence       2835 non-null   float64
 10  ppg_defence       2835 non-null   float64
 11  SRS               2835 non-null   float64
 12  SOS               2835 non-null   float64
dtypes: float64(10), int64(1), object(2)
memory usage: 337.6+ KB


In [8]:
# Drop rows that are missing an overall winning percentage
    # Per the earlier investigation of the data, independent schools have no
    # conference schedule, so the conf_* columns cannot be used as the filter;
    # winning_pct is the column that decides whether a row is kept.

column_to_check = 'winning_pct'
standings_df = standings_df.dropna(axis='index', how='any', subset=[column_to_check])

In [10]:
# Normalise 'year' to a plain integer year
    # The values are parsed as four-digit years (format='%Y') and .dt.year then
    # extracts the year number again, so the column ends up holding integers,
    # not datetimes.
    # .assign builds a new frame rather than writing a column into the filtered
    # frame returned by dropna, which avoids pandas' SettingWithCopyWarning.
standings_df = standings_df.assign(
    year=pd.to_datetime(standings_df['year'], format='%Y').dt.year
)

In [11]:
# Final confirmation that the data is ready to be loaded:
# inspect the remaining row count, non-null counts and dtypes
standings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2835 entries, 0 to 3322
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              2835 non-null   int64  
 1   school            2835 non-null   object 
 2   conference        2835 non-null   object 
 3   wins              2835 non-null   float64
 4   losses            2835 non-null   float64
 5   winning_pct       2835 non-null   float64
 6   conf_wins         2724 non-null   float64
 7   conf_losses       2724 non-null   float64
 8   conf_winning_pct  2724 non-null   float64
 9   ppg_offence       2835 non-null   float64
 10  ppg_defence       2835 non-null   float64
 11  SRS               2835 non-null   float64
 12  SOS               2835 non-null   float64
dtypes: float64(10), int64(1), object(2)
memory usage: 310.1+ KB


In [12]:
# Output the cleaned dataset to a .csv (index omitted)
standings_df.to_csv('../output/standings.csv', index=False)

# Connect to SQLite database
conn = sqlite3.connect('../output/cfb.db')
try:
    # Write DataFrame to SQLite database; an existing 'standings' table is replaced
    standings_df.to_sql('standings', conn, if_exists='replace', index=False)

    # Commit changes
    conn.commit()
finally:
    # Always close the connection, even if the write above raised
    conn.close()