# Capstone: Pre-processing and Training Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
import sys

## Import of csv's created in nfl_capstone_EDA

In [3]:
year = pd.read_csv('teamstarterdraft.csv')
yearAV = pd.read_csv('teamstarterdraftAV.csv')
week = pd.read_csv('weekstarterdraft.csv')
weekAV = pd.read_csv('weekstarterdraftAV.csv')

### Creation of 2 additional dataframes that do not include the categorical variables of coach, offcoor, defcoor, off scheme, and def align.  If these do not add much to the variance, they are just preventing tree based algorithms from functioning well, as the number of dummy columns is large

In [5]:
yearnocoach = year.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])
yearnocoachAV = yearAV.drop(columns=['coach', 'offcoor', 'defcoor', 'offscheme', 'defalign'])

## Create pipeline for Regression analysis

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

year_X = year.drop('DraftTeamSelection', axis=1)
year_y = year['DraftTeamSelection']

yearAV_X = yearAV.drop('DraftTeamSelection', axis=1)
yearAV_y = yearAV['DraftTeamSelection']

week_X = week.drop('DraftTeamSelection', axis=1)
week_y = week['DraftTeamSelection']

weekAV_X = weekAV.drop('DraftTeamSelection', axis=1)
weekAV_y = weekAV['DraftTeamSelection']

categorical_features = list(year.select_dtypes(include=['category', object]).columns)
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

numeric_features = list(year.select_dtypes(include=['int', 'float']).columns)
numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

year_X_train, year_X_test, year_y_train, year_y_test = train_test_split(year_X, year_y, test_size=0.33, random_state=42)

yearAV_X_train, yearAV_X_test, yearAV_y_train, yearAV_y_test = train_test_split(yearAV_X, yearAV_y, test_size=0.33, random_state=42)

week_X_train, week_X_test, week_y_train, week_y_test = train_test_split(week_X, week_y, test_size=0.33, random_state=42)

weekAV_X_train, weekAV_X_test, weekAV_y_train, weekAV_y_test = train_test_split(weekAV_X, weekAV_y, test_size=0.33, random_state=42)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LinearRegression())])

In [42]:
clf.fit(year_X_train, year_y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [44]:
clf.predict(year_X_test)
print("model score: %.3f" % clf.score(year_X_test, year_y_test))

model score: -0.146
