# In [None]:
import numpy as np
import pandas as pd
from mpl_toolkits import mplot3d # might use this to visualize 3 dimensions
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression # might use this for linear fit
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.graphics.gofplots import qqplot
# Module-level side effect: select matplotlib's 'fivethirtyeight' style sheet
# when this file is run/imported.
plt.style.use('fivethirtyeight')
        

# Function for polynomial fit with statsmodels OLS
def polyFit(X, y, deg=2):
    """
    Fit an ordinary least squares regression of y on polynomial terms of X.

    Inputs:
    X - matrix of predictors
    y - response
    deg - degree of the polynomial expansion (default 2)

    Output:
    fitted statsmodels OLS results for y regressed on the expanded predictors
    """
    # Expand the predictors first (PolynomialFeatures is left at its defaults
    # apart from the degree), then hand the expanded matrix to OLS as the
    # design matrix.
    design_matrix = PolynomialFeatures(degree=deg).fit_transform(X)
    return sm.OLS(y, design_matrix).fit()

def studentize_df(df):
    """
    Scale data to z-scores: subtract the mean, then divide by the
    standard deviation.

    Input:
    df - pandas dataframe (anything exposing .mean() and .std() and
         supporting - and / works the same way)

    Output:
    scaled dataframe
    """
    center = df.mean()
    spread = df.std()
    return (df - center) / spread

# Function to reorder names
def fix_names(names_list):
    """
    Reorder names from "lastname, firstname" to "firstname lastname".

    Each name is split on commas, every piece is stripped of surrounding
    whitespace, and the pieces are joined in reverse order with a single
    space. Whitespace around the comma therefore does not matter:
    "James, LeBron", "James,LeBron" and " James ,  LeBron " all become
    "LeBron James". A name without a comma is returned stripped but
    otherwise unchanged.

    Input:
    names_list - list of names in "lastname, firstname" format

    Output:
    fixed_names - list of names in "firstname lastname" format
    """
    fixed_names = []
    for name in names_list:
        pieces = [piece.strip() for piece in name.split(",")]
        # Skip empty pieces (e.g. from a trailing comma) so the result never
        # starts or ends with a stray space.
        fixed_names.append(" ".join(piece for piece in reversed(pieces) if piece))
    return fixed_names

def lm_diagnostics(model, xlabel="x"):
    '''
    Draw two side-by-side diagnostic plots for a fitted linear model:
    a scatter plot of the studentized residuals and a QQ-plot of the
    studentized residuals.

    Input:
    model - statsmodel linear model (only its resid attribute is used)
    xlabel - label for x-axis of the residual scatter plot

    Output:
    None - function creates plot
    '''
    # Scale the residuals once so both panels show the same quantity.
    scaled_resid = studentize_df(model.resid)

    fig, axs = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False)

    # Left panel: residuals against their position in model.resid.
    axs[0].scatter(np.arange(len(scaled_resid)), scaled_resid, color='r')
    axs[0].set_title("Studentized Residuals vs {}".format(xlabel))
    axs[0].set_ylabel("Studentized Residuals")
    axs[0].set_xlabel(xlabel)

    # Right panel: line='45' is the y = x reference, which is only a fair
    # comparison when the sample is on unit scale, so the studentized
    # residuals are plotted rather than the raw ones.
    qqplot(scaled_resid, line='45', ax=axs[1])
    axs[1].set_title("Residual QQ-Plot")
    axs[1].set_xlabel("Theoretical Quantiles")
    axs[1].set_ylabel("Sample Quantiles")

def lm_plot_rankx(model, df, xname, yname, ascending, title, xlabel, ylabel, fname):
    '''
    Create plots used to show top five offense and defense players. Sort df by predictor column
    Inputs:
        model: statsmodel linear model; model.fittedvalues is assumed to be
               in the same row order as df
        df   : dataframe holding data used in model; its index supplies the
               legend names of the highlighted points
        xname: name of predictor column we sort by
        yname: name of response column
        ascending: direction we should sort data
        title: title of plot
        xlabel: x-label of plot
        ylabel: y-label of plot
        fname: name figure is saved to
    Output:
        None. The figure is drawn and saved to fname.

    '''
    # sort df by x; the first five rows are the points to highlight
    sorted_df = df.sort_values(by=xname, ascending=ascending)
    top5 = sorted_df.iloc[:5]
    rest = sorted_df.iloc[5:]
    # one distinct marker per highlighted point
    markers = ['d', 'x', '^', '<', '>']

    fig, ax = plt.subplots(1, 1)
    # every point other than the top five is drawn as a plain black dot
    ax.scatter(rest[xname], rest[yname], color='k')
    for name, x, y, marker in zip(top5.index, top5[xname].values,
                                  top5[yname].values, markers):
        ax.plot(x, y, marker, label=name)

    # fixed window of +/-3 on both axes
    # NOTE(review): presumably the columns are z-scores (3 standard
    # deviations either side of the mean) - confirm against the caller.
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # label for model predictions. Two parameters -> linear label, otherwise
    # quadratic label. Coefficients are read positionally from an array so
    # the lookup does not depend on how model.params is indexed.
    coefs = np.asarray(model.params)
    if len(coefs) == 2:
        label_string = f'EFG = {round(coefs[1], 3)}*{xname}+{round(coefs[0], 3)}'
    else:
        label_string = f'EFG = {round(coefs[2], 3)}*{xname}^2 + {round(coefs[1], 3)}*{xname}+{round(coefs[0], 3)}'

    # Draw the fitted curve in increasing-x order: the line connects points
    # in the order given, so unsorted x would turn a curved fit into a
    # tangle of chords.
    x_all = np.asarray(df[xname])
    order = np.argsort(x_all)
    ax.plot(x_all[order], np.asarray(model.fittedvalues)[order], label=label_string)
    ax.legend(framealpha=0, fontsize='x-small', loc='lower left')
    ax.set_title(title)
    fig.savefig(fname, bbox_inches='tight', pad_inches=0.5)
    
def lm_plot_ranky(model, df, xname, yname, ascending, title, xlabel, ylabel, fname):
    '''
    Create plots used to show top five offense and defense players. Sort df by response column
    Inputs:
        model: statsmodel linear model; model.fittedvalues is assumed to be
               in the same row order as df
        df   : dataframe holding data used in model; its index supplies the
               legend names of the highlighted points
        xname: name of predictor column
        yname: name of response column we sort by
        ascending: direction we should sort data
        title: title of plot
        xlabel: x-label of plot
        ylabel: y-label of plot
        fname: name figure is saved to
    Output:
        None. The figure is drawn and saved to fname.

    '''
    # sort df by the response; the first five rows are the points to highlight
    sorted_df = df.sort_values(by=yname, ascending=ascending)
    top5 = sorted_df.iloc[:5]
    rest = sorted_df.iloc[5:]

    # use markers and colors to make top 5 stand out
    markers = ['d', 'x', '^', '<', '>']
    color_top = ['b', 'r', 'g', 'c', 'm']

    # every point other than the top five is drawn as a plain black dot
    fig, ax = plt.subplots(1, 1)
    ax.scatter(rest[xname], rest[yname], color='k')
    # plot top 5 using designated markers and colors
    for name, x, y, marker, c in zip(top5.index, top5[xname].values,
                                     top5[yname].values, markers, color_top):
        ax.plot(x, y, marker, color=c, label=name)
        # names can be printed at the points if this is uncommented
        #ax.annotate(name,(x,y),rotation=45+(np.random.rand()-0.5)*45,\
                    #fontsize='xx-small')

    # fixed window of +/-3 on both axes
    # NOTE(review): presumably the columns are z-scores (3 standard
    # deviations either side of the mean) - confirm against the caller.
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # label for model predictions. Two parameters -> linear label, otherwise
    # quadratic label. Coefficients are read positionally from an array so
    # the lookup does not depend on how model.params is indexed.
    coefs = np.asarray(model.params)
    if len(coefs) == 2:
        label_string = f'EFG = {round(coefs[1], 3)}*{xname}+{round(coefs[0], 3)}'
    else:
        label_string = f'EFG = {round(coefs[2], 3)}*{xname}^2 + {round(coefs[1], 3)}*{xname}+{round(coefs[0], 3)}'

    # Draw the fitted curve in increasing-x order: the line connects points
    # in the order given, so unsorted x would turn a curved fit into a
    # tangle of chords. Then save the figure.
    x_all = np.asarray(df[xname])
    order = np.argsort(x_all)
    ax.plot(x_all[order], np.asarray(model.fittedvalues)[order], label=label_string)
    ax.legend(framealpha=0, fontsize='x-small', loc='best')
    ax.set_title(title)
    fig.savefig(fname, bbox_inches='tight', pad_inches=0.5)