# Creating Necessary Functions

Below we define the helper functions that are used throughout this notebook. Defining them once up front keeps the notebook cleaner: later cells need less code and can simply call these functions.

In [None]:
def csv_import(csv):
    '''Loads the requested csv file and prints its shape, summary information, and the value counts
    (including nulls) of every column that contains at least one null value
    
        @params
        csv is a .csv file (anything pd.read_csv accepts, e.g. a path or an open file-like object)
        
        @output
        a pd.Dataframe holding the loaded data
    '''
    df = pd.read_csv(csv)
    print('DATA FRAME SHAPE')
    print("========================================") #these divider lines help to visually separate the sections
    print(df.shape)
    print("")
    print('DATA SUMMARY')
    print("========================================")
    #df.info() writes its report to stdout itself and returns None, so it must not be wrapped
    #in print() -- doing so appends a stray "None" line after the summary
    df.info()
    print("")
    print("VALUE COUNTS FOR COLUMNS WITH NULL VALUES")
    print("========================================")
    #collect every column that holds at least one null value
    null_cols = [col for col in df.columns if df[col].isna().sum() > 0]
    for col in null_cols:
        print(col, "VALUE COUNTS")
        print("--------------------")
        #dropna=False keeps the NaN bucket in the counts, which is the point of this report
        print(df[col].value_counts(dropna=False))
        print("")
    return df

def describe_outliers(df):
    '''Prints the df.describe() summary with two extra rows appended: the values three standard
    deviations above ('+3_std') and below ('-3_std') the mean of each described column
    
            @params
            df is a pd.DataFrame
    
            @output
            nothing is returned; the extended summary table is printed
    '''
    print('DATA DESCRIPTION')
    print("========================================")
    summary = df.describe()
    #both new rows are derived from the 'mean' and 'std' rows that describe() already produced
    center = summary.loc['mean']
    spread = summary.loc['std'] * 3
    summary.loc['+3_std'] = center + spread
    summary.loc['-3_std'] = center - spread
    print(summary)
    
def draw_qqplot(residual):
    '''Draws a probability (Q-Q) plot of the given values on a new 6 x 2.5 figure
    
            @params
            residual is the list-like of values to plot (e.g. regression residuals)
    
            @output
            the (fig, ax) pair holding the plot
    '''
    fig, ax = plt.subplots(figsize=(6,2.5))
    #probplot draws onto ax and fits a reference line; its numeric return values are not needed here
    sp.stats.probplot(residual, plot=ax, fit=True)
    return fig, ax

def draw_scatter(y_pred, residual):
    '''Draws a blue scatter plot of residual against y_pred on a new 6 x 2.5 figure
    
            @params
            y_pred is the list-like of x-axis values (e.g. model predictions)
            residual is the list-like of y-axis values (e.g. residuals of those predictions)
    
            @output
            the (fig, ax) pair holding the plot
    '''
    fig, ax = plt.subplots(figsize=(6,2.5))
    ax.scatter(y_pred, residual, color='blue')
    return fig, ax

def high_corr(df):
    '''Produces the column pairs whose absolute pearson correlation is above .70 and below 1
    
            @params
            df is a pd.DataFrame
    
            @output
            a df indexed by 'Highly Correlated Pairs' (a tuple of the two column names) with a single
            'Correlation' column, sorted from the strongest correlation down
    '''
    corr = df.corr().abs()
    labels = list(corr.columns)
    pairs = []
    scores = []
    #walk only the upper triangle (j > i): every unordered pair is visited exactly once and the
    #diagonal is skipped, so mirrored (a, b)/(b, a) entries never need to be de-duplicated by value --
    #de-duplicating on the correlation value would also drop distinct pairs that share a score
    for i, first in enumerate(labels):
        for j in range(i + 1, len(labels)):
            pairs.append((first, labels[j]))
            scores.append(corr.iat[i, j])
    df_highcorr = pd.DataFrame({'Highly Correlated Pairs': pairs,
                                'Correlation': pd.Series(scores, dtype=float)})
    df_highcorr.set_index(['Highly Correlated Pairs'], inplace = True)
    #a stable sort keeps tied pairs in column order, so the output is deterministic
    df_highcorr = df_highcorr.sort_values('Correlation', ascending=False, kind='stable')
    #NaN correlations (e.g. constant columns) fail both comparisons and are left out
    return df_highcorr[(df_highcorr.Correlation>.7) & (df_highcorr.Correlation<1)]
    
def log_transform(df,features):
    '''Adds a natural-log version of each listed feature to the dataframe
    
        @params
        df is a pd.Dataframe; it is modified in place
        features is a list of columns to be transformed
        
        @output
        the same df, with one new '<feature>_log' column per listed feature
    
    '''
    for column in features:
        logged = np.log(df[column])
        df[column + '_log'] = logged
    return df
    
def quick_corrmap(df, features):
    '''Quickly produces a correlation heatmap
    
            @params
            df is a pd.DataFrame
            features is a list of columns to be considered
    
            @output
            correlation heat map (drawn on a new 16 x 12 figure; nothing is returned)
    '''
    corr = df[features].corr() #computed once and reused for both the mask and the heatmap
    #the builtin bool replaces np.bool, an alias that was removed from numpy (1.24) and now raises
    mask = np.zeros_like(corr, dtype=bool) #masking visually cuts the graph in half
    mask[np.triu_indices_from(mask)] = True #hide the upper triangle, diagonal included

    plt.subplots(figsize=(16, 12))
    plt.title('Pearson Correlation Matrix',fontsize=25)

    sns.heatmap(corr,linewidths=0.25,vmax=0.7,square=True,cmap="BuGn", #"BuGn_r" to reverse 
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9});

def reg_summary(X_train, y_train):
    '''Fits an OLS regression of y_train on X_train (plus an added constant column) and prints
    the fitted parameters followed by the full summary
    
        @params
        X_train holds the predictor columns
        y_train holds the target values
        
        @output
        the fitted results object returned by the model's fit()
    '''
    design = sm.add_constant(X_train)
    results = sm.OLS(y_train, design).fit()
    print(results.params)
    print(results.summary())
    return results
    
def remove_outliers(df, features):
    '''Removes outliers more than 3 standard deviations away from the mean for each listed feature
    
        Features are processed in order, so each feature's z-scores are computed on the rows that
        survived the previous features. The df passed in is left untouched.
    
        @params
        df is a pd.Dataframe
        features is a list of columns to be considered
        
        @output
        df with outliers removed (same columns as the input)
    '''
    print("COUNT OF OUTLIERS REMOVED")
    print("========================================")
    
    for feature in features:
        #keep the z-scores in a local array instead of a temporary '<feature>_zscore' column, so
        #neither the caller's dataframe nor a filtered slice is ever written to
        zscores = np.abs(stats.zscore(df[feature]))
        #one mask drives both the reported count and the removal, so the two always agree;
        #rows whose z-score is exactly 3 or NaN are not outliers and are kept
        is_outlier = np.asarray(zscores > 3)
        
        print(int(is_outlier.sum()), "outliers removed for", feature)
        
        df = df.loc[~is_outlier]
    
    return df
    
def stepwise_selection(X, y, 
                       initial_list=None, 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            None (the default) starts from an empty selection
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # copy so the caller's list is never modified
    included = [] if initial_list is None else list(initial_list)
    while True:
        changed=False
        # forward step
        # candidates are kept in X's column order (de-duplicated) so that ties between
        # equal p-values are resolved the same way on every run
        excluded = list(dict.fromkeys(col for col in X.columns if col not in included))
        # explicit float dtype: an empty/unfilled Series would otherwise default to object,
        # which breaks min()/idxmin() on current pandas
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min() # NaN when nothing is left to add, so the test below is False
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            # idxmax returns the feature's label; argmax returns an integer position on
            # current pandas, which included.remove() cannot find
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
    
def validate_changes(df):
    '''Prints the shape of the given df and the value counts of any column that still contains
        null values; should be run after initial transformation
    
        @params
        df is a pd.DataFrame
        
        @output
        the first rows of df (df.head()); the summary data itself is printed
    
    '''
    print('DATA FRAME SHAPE')
    print("========================================")
    print(df.shape)
    print("")
    print("COLUMNS THAT STILL HAVE NULL VALUES AFTER TRANSFORMATION")
    print("(Should Produce No Results Below Line)")
    print("========================================")      
    #gather the offending columns first, then report on each one
    still_null = [name for name in df.columns if df[name].isna().sum() > 0]
    for name in still_null:
        print(name, "VALUE COUNTS")
        print("--------------------")
        print(df[name].value_counts(dropna=False))  
        print("")
    print("")
    preview = df.head()
    return preview


def validate_reg_assumptions(X, X_train, X_test, y_train, y_test):
    '''Fits an OLS model on the training data via reg_summary, then plots the test-set residuals
    to check the regression assumptions
    
        @params
        X is not used by the visible part of this function
        X_train, y_train are the data the model is fitted on
        X_test, y_test are the data the residuals are computed on
        
        @output
        printed model output and residual plots (distribution, Q-Q plot, residuals vs predictions)
    '''
    results = reg_summary(X_train, y_train)
    print(results)
    
    print('\nIdentifying Residuals...\n')
    #the model was fitted with a constant column, so the test data needs one before predicting
    X_test = sm.add_constant(X_test)
    y_pred = results.predict(X_test)
    residual = y_test - y_pred

    print('\nVerifying Normality of Residuals...\n')
    sns.distplot(residual)
    plt.show();
    draw_qqplot(residual)
    plt.show();
    print('Mean of Residuals: ', np.mean(residual))

    print('\nDisplaying Regplot...\n')
    draw_scatter(y_pred, residual)
    #x and y are passed by keyword: current seaborn releases no longer accept them positionally
    sns.regplot(x=y_pred, y=residual, color='red')