In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error


In [2]:
def create_dummy_df(df, dummy_cols:list, dummy_na=False, drop_first=False):
    '''
    This function creates a new dataframe in which the columns listed in dummy_cols are replaced by dummy (one-hot) columns.
    
    Parameters:
    df : DataFrame
      DataFrame to create dummy columns from. It is not modified.
    dummy_cols: list
      categorical column names to create dummy columns from. Every listed column is encoded,
      including numeric-coded categoricals. An empty list returns a copy of df.
    dummy_na: boolean, default False
      Add a column to indicate NaNs, if False NaNs are ignored.
    drop_first: boolean, default False
      Whether to get k-1 dummies out of k categorical levels by removing the first level.
      
    Returns: 
    df: DataFrame
      new DataFrame made of the columns of df that are not in dummy_cols, followed by the dummy columns created from the columns in dummy_cols.
    '''
    # Nothing to encode: still hand back a new object rather than the caller's frame.
    if len(dummy_cols) == 0:
        return df.copy()

    # columns= forces encoding of every requested column. Without it, get_dummies
    # only converts object/string/category columns and passes numeric ones through
    # unchanged, so an integer-coded categorical would silently stay un-encoded.
    dummy_df = pd.get_dummies(
        df[dummy_cols],
        columns=dummy_cols,
        dummy_na=dummy_na,
        drop_first=drop_first,
    )
    return pd.concat([df.drop(dummy_cols, axis=1), dummy_df], axis=1)

In [3]:
def linear_regression(df,X_cols: list, y_col:str, test_size=.3,random_state=42):
    '''
    This function splits df into train and test sets, fits a linear regression on the train set, predicts on the train and test sets and gets R-squared scores.
    
    Parameters:
    df: DataFrame
      data to use linear regression on.
    X_cols: list
      list of column names that are used as regressors.
    y_col: str
      target column name
    test_size: float, default 0.3
      proportion of the dataset to include in the test split.
    random_state: int, default 42
      Controls the shuffling applied to the data before applying the split.
      
    Returns:
    X_train: feature train set 
    X_test: feature test set
    y_train: target train set
    y_test: target test set
    lm_model: linear regression model that's fitted on X_train and y_train
    train_score: R-squared score on the train set
    test_score: R-squared score on the test set
    '''
    X = df[X_cols]
    y = df[y_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size, random_state=random_state)

    # The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
    # where passing it raises TypeError. For ordinary least squares the fitted
    # predictions do not depend on feature scaling, so it is simply dropped here.
    lm_model = LinearRegression()
    lm_model.fit(X_train,y_train)
    
    y_train_preds = lm_model.predict(X_train)
    y_test_preds = lm_model.predict(X_test)
    
    train_score = r2_score(y_train, y_train_preds)
    test_score = r2_score(y_test, y_test_preds)
    
    return X_train, X_test, y_train, y_test, lm_model, train_score, test_score