# Import software libraries

In [None]:
# Import required libraries.
import sys                                                  # Read system parameters.
import numpy as np                                          # Work with multi-dimensional arrays.
import pandas as pd                                         # Manipulate and analyze data.
import matplotlib                                           # Create and format charts.
import matplotlib.pyplot as plt  
import seaborn as sns                                       # Make charting easier.
import sklearn                                              # Train and evaluate machine learning models.
from sklearn.model_selection import train_test_split, \
                                    learning_curve, \
                                    cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, \
                            explained_variance_score, \
                            mean_absolute_error, \
                            mean_squared_error
from sklearn.dummy import DummyRegressor
import xgboost                                              # Build gradient boosting models.
from xgboost import XGBRegressor
import pickle                                               # Save Python objects as binary files.
from collections import Counter
import warnings                                             # Suppress warnings.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so results are reproducible.
np.random.seed(1)

# Summarize software libraries used.
# Each entry pairs an output template with the version value to report.
library_versions = [
    ('- Python {}', sys.version),
    ('- NumPy {}', np.__version__),
    ('- pandas {}', pd.__version__),
    ('- Matplotlib {}', matplotlib.__version__),
    ('- Seaborn {}', sns.__version__),
    ('- scikit-learn {}', sklearn.__version__),
    ('- XGBoost {}', xgboost.__version__),
]

print('Libraries used in this project:')
for template, version in library_versions:
    print(template.format(version))

# Read and examine the data

In [None]:
# Read the data.



# Preview the first five rows of the data.



In [None]:
# Check the structure of the data.



# Prepare the data

In [None]:
# Define the target variable and get the count of each value in the variable.





In [None]:
# Plot a histogram of the target variable distribution.




In [None]:
# Split the data into target and features.




In [None]:
# Split the dataset into separate training and testing sets.






# Get the shape of both the training dataset and the test dataset.









In [None]:
# Get summary statistics for the target variable (test data).
# Count, mean, standard deviation, minimum, maximum, etc.



# Train a linear regression model

In [None]:
# Standardize the training data.




# Standardize the test data as well.


In [None]:
# Create a LinearRegression() model and fit it on the scaled training data.




In [None]:
# Make predictions on the test data.



# Get the first 5 predicted values.



# Perform a quick evaluation of the linear regression model

In [None]:
# Obtain the model's R2 score.



# Train a random forest model

In [None]:
# Create a RandomForestRegressor() model and fit it on the scaled training data.




In [None]:
# Make predictions on the test data.



# Get the first 5 predicted values.



# Perform a quick evaluation of the random forest model

In [None]:
# Obtain the model's R2 score.



# Compare evaluation metrics for each model

In [None]:
# List of (display name, unfitted estimator) pairs to be compared.

models = [
    # Dummy regressor used as a baseline algorithm.
    ('Dummy Regressor', DummyRegressor()),

    # Linear regression model.
    ('Linear Regression', LinearRegression()),

    # Random forest model.
    ('Random Forest', RandomForestRegressor()),

    # XGBoost model.
    ('XGBoost', XGBRegressor(objective = 'reg:squarederror', n_jobs = 1)),
]

In [None]:
# List will hold dictionaries of model scores.

scoring_df = []

# Names of the models that are trained on standardized features.
# These models must be evaluated on standardized features as well.

scaled_models = ['Linear Regression']

# Train each model in the list and output multiple scores for each model.

for name, model in models:
    if name in scaled_models:
        # Fit the scaler on the training data only, then apply that same
        # transformation to both sets. Predicting on unscaled test data with
        # a model trained on scaled data would produce meaningless scores.
        scaler = StandardScaler().fit(X_train)
        X_train_1 = scaler.transform(X_train)
        X_test_1 = scaler.transform(X_test)
    else:
        X_train_1 = X_train
        X_test_1 = X_test

    model.fit(X_train_1, y_train)
    y_pred = model.predict(X_test_1)

    # Calculate the evaluation metrics for the model.

    r2 = r2_score(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # One row of the score table; every metric is rounded to 4 decimals.

    scoring_dict = {'Model': name,
                    'R2': round(r2, 4),
                    'Explained Variance': round(explained_var, 4),
                    'Mean Absolute Error': round(mae, 4),
                    'Mean Squared Error': round(mse, 4),
                    }

    scoring_df.append(scoring_dict)

In [None]:
# Create a DataFrame from scoring_df.



# Sort the DataFrame by MSE (ascending), then print it.



# Begin evaluating the best model

In [None]:
# Retrain the model with the lowest MSE.




In [None]:
# Make predictions on the test data.



# Get the first 5 predictions.



In [None]:
# Generate a residual scatter plot.









# Generate a feature importance plot for the best model

In [None]:
# This function generates a feature importance plot on a bar chart.

def feature_importance_plot(model, X_train, n):
    """Draw a horizontal bar chart of the n largest feature importances.

    The model must expose a feature_importances_ attribute (for example,
    random forest and XGBoost models), and X_train must expose a columns
    attribute whose entries label the importances.
    """

    # Open the figure that the bar chart will be drawn on.
    plt.figure(figsize=(8, 5))

    # Label each importance value with its feature name.
    importance_by_feature = pd.Series(model.feature_importances_,
                                      index = X_train.columns)

    # Keep only the n most important features and chart them.
    top_features = importance_by_feature.nlargest(n)
    top_features.plot(kind = 'barh')

    plt.title(f'Top {n} Features')
    plt.show()

In [None]:
# Plot the feature importances.



# Plot a learning curve for the best model

In [None]:
# This function generates and plots a learning curve.

def plot_learning_curves(model, X_train, y_train):
    """Plot training and cross-validation learning curves for a model.

    Scores are negative mean squared error obtained with 5-fold
    cross-validation at 5 training-set sizes, evenly spaced between the
    fractions 0.01 and 1.0 of the maximum training-set size. Because the
    metric is negated, values closer to zero indicate a better fit.

    Args:
        model: Estimator passed to scikit-learn's learning_curve().
        X_train: Training features.
        y_train: Training target values.
    """

    plt.figure(figsize=(5, 5))  # Set figure size.
    train_sizes, train_scores, test_scores = learning_curve(
        model,
        X_train,
        y_train,
        cv = 5,  # Number of folds in cross-validation.
        scoring = 'neg_mean_squared_error',  # Evaluation metric.
        n_jobs = 1,
        shuffle = True,
        train_sizes = np.linspace(0.01, 1.0, 5))  # 5 different sizes of the training set.

    # Average the scores across the cross-validation folds
    # (one row per training-set size, one column per fold).

    train_mean = np.mean(train_scores, axis = 1)
    test_mean = np.mean(test_scores, axis = 1)

    # Draw lines.

    plt.plot(train_sizes, train_mean, '--', color = '#111111', label = 'Training score')
    plt.plot(train_sizes, test_mean, color = '#111111', label = 'Cross-validation score')

    # Create plot.

    plt.title('Learning Curves')
    plt.xlabel('Training Set Size')
    plt.ylabel('Negative MSE')
    plt.legend(loc = 'best')
    plt.tight_layout()

    plt.show()

In [None]:
# Call the function to plot learning curves for the best model.
# Keep in mind that this uses negative MSE, so values closer to zero (lower error, better) are at the top of the y-axis.



# Save the best model

In [None]:
# Save the best model as a pickle file named best_regression_model.pickle.

