In [5]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels as sm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [6]:
# 1. Process and generate the initial data set for modeling
# Named constants replace the repeated literals so the sample size and the
# split point are each defined in exactly one place.
SEED = 414         # Q1: arbitrary integer; it only fixes the generator's starting state
N_SAMPLES = 1000   # total number of (X, y) observations
N_TRAIN = 700      # leading observations used for training; the remainder is the test set
NOISE_SD = .2      # standard deviation of the Gaussian noise term

np.random.seed(SEED)

X = np.linspace(0, 15, N_SAMPLES)
# y is a sine wave plus a draw from Normal(mean=1 + X, sd=NOISE_SD),
# i.e. an oscillation riding on the linear trend 1 + X.
y = 3 * np.sin(X) + np.random.normal(1 + X, NOISE_SD, N_SAMPLES)

# 2. Split the data into training and testing sets.
# The split is positional (no shuffling): X is increasing, so every test
# point lies to the right of every training point.
train_X, test_X = X[:N_TRAIN], X[N_TRAIN:]
train_y, test_y = y[:N_TRAIN], y[N_TRAIN:]

train_df = pd.DataFrame({'X': train_X, 'y': train_y})
test_df = pd.DataFrame({'X': test_X, 'y': test_y})

# 3. Build models
# Two ordinary least squares specifications, both estimated on the training frame only.
linear_model = smf.ols(formula='y ~ 1 + X', data=train_df)                # intercept + linear term
quadratic_model = smf.ols(formula='y ~ 1 + X + I(X**2)', data=train_df)   # adds a squared term

li_regrs = linear_model.fit()
qd_regrs = quadratic_model.fit()

### QUESTION

Q1: How does the parameter of np.random.seed() work? I understand that it fixes the random number generator so that I always get the same result, but I'm not sure what the number 414 implies.

In [7]:
# 4. Test models

# 4.1 Predictors: each fitted model predicts on the training and the testing frame
li_train_pred, li_test_pred = li_regrs.predict(train_df), li_regrs.predict(test_df)
qd_train_pred, qd_test_pred = qd_regrs.predict(train_df), qd_regrs.predict(test_df)

# 4.2 Mean Square Error: observed y against each model's predictions
train_actual, test_actual = train_df['y'], test_df['y']

li_train_error = mean_squared_error(train_actual, li_train_pred)
qd_train_error = mean_squared_error(train_actual, qd_train_pred)

li_test_error = mean_squared_error(test_actual, li_test_pred)
qd_test_error = mean_squared_error(test_actual, qd_test_pred)

# One (train, test) pair of errors per model
li = (li_train_error, li_test_error)
qd = (qd_train_error, qd_test_error)

# 4.3 Results: one column per model, one row per data set
regression_fit = pd.DataFrame(
    {'Linear': li, 'Quadratic': qd},
    index=('Train', 'Test'),
)
regression_fit

Unnamed: 0,Linear,Quadratic
Train,4.05677,3.790046
Test,6.547541,7.987383


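It seems the quadratic regression is overfitted: compared with the linear regression, it has a lower MSE on the training data but a higher MSE on the test data. Note that the split is sequential, so the test set lies entirely beyond the training range (X above roughly 10.5), which means both models are being evaluated on extrapolation.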

### QUESTION
I'm trying to draw a scatter plot of the test data and overlay both regression curves, but with no success. Any ideas?