## Contestant Success Prediction with Machine Learning
Grace E. Chesmore

Here we predict a Bachelor contestant's longevity in the season [episode number] given one or more characteristics [age, race, occupation].  We train a series of machine-learning models (regularized linear regression and Gaussian-process regression) with the scikit-learn (sklearn) package.  Contestant data is provided by XXX.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib
import matplotlib.pyplot as plt
import csv
import pandas as pd
import seaborn as sns
import math

from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor

%reload_ext blackcellmagic
%matplotlib inline
# Notebook-wide plot styling, applied in a single update: typeface and base
# font size, a faint grey grid on every axes, and tick marks pointing inward.
matplotlib.rcParams.update(
    {
        "font.family": "Helvetica Neue",
        "font.size": 18,
        "axes.grid": True,
        "grid.color": "grey",
        "grid.alpha": 0.2,
        "xtick.direction": "in",
        "ytick.direction": "in",
    }
)
%config InlineBackend.figure_format = 'retina'

In [None]:
# Integer codes for the race labels found in the CSV; any label not listed
# here (including an empty cell) is encoded as 4.
race_codes = {"White": 0, "Black": 1, "Asian": 2, "Hispanic": 3}

# Read the whole file inside a context manager so the handle is closed as soon
# as parsing finishes (the csv module recommends opening with newline="").
with open("bach_data.csv", newline="") as csv_file:
    rows = list(csv.reader(csv_file, delimiter=","))

# The first row holds the column headers; only the remaining rows are data.
contestants = rows[1:]

# Columns are read by position. Text columns stay as strings, numeric columns
# are converted once here so later cells can use them directly.
name = [row[0] for row in contestants]
age = [float(row[1]) for row in contestants]
hometown = [row[2] for row in contestants]
job = [row[3] for row in contestants]
race = [race_codes.get(row[4], 4) for row in contestants]
place = [float(row[5]) for row in contestants]
season = [float(row[6]) for row in contestants]
# A blank cell in the last column is stored as 0 rather than failing int().
oneonone = [int(row[7]) if row[7] != "" else 0 for row in contestants]

# Seasons to display, one box per season.
ssns = np.arange(11, 26)

# Convert once, then pick out the ages belonging to each season with a mask.
age_arr = np.array(age)
season_arr = np.array(season)
age_set_tot = [age_arr[season_arr == ssn] for ssn in ssns]

fig = plt.figure(figsize=(7, 4))
ax = fig.add_axes([0, 0, 1, 1])
bp = ax.boxplot(age_set_tot, patch_artist=True)

# Label the ticks only after boxplot() has placed one tick per box, and derive
# the labels from `ssns` so they cannot drift out of sync with the data.
ax.set_xticklabels([str(ssn) for ssn in ssns])
ax.set_xlabel("Season [n]")
ax.set_ylabel("Age [yrs]")
ax.set_title("Age Distribution per Season")

# Pink boxes with a bold magenta median line.
colors = ["lightpink"]
for patch in bp["boxes"]:
    patch.set_facecolor(colors[0])
for median in bp["medians"]:
    median.set(color="#FF00FF", linewidth=3)

plt.show()

In [None]:
# Pairwise view of age, finishing place and 1-on-1 week: histograms on the
# diagonal, filled kernel-density estimates in the off-diagonal panels.
df = pd.read_csv("bach_data.csv")
g = sns.PairGrid(df, vars=["Age", "Place", "1-on-1 week"])
g.map_diag(sns.histplot, hue=None, color="pink")
g.add_legend()
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
g.map_upper(sns.kdeplot, fill=True, color="pink")
g.map_lower(sns.kdeplot, fill=True, color="pink")
plt.show()

Here we use the contestant data read in above to train a ridge (regularized linear) regression model on age and 1-on-1 week, holding out the last 100 rows for testing.

In [None]:
# Two-column feature matrix [age, 1-on-1 week] and the target (place).
vals = np.c_[age, oneonone]
episode = np.array(place)

# The final `train_split` rows are held out for testing; everything before
# them is used for fitting.
train_split = 100
fit_rows = slice(None, -train_split)
held_out_rows = slice(-train_split, None)

age_train, age_test = vals[fit_rows], vals[held_out_rows]
y_train, y_test = episode[fit_rows], episode[held_out_rows]

# Ridge regression, fitted on the training rows.
regr = linear_model.Ridge(alpha=0.5).fit(age_train, y_train)

# Predict the held-out rows and score them.
y_pred = regr.predict(age_test)

# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

### We can now input a new contestant's age and 1-on-1 week to predict their longevity in the season.

In [None]:
# Features of the new contestant, in the column order the ridge model was
# trained on: [age, 1-on-1 week].
age_c = [26, 5]

# predict() expects a 2-D array with one row per sample.
age_c = np.reshape(age_c, (1, len(age_c)))

y_pred = regr.predict(age_c)
# predict() returns a one-element array; take the element explicitly instead
# of relying on implicit array-to-scalar conversion inside math.ceil.
print("Place: %d" % abs(math.ceil(y_pred[0])))

In [None]:

# Two-column feature matrix [age, 1-on-1 week] and the target (place).
vals = np.c_[age, oneonone]
episode = np.array(place)

# Hold out the final `train_split` rows for testing and report what fraction
# of the data ends up in the training and test parts.
train_split = 100
fit_rows = slice(None, -train_split)
held_out_rows = slice(-train_split, None)

age_train, age_test = vals[fit_rows], vals[held_out_rows]
print(len(age_train)/len(vals))
print(len(age_test)/len(vals))
y_train, y_test = episode[fit_rows], episode[held_out_rows]

# The kernel is a sum of four components: a smooth long-range RBF term, an
# RBF-damped periodic term with fixed periodicity, a rational-quadratic term
# for medium-scale irregularities, and a short-range RBF plus white noise.
# NOTE(review): the component names look borrowed from a CO2 time-series
# example -- confirm this structure is intended for contestant data.
trend_amplitude = 50.0 ** 2
long_term_trend_kernel = trend_amplitude * RBF(length_scale=50.0)

periodic_part = ExpSineSquared(
    length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed"
)
seasonal_kernel = 2.0 ** 2 * RBF(length_scale=100.0) * periodic_part

irregularities_kernel = 0.5 ** 2 * RationalQuadratic(length_scale=1.0, alpha=1.0)

white_noise = WhiteKernel(noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5))
noise_kernel = 0.1 ** 2 * RBF(length_scale=0.1) + white_noise

co2_kernel = (
    long_term_trend_kernel + seasonal_kernel + irregularities_kernel + noise_kernel
)

# Only the second feature column (1-on-1 week) is given to the Gaussian
# process, as a single-column 2-D array.
X = age_train[:, 1:2]
y = y_train
# The target is centred by hand (normalize_y is off), so predictions from this
# model need `y_mean` added back.
y_mean = y.mean()
gaussian_process = GaussianProcessRegressor(kernel=co2_kernel, normalize_y=False)
gaussian_process.fit(X, y - y_mean)

In [None]:
# Predict the held-out rows from their 1-on-1 week (second feature column).
x_test = age_test[:, 1]
mean_y_pred, std_y_pred = gaussian_process.predict(
    x_test.reshape(-1, 1), return_std=True
)
# The model was fit on the mean-centred target, so shift predictions back.
mean_y_pred += y_mean

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, mean_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, mean_y_pred))

# Training points are not ordered along x, so draw them as markers rather than
# joining them with a line.
plt.plot(
    X[:, 0], y, color="black", linestyle="none", marker=".", label="Training data"
)

# Sort the test points by x so the prediction is drawn as one left-to-right
# curve, with a +/- one standard deviation band around it.
order = np.argsort(x_test)
x_sorted = x_test[order]
mean_sorted = mean_y_pred[order]
std_sorted = std_y_pred[order]
plt.plot(x_sorted, mean_sorted, color="tab:blue", alpha=0.4, label="Gaussian process")
plt.fill_between(
    x_sorted,
    mean_sorted - std_sorted,
    mean_sorted + std_sorted,
    color="tab:blue",
    alpha=0.2,
)

plt.legend()
plt.xlabel("1-on-1 week")
plt.ylabel("Place")
_ = plt.title("Gaussian process prediction of place\nfrom 1-on-1 week")

In [None]:
# Predict the place for a contestant whose 1-on-1 week is 7. The Gaussian
# process was fit on `y - y_mean`, so the mean must be added back to get a
# prediction on the original scale.
y_pred = (
    gaussian_process.predict(np.array([7]).reshape(-1, 1), return_std=False) + y_mean
)
# predict() returns a one-element array; take the element explicitly instead
# of relying on implicit array-to-scalar conversion inside math.ceil.
print("Place: %d" % abs(math.ceil(y_pred[0])))