# 1 - Import libraries & load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

%matplotlib inline

In [2]:
# Load the cleaned OKCupid profiles file; the path is relative to the notebook's working directory.
# NOTE(review): the column listing below includes 'Unnamed: 0', which looks like a row index that
# was written to the CSV — if confirmed, consider reading with index_col=0.
df = pd.read_csv('profiles_clean.csv')

In [3]:
# Show every column name so we can see which fields are available for modelling.
df.columns.values

array(['Unnamed: 0', 'age', 'body_type', 'diet', 'drinks', 'drugs',
       'education', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9', 'ethnicity',
       'height', 'income', 'job', 'last_online', 'location', 'offspring',
       'orientation', 'pets', 'religion', 'sex', 'sign', 'smokes',
       'speaks', 'status', 'drinks_code', 'drugs_code', 'veg_code',
       'ed_code', 'smoke_code', 'sex_code', 'has_kids', 'wants_kids',
       'essay_len'], dtype=object)

# 2 - Formulate Questions

With Machine Learning, I aim to answer the following questions:

1. Can we predict a person's age, given a set of features from their OKCupid profile? (Multiple Linear Regression)
2. Can we predict whether or not someone has kids, given a set of features from their OKCupid profile? (Random Forest)

# 3 - Answer Questions with Machine Learning

1. Can we predict a person's age, given a set of features from their OKCupid profile? (Multiple Linear Regression)

In [4]:
# Restrict the frame to the six predictor columns plus the `age` target used by the regression.
mlr_columns = [
    'drinks_code',
    'drugs_code',
    'ed_code',
    'smoke_code',
    'has_kids',
    'wants_kids',
    'age',
]
mlr_df = df[mlr_columns]

In [5]:
# Record the (rows, columns) size BEFORE dropping missing values, for comparison with the
# size displayed after the dropna() cell that follows.
mlr_df.shape

(59937, 7)

In [6]:
# Remove every row that has a missing value in any of the selected columns
# (axis=0 / how='any' are the pandas defaults, spelled out here for the reader).
mlr_df = mlr_df.dropna(axis=0, how='any')

In [7]:
# Size AFTER dropping rows with missing values; compare with the pre-dropna shape above.
mlr_df.shape

(14380, 7)

In [8]:
# define & scale features
feature_data = mlr_df[['drinks_code', 'drugs_code', 'ed_code', 'smoke_code',
                       'has_kids', 'wants_kids']]

# Rescale every feature column with min-max scaling so all predictors share a common range.
# NOTE(review): the scaler is fitted on all rows before the train/validation/test split, so the
# held-out rows influence the min/max used for scaling; consider fitting on the training split only.
x = feature_data.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Rebuild the DataFrame with the ORIGINAL row index. Without `index=`, the new frame gets a fresh
# 0..n-1 index that no longer matches the post-dropna index of `mlr_df` (and of any label frame
# sliced from it), so index-based alignment between features and labels would silently break.
feature_data = pd.DataFrame(x_scaled, columns=feature_data.columns, index=feature_data.index)

In [9]:
# define label
# The double brackets keep `age` as a single-column DataFrame rather than a Series.
label_data = mlr_df[['age']]

In [10]:
# First split: 80% of the rows go to training (split again later), 20% are held out as the
# test set. The fixed random_state keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature_data,
    label_data,
    test_size=0.2,
    train_size=0.8,
    random_state=1,
)

In [11]:
# split further into train & validate (test withheld until very end)
# The 80% train+validation portion is re-derived from the full feature/label frames using the
# same arguments as the initial split, rather than being read from `x_train`/`y_train`. The
# result on a clean run is unchanged (same seed), but the cell is now idempotent: re-running it
# no longer splits an already-reduced training set a second time.
initial_split = train_test_split(feature_data, label_data, train_size=0.8,
                                 test_size=0.2, random_state=1)
x_train_val, y_train_val = initial_split[0], initial_split[2]

x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, train_size=0.8,
                                                  test_size=0.2, random_state=1)

In [12]:
# create the model (training happens in the next cell, via mlr.fit)
mlr = LinearRegression()

In [13]:
# Fit the regression on the training split only.
# NOTE(review): scikit-learn's fit() is documented to return the estimator itself, so `model`
# is presumably the same object as `mlr`; the later cells use `mlr` directly.
model = mlr.fit(x_train, y_train)

In [14]:
# evaluate the model
# Predicted ages for the validation split. The scoring cell below calls mlr.score() directly
# and does not read `y_predict`.
y_predict = mlr.predict(x_val)

In [15]:
# Score the fitted regression on the training and validation splits
# (score() is the estimator's default metric — R^2 for scikit-learn regressors).
train_score = mlr.score(x_train, y_train)
val_score = mlr.score(x_val, y_val)

print("Train score: ", train_score, sep="\n")
print("Validation score: ", val_score, sep="\n")

# Interpretation: both scores are low but close to each other, which points to underfitting
# (high bias) rather than overfitting (high variance). These features may simply not carry
# enough signal to predict age well, so the next section tries a classification problem instead.

Train score: 
0.381156704012773
Validation score: 
0.39968286090379956


2. Can we predict whether or not someone has kids, given a set of features from their OKCupid profile? (Random Forest)

In [16]:
# Restrict the frame to the columns the random forest needs: seven predictors plus the
# `has_kids` target.
rf_columns = [
    'age',
    'drinks_code',
    'drugs_code',
    'ed_code',
    'smoke_code',
    'sex_code',
    'has_kids',
    'wants_kids',
]
rf_df = df[rf_columns]

In [17]:
# Record the (rows, columns) size BEFORE dropping missing values, for comparison with the
# size displayed after the dropna() cell that follows.
rf_df.shape

(59937, 8)

In [18]:
# Remove every row that has a missing value in any of the selected columns
# (axis=0 / how='any' are the pandas defaults, spelled out here for the reader).
rf_df = rf_df.dropna(axis=0, how='any')

In [19]:
# Size AFTER dropping rows with missing values; compare with the pre-dropna shape above.
rf_df.shape

(14380, 8)

In [20]:
# Predictors for the classifier: every selected column except the `has_kids` target.
rf_feature_columns = [
    'age',
    'drinks_code',
    'drugs_code',
    'ed_code',
    'smoke_code',
    'sex_code',
    'wants_kids',
]
rf_feature_data = rf_df[rf_feature_columns]

In [21]:
# define label
# The double brackets keep `has_kids` as a single-column DataFrame rather than a Series.
rf_label_data = rf_df[['has_kids']]

In [22]:
# Hold out 20% of the rows as a test set for the classifier; the fixed random_state keeps
# the partition reproducible.
rf_x_train, rf_x_test, rf_y_train, rf_y_test = train_test_split(
    rf_feature_data,
    rf_label_data,
    test_size=0.2,
    train_size=0.8,
    random_state=1,
)

In [23]:
# Build a 1000-tree random forest with a fixed seed; fitting happens in the next cell.
classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=1000,
)

In [24]:
# Fit the forest on the training split. `rf_y_train` is a single-column DataFrame, while
# scikit-learn classifiers expect y with shape (n_samples,); passing the column vector makes
# fit() emit a DataConversionWarning. Flattening to 1-D avoids the warning and leaves the
# fitted model unchanged.
classifier.fit(rf_x_train, rf_y_train.values.ravel())

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [25]:
# Mean accuracy of the forest on the held-out test split.
rf_accuracy = classifier.score(rf_x_test, rf_y_test)
print(rf_accuracy)
# Interpretation: about 83% of test profiles are classified correctly as having / not having
# kids. Accuracy alone can flatter a model when one class dominates, so read it together with
# the precision and recall reported in the cells below.

0.8320584144645341


In [26]:
# Predict has_kids for the test split, then report precision: of the profiles predicted to
# have kids, the fraction that actually do.
y_predict = classifier.predict(rf_x_test)
rf_precision = precision_score(rf_y_test, y_predict)

print(f"Precision: {rf_precision}")

Precision: 0.6828885400313972


In [27]:
# Recall: of the profiles that actually have kids, the fraction the forest identified.
rf_recall = recall_score(rf_y_test, y_predict)
print(f"Recall: {rf_recall}")

Recall: 0.6075418994413407
