## Import Libraries & Data 

In [1]:
import re
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
import seaborn as sns
%xmode minimal

# import data into dataframes
gen_none = pd.read_csv('data/gen_none.csv')
gen_low = pd.read_csv('data/gen_low.csv') #, encoding='ISO-8859-1')
gen_high = pd.read_csv('data/gen_high.csv')

# define RSME function 
def rmse(predicted, actual):
    return np.sqrt(np.mean((actual - predicted)**2))

Exception reporting mode: Minimal


## Data Cleaning

In [2]:
# drop name columns
gen_none.drop(columns=["name"]);
gen_low.drop(columns=["name"]);
gen_high.drop(columns=["name"]);

gen_none["sidebar_color"] = gen_none["sidebar_color"].astype(str)

# convert hex code colors to base 10 values
gen_none.loc[gen_none["sidebar_color"].str.contains("\+", regex=True), "sidebar_color"] = "0"
gen_none.loc[gen_none["sidebar_color"].str.count("\w") != 6, "sidebar_color"] = "0"
gen_none["sidebar_color"] = gen_none["sidebar_color"].astype(str).apply(int, base=16)

# convert color bins to numeric values
gen_low.loc[gen_low["sidebar_color"] == "other", "sidebar_color"] = 0
gen_low.loc[gen_low["sidebar_color"] == "blue", "sidebar_color"] = 1
gen_low.loc[gen_low["sidebar_color"] == "white", "sidebar_color"] = 2

# convert color bins to numeric values
gen_high.loc[gen_high["sidebar_color"] == "other", "sidebar_color"] = 0
gen_high.loc[gen_high["sidebar_color"] == "blue", "sidebar_color"] = 1

# Low Generalization

In [4]:
# train model 
train, test = train_test_split(gen_low, train_size=0.8)
linear_model = lm.LinearRegression(fit_intercept=True)
X_train = train[["description", "fav_number", "tweet_count", "sidebar_color"]] 
y_train = train["gender"]

X_test = test[["description", "fav_number", "tweet_count", "sidebar_color"]] 
y_test = test["gender"]

In [5]:
# fit model 
linear_model.fit(X_train, y_train)
y_fitted = linear_model.predict(X_train)
y_predicted = linear_model.predict(X_test)

In [6]:
# training and validation errors for model
training_error = rmse(y_fitted, y_train)
validation_error = rmse(y_predicted, y_test)

theta0 = linear_model.intercept_
theta1, theta2, theta3, theta4 = linear_model.coef_

### Results:

In [7]:
print("Low Gen Model\nTraining RMSE: {}\nValidation RMSE: {}\n".format(training_error, validation_error))
print("Low Gen Model\nθ_0: {}\nθ_1: {}\nθ_2: {}\nθ_3: {}\nθ_4: {}".format(theta0, theta1, theta2, theta3, theta4))

Low Gen Model
Training RMSE: 0.49338750598881753
Validation RMSE: 0.49308401640992805

Low Gen Model
θ_0: 0.5567898017983609
θ_1: -0.05647142249573999
θ_2: 0.04366222676906856
θ_3: -0.005105898881336057
θ_4: -0.010940951667887455


# High Generalization

In [8]:
# train model 
train2, test2 = train_test_split(gen_high, train_size=0.8)
linear_model2 = lm.LinearRegression(fit_intercept=True)
X_train2 = train2[["description", "fav_number", "tweet_count", "sidebar_color"]] 
y_train2 = train2["gender"]

X_test2 = test2[["description", "fav_number", "tweet_count", "sidebar_color"]] 
y_test2 = test2["gender"]

In [9]:
# fit model 
linear_model2.fit(X_train2, y_train2)
y_fitted2 = linear_model2.predict(X_train2)
y_predicted2 = linear_model2.predict(X_test2)

In [10]:
# training and validation errors for model
training_error2 = rmse(y_fitted2, y_train2)
validation_error2 = rmse(y_predicted2, y_test2)

theta2_0 = linear_model2.intercept_
theta2_1, theta2_2, theta2_3, theta2_4 = linear_model2.coef_

### Results:

In [11]:
print("High Gen Model\nTraining RMSE: {}\nValidation RMSE: {}\n".format(training_error2, validation_error2))
print("High Gen Model\nθ_0: {}\nθ_1: {}\nθ_2: {}\nθ_3: {}\nθ_4: {}".format(theta2_0, theta2_1, theta2_2, theta2_3, theta2_4))

High Gen Model
Training RMSE: 0.4917101180253689
Validation RMSE: 0.49546313842886863

High Gen Model
θ_0: 0.5865798521756029
θ_1: -0.1331294795167198
θ_2: 0.09024443971092089
θ_3: -0.0258923332622665
θ_4: -0.08003208729472291


# No Generalization

In [12]:
# train model 
train3, test3 = train_test_split(gen_none, train_size=0.8)
linear_model3 = lm.LinearRegression(fit_intercept=True)
X_train3 = train3[["description", "fav_number", "tweet_count", "sidebar_color"]] 
y_train3 = train3["gender"]

X_test3 = test3[["description", "fav_number", "tweet_count", "sidebar_color"]] 
y_test3 = test3["gender"]

In [13]:
# fit model 
linear_model3.fit(X_train3, y_train3)
y_fitted3 = linear_model3.predict(X_train3)
y_predicted3 = linear_model3.predict(X_test3)

In [14]:
# training and validation errors for model
training_error3 = rmse(y_fitted3, y_train3)
validation_error3 = rmse(y_predicted3, y_test3)

theta3_0 = linear_model3.intercept_
theta3_1, theta3_2, theta3_3, theta3_4 = linear_model3.coef_

In [17]:
print("No Gen Model\nTraining RMSE: {}\nValidation RMSE: {}\n".format(training_error3, validation_error3))
print("No Gen Model\nθ_0: {}\nθ_1: {}\nθ_2: {}\nθ_3: {}\nθ_4: {}".format(theta3_0, theta3_1, theta3_2, theta3_3, theta3_4))

No Gen Model
Training RMSE: 0.49504904624997764
Validation RMSE: 0.49568135870372076

No Gen Model
θ_0: 0.6037707577173775
θ_1: -0.0011196104347496786
θ_2: 2.174453038679891e-06
θ_3: -2.2060094430893572e-07
θ_4: -1.0152486442641776e-09


In [16]:
print("No Gen Model\nTraining RMSE: {}\nValidation RMSE: {}\n".format(training_error3, validation_error3))

print("Low Gen Model\nTraining RMSE: {}\nValidation RMSE: {}\n".format(training_error, validation_error))

print("High Gen Model\nTraining RMSE: {}\nValidation RMSE: {}\n".format(training_error2, validation_error2))


No Gen Model
Training RMSE: 0.49504904624997764
Validation RMSE: 0.49568135870372076

Low Gen Model
Training RMSE: 0.49338750598881753
Validation RMSE: 0.49308401640992805

High Gen Model
Training RMSE: 0.4917101180253689
Validation RMSE: 0.49546313842886863

