In [1]:
import random
import numpy as np
import pandas as pd

In [4]:
# Generate clustering data
# Every e-mail is a bag of phrases drawn from a single topic vocabulary, which
# makes the corpus super easy to cluster -- good enough for demo purposes.

topics = [
    # topics[0]: only ever used by the authors in `cc_authors` (see the loop below)
    ['crime coefficient', 'criminal', 'formula', 'function',
     'compute', 'criminals', 'crime', 'crimes', 'algorithm', 'policy',
     'police', 'genetic', 'criminality', 'profile', 'heart rate', 'friends',
     'arrest', 'data', 'datapoint', 'datapoints', 'bias', 'threat', 'menace', 'society',
     'threatening', 'danger', 'those people', 'legal', 'convict', 'public', 'safety'],
    # topics[1]
    ['meeting', 'press release', 'speech', 'presentation', 'schedule', 'present',
     'public', 'talk', 'keynote', 'journalists', 'memo', 'points', 'agenda', 'people',
     'support', 'appearance', 'media', 'tv', 'radio', 'news', 'image'],
    # topics[2]
    ['dinner', 'meal', 'cook', 'order', 'take out', 'lunch', 'drinks', 'menu'],
    # topics[3]
    ['candidacy', 'election', 'campaign', 'campaigning', 'points', 'polls', 'ahead',
     'behind', 'politician', 'politics', 'finance', 'fund', 'pac', 'vote', 'voters',
     'citizens', 'voting', 'voter', 'appeal'],
    # topics[4]
    ['scandal', 'leak', 'threat', 'derail', 'cover up', 'news', 'journalists', 'whistleblower',
     'take care of it', 'fire', 'embarassment', 'danger', 'undermine', 'image', 'appeal'],
]

# Names generated with http://www.behindthename.com/random/
authors = [
    'Torkel Whitney',
    'Sheba Mate',
    'Sabine Finnian',
    'Dustin Randell',
    'Sebastian Jaiden',
    'Izabelle Rene',
    'Harrison Chip',
    'Francine Loyd',
    'Reid Kiley',
    'Ian Upton',
]
# A random half of the authors; e-mails about topics[0] are attributed to them only.
cc_authors = random.sample(authors, 5)

n = 10000
docs = []

for i in range(n):
    # About 5% of the e-mails use topics[0]; the rest pick one of the other
    # topics uniformly and may come from any author.
    # The draw order (coin flip, topic, author, length, phrases) is the same
    # on every iteration.
    about_crime_coef = random.random() < 0.05
    topic = topics[0] if about_crime_coef else random.choice(topics[1:])
    author = random.choice(cc_authors if about_crime_coef else authors)

    # 10 to 30 phrases (both bounds inclusive), sampled with repetition.
    doc_len = random.randint(10, 30)
    doc = [random.choice(topic) for _ in range(doc_len)]
    docs.append(dict(author=author, email=' '.join(doc)))

df = pd.DataFrame(docs)
df.to_csv('emails.csv', index=False)
df.head()

Unnamed: 0,author,email
0,Sebastian Jaiden,fire threat news derail image derail cover up ...
1,Francine Loyd,people people support presentation presentatio...
2,Harrison Chip,threat fire scandal undermine undermine danger...
3,Sebastian Jaiden,press release support public present journalis...
4,Harrison Chip,danger image cover up whistleblower whistleblo...


In [13]:
# Generate regression data

# Ground-truth linear weights, one per feature column, in this order:
# avg heart rate, criminal relative, # Facebook friends, age, income (in thousands)
true_params = np.array([3, 120, -0.4, 0, -0.1])

def f(X):
    """
    The true (linear) function, with some Gaussian noise added.

    X is a 2-D array with one row per sample and one column per entry of
    `true_params`. Returns a 1-D array holding, for every row, the weighted
    sum of its features plus an independent Normal(0, 8) draw.
    """
    signal = np.dot(true_params, X.T)
    noise = np.random.normal(0, 8, X.shape[0])
    return signal + noise


"""
Simulate data

Features:
- current heart rate
- has criminal relative
- number of Facebook friends
- age
- income (in thousands)
"""
n_samples = 10000

# One (n_samples, 1) column per feature, stacked side by side:
#   0. heart rate             ~ Normal(70, 10)
#   1. has criminal relative  ~ Bernoulli(0.2)
#   2. # Facebook friends     ~ Binomial(600, 0.7)
#   3. age                    ~ Normal(35, 10)
#   4. income (in thousands)  ~ LogNormal(0, 1) + 10
X = np.hstack([
        np.random.normal(70, 10, (n_samples, 1)),
        np.random.binomial(1, 0.2, (n_samples, 1)),
        np.random.binomial(600, 0.7, (n_samples, 1)),
        np.random.normal(35, 10, (n_samples, 1)),
        np.random.lognormal(0, 1, (n_samples, 1)) + 10,
    ])

# 20% of the rows get a lower friend count, redrawn from Binomial(600, 0.5)
# instead of Binomial(600, 0.7).
n_friendless = int(n_samples * 0.2)
# Sample the rows WITHOUT replacement. np.random.choice defaults to
# replace=True, which repeats indices, so fewer than n_friendless distinct
# rows would actually be overwritten.
friendless_rows = np.random.choice(X.shape[0], n_friendless, replace=False)
X[friendless_rows, 2] = np.random.binomial(600, 0.5, (n_friendless,))

# Compute true values: f returns one value per row of X; reshape it into a
# single column so it can sit next to the features.
y = f(X)
crime_coef = y.reshape(-1, 1)



# Format wrangling: features followed by the target, written without the index.
data = np.column_stack([X, crime_coef])
df = pd.DataFrame(
    data,
    columns=[
        'heart_rate',
        'has_criminal_relative',
        'num_facebook_friends',
        'age',
        'income',
        'crime_coef',
    ],
)
df.to_csv('crimecoef.csv', index=False)
df.head()

Unnamed: 0,heart_rate,has_criminal_relative,num_facebook_friends,age,income,crime_coef
0,66.996933,0.0,435.0,17.453327,10.641069,15.077597
1,62.273825,0.0,417.0,30.562607,15.581992,17.715259
2,70.472895,1.0,430.0,30.837655,12.025393,169.457692
3,54.795384,0.0,303.0,31.684339,11.839258,41.40488
4,84.996298,0.0,288.0,27.302402,11.081711,133.992943


In [12]:
# Generate regression data

def f(X):
    """
    Noise-free nonlinear ground truth: 10 * (10 * criminal_relatives) ** 2.

    X is a 2-D array with exactly four columns (heart rate, criminal
    relatives, friends, age); unpacking fails for any other column count.
    Only the second column influences the result. Returns a 1-D array with
    one value per row.
    """
    _heartrate, criminal_relatives, _friends, _age = X.T
    scaled = 10 * criminal_relatives
    return 10 * scaled ** 2
    

"""
Simulate data

Features:
- heartrate
- percent criminal relative
- number of Facebook friends
- age
"""
n_samples = 10000

# Each feature is drawn as an (n_samples, 1) column; the draws happen in
# column order and the columns are then joined side by side.
_col_shape = (n_samples, 1)
X = np.hstack([
    np.random.normal(loc=70, scale=10, size=_col_shape),   # heart rate
    np.random.beta(a=2, b=10, size=_col_shape),            # criminal relative
    np.random.binomial(n=600, p=0.7, size=_col_shape),     # Facebook friends
    np.random.normal(loc=35, scale=10, size=_col_shape),   # age
])

# Compute true values: one value per row of X, reshaped into a single column.
y = f(X)
crime_coef = y.reshape(-1, 1)

# Format wrangling: features followed by the target, written without the index.
data = np.column_stack([X, crime_coef])
df = pd.DataFrame(
    data,
    columns=[
        'heart_rate',
        'criminal_relative',
        'friends',
        'age',
        'crime_coef',
    ],
)
df.to_csv('crimecoef_nonlinear.csv', index=False)
df.head()

Unnamed: 0,heart_rate,criminal_relative,friends,age,crime_coef
0,71.35286,0.032889,414.0,43.039801,1.081684
1,49.609125,0.10091,423.0,30.572606,10.182877
2,52.709426,0.041856,417.0,36.024089,1.751955
3,74.092361,0.156625,408.0,55.548839,24.531254
4,75.067936,0.158631,416.0,44.582839,25.163919


In [58]:
# Generate classification data

# The crime coefficients are now the predictors.
# NOTE(review): `crime_coef` and `n_samples` are not defined in this cell; they
# are whatever a previously executed regression cell left in the kernel.
# The label below is drawn independently of both predictors.
_col_shape = (n_samples, 1)
criminal_behavior = np.random.binomial(1, 0.2, size=_col_shape)
hue = 255 * np.random.beta(2, 5, size=_col_shape)
data = np.column_stack([crime_coef, hue, criminal_behavior])

df = pd.DataFrame(
    data,
    columns=[
        'crime_coef',
        'hue',
        'criminal_behavior',
    ],
)
df.to_csv('criminalbehavior.csv', index=False)
df.head()

Unnamed: 0,crime_coef,hue,criminal_behavior
0,10.099342,62.973679,0
1,49.455816,15.848373,0
2,39.200911,16.542826,1
3,39.941107,86.619647,1
4,29.95706,32.906168,1


In [None]:
# Deer regression data
# True function, w/ some noise
def f(x):
    """
    Noisy linear ground truth: 1.8 * x + 50, plus Normal(0, 12) noise.

    x is an array whose first dimension gives the number of noise draws
    (one per element for a 1-D input). Returns an array of the same length.
    """
    noise = np.random.normal(0, 12, x.shape[0])
    return 1.8 * x + noise + 50

# 100 evenly spaced inputs from 50 to 150 (inclusive) and their noisy responses.
xs = np.linspace(50, 150, num=100)
ys = f(xs)

# Two columns: the inputs are saved as 'Weight', the responses as 'Height'.
data = np.column_stack([xs, ys])
df = pd.DataFrame(data, columns=['Weight', 'Height'])
df.to_csv('deer_data.csv', index=False)

In [49]:
# Deer classification data
from sklearn.datasets import make_classification

# 100 samples, two informative features, two classes with one cluster each.
data, labels = make_classification(
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2,
    shift=20,
    scale=10,
)

# Append the class label as a third column next to the two features.
data = np.column_stack([data, labels])
df = pd.DataFrame(data, columns=['Weight', 'Height', 'Species'])
df.to_csv('deer_species_data.csv', index=False)
df.head()

Unnamed: 0,Weight,Height,Species
0,188.69308,220.523927,1
1,218.089266,173.806123,0
2,185.550693,218.227736,1
3,183.831915,220.421194,1
4,167.150952,216.111407,1


In [55]:
# Deer clustering data
from sklearn.datasets import make_blobs

# 100 two-feature points around 3 centers; centers are drawn from the
# (150, 250) box and each cluster has standard deviation 5.
# The returned labels are not written to the CSV.
data, labels = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    center_box=(150, 250),
    cluster_std=5,
)
df = pd.DataFrame(data, columns=['Weight', 'Height'])
df.to_csv('deer_cluster_data.csv', index=False)
df.head()

Unnamed: 0,Weight,Height
0,214.586835,168.378228
1,247.17177,196.529935
2,216.964749,169.599983
3,240.21113,196.495348
4,185.683348,160.969367
