In [1]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Set random seed
np.random.seed(0)

In [2]:
# Create an object called iris with the iris data
raw = pd.read_csv('../R_ML/r_data_18_35_55.csv')

# Create a dataframe with the four feature variables
df = pd.DataFrame(raw)

# View the top 5 rows
df.head()

Unnamed: 0,age_label,all_live,magazine,movie_goers,radio,social_media,supermarket,tablet_owner,tv,video_game
0,1,0.599194,1.0,1.0,0.661694,0.942742,0.971371,0.173387,0.745968,0.719355
1,1,0.413265,0.755102,1.0,0.67602,0.936224,0.946429,0.280612,0.729592,0.658163
2,1,0.690476,0.712121,1.0,0.670996,0.942641,0.984848,0.428571,0.642857,0.601732
3,1,0.569312,0.803175,1.0,0.784656,0.981481,0.962963,0.358201,0.671958,0.640212
4,1,0.690476,0.828571,1.0,0.690476,1.0,0.966667,0.342857,0.861905,0.928571


In [6]:
# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .75, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.

df['is_train'] = np.random.uniform(0, 1, len(df)) <= .70

# View the top 5 rows
df.head()

Unnamed: 0,age_label,all_live,magazine,movie_goers,radio,social_media,supermarket,tablet_owner,tv,video_game,is_train
0,1,0.599194,1.0,1.0,0.661694,0.942742,0.971371,0.173387,0.745968,0.719355,True
1,1,0.413265,0.755102,1.0,0.67602,0.936224,0.946429,0.280612,0.729592,0.658163,True
2,1,0.690476,0.712121,1.0,0.670996,0.942641,0.984848,0.428571,0.642857,0.601732,False
3,1,0.569312,0.803175,1.0,0.784656,0.981481,0.962963,0.358201,0.671958,0.640212,False
4,1,0.690476,0.828571,1.0,0.690476,1.0,0.966667,0.342857,0.861905,0.928571,True


In [7]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [17]:
train

Unnamed: 0,age_label,all_live,magazine,movie_goers,radio,social_media,supermarket,tablet_owner,tv,video_game,is_train
0,1,0.599194,1.000000,1.000000,0.661694,0.942742,0.971371,0.173387,0.745968,0.719355,True
1,1,0.413265,0.755102,1.000000,0.676020,0.936224,0.946429,0.280612,0.729592,0.658163,True
4,1,0.690476,0.828571,1.000000,0.690476,1.000000,0.966667,0.342857,0.861905,0.928571,True
5,1,0.479855,0.973543,1.000000,0.800161,0.930432,1.000000,0.269272,0.817754,0.677814,True
6,1,0.507843,0.793464,1.000000,0.819935,0.979085,0.990196,0.473856,0.833987,0.676797,True
7,1,0.771226,0.762242,1.000000,0.733491,0.990566,1.000000,0.524034,0.736972,0.641734,True
8,1,0.505626,0.733755,1.000000,0.760619,0.955556,1.000000,0.449930,0.754290,0.626160,True
10,2,0.447717,0.970764,1.000000,0.790440,0.855954,0.984848,0.357341,0.913359,0.610222,True
11,2,0.556277,0.861472,1.000000,0.770996,0.896970,0.987013,0.498701,0.861905,0.613853,True
12,2,0.683454,0.794423,1.000000,0.793356,0.925452,1.000000,0.541400,0.819676,0.544174,True


In [8]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 127
Number of observations in the test data: 48


In [9]:
# Create a list of the feature column's names
features = df.columns[1:]

# View features
features

Index(['all_live', 'magazine', 'movie_goers', 'radio', 'social_media',
       'supermarket', 'tablet_owner', 'tv', 'video_game', 'is_train'],
      dtype='object')

In [18]:
y = np.array(train.age_label)
y

array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 3, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3], dtype=int64)

In [19]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[['tv','social_media','radio','video_game','tablet_owner']], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
preds = clf.predict(test[['tv','social_media','radio','video_game','tablet_owner']])


In [23]:
clf.predict_proba(test[['tv','social_media','radio','video_game','tablet_owner']])[0:]

array([[ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0.5,  0.5,  0. ],
       [ 0. ,  0.9,  0.1],
       [ 0. ,  0.9,  0.1],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0.1,  0.8,  0.1],
       [ 0. ,  0. ,  1. ],
       [ 0.1,  0.1,  0.8],
       [ 1. ,  0. ,  0. ],
       [ 0.8,  0.2,  0. ],
       [ 0.7,  0.3,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0.5,  0.5],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0.9,  0.1,  0. ],
       [ 0.8,  0.2,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0.1,  0.9,  0. ],
       [ 0.2,  0.8,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.2,  0.8],
 

In [25]:
pd.crosstab(test['age_label'], preds, rownames=['Actual Generation'], colnames=['Predicted Generation'])

Predicted Generation,1,2,3
Actual Generation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,22,0,0
2,1,16,0
3,0,0,9


In [None]:
clf.predict([])