In [1]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Set random seed
np.random.seed(0)



In [2]:
# Create an object called iris with the iris data
raw = pd.read_csv('../R_ML/r_data_18_35_55_scaled.csv')

# Create a dataframe with the four feature variables
df = pd.DataFrame(raw)

# View the top 5 rows
df.head()

Unnamed: 0,age_label,all_live,magazine,movie_goers,radio,social_media,supermarket,tablet_owner,tv,video_game
0,1,4,5,4,3,4,4,1,3,3
1,1,3,4,4,4,4,4,2,3,3
2,1,4,3,4,4,4,4,2,3,3
3,1,3,4,4,4,4,4,2,3,3
4,1,4,4,4,4,5,4,2,4,4


In [3]:
# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .75, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.

df['is_train'] = np.random.uniform(0, 1, len(df)) <= .70

# View the top 5 rows
df.head()

Unnamed: 0,age_label,all_live,magazine,movie_goers,radio,social_media,supermarket,tablet_owner,tv,video_game,is_train
0,1,4,5,4,3,4,4,1,3,3,True
1,1,3,4,4,4,4,4,2,3,3,False
2,1,4,3,4,4,4,4,2,3,3,True
3,1,3,4,4,4,4,4,2,3,3,True
4,1,4,4,4,4,5,4,2,4,4,True


In [4]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [5]:
train

Unnamed: 0,age_label,all_live,magazine,movie_goers,radio,social_media,supermarket,tablet_owner,tv,video_game,is_train
0,1,4,5,4,3,4,4,1,3,3,True
2,1,4,3,4,4,4,4,2,3,3,True
3,1,3,4,4,4,4,4,2,3,3,True
4,1,4,4,4,4,5,4,2,4,4,True
5,1,3,4,4,4,4,4,1,4,3,True
6,1,3,4,4,4,4,4,2,4,3,True
9,1,3,3,4,4,5,4,3,3,4,True
11,2,3,4,4,4,4,4,2,4,3,True
12,2,4,4,4,4,4,4,2,4,3,True
14,2,3,4,4,4,4,4,3,4,4,True


In [6]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 127
Number of observations in the test data: 48


In [7]:
# Create a list of the feature column's names
features = df.columns[1:]

# View features
features

Index(['all_live', 'magazine', 'movie_goers', 'radio', 'social_media',
       'supermarket', 'tablet_owner', 'tv', 'video_game', 'is_train'],
      dtype='object')

In [8]:
y = np.array(train.age_label)
y

array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
       3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3], dtype=int64)

In [9]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[['tv','social_media','magazine','all_live','video_game','tablet_owner']], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [10]:
test[['tv','social_media','magazine','all_live','video_game','tablet_owner']].head()

Unnamed: 0,tv,social_media,magazine,all_live,video_game,tablet_owner
1,3,4,4,3,3,2
7,3,4,4,4,3,2
8,4,4,3,3,3,2
10,4,4,4,3,3,2
13,4,4,4,3,3,3


In [11]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
preds = clf.predict(test[['tv','social_media','magazine','all_live','video_game','tablet_owner']])


In [12]:
clf.predict_proba(test[['tv','social_media','magazine','all_live','video_game','tablet_owner']])[0:5]

array([[ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.81844269,  0.18155731,  0.        ],
       [ 0.68186202,  0.31813798,  0.        ],
       [ 0.06818182,  0.43181818,  0.5       ]])

In [13]:
pd.crosstab(test['age_label'], preds, rownames=['Actual Generation'], colnames=['Predicted Generation'])

Predicted Generation,1,2,3
Actual Generation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11,1,0
2,2,19,1
3,0,1,13


In [14]:
test_array = [0.75,0.90,0.5,0.6,0.5]
pd.DataFrame(np.array([0.75,0.90,0.5,0.6,0.5]).reshape(1,5),columns=['tv','social_media','magazine','all_live','video_game','tablet_owner'])

ValueError: Shape of passed values is (5, 1), indices imply (6, 1)

In [None]:
TV = float(input('TV: '))
social_media = float(input('social media: '))
magazine = float(input('Magazine: '))
all_live = float(input('All_live: '))
video_game = float(input('video_game: '))
tablet_owner = float(input("tablet_owner: "))


In [None]:
testing_df = pd.DataFrame(np.array([TV,social_media,magazine,all_live,video_game,tablet_owner]).reshape(1,6),columns=['tv','social_media','magazine','all_live','video_game','tablet_owner'])
print('Chosen Levels: \ntv: {}\nsocial_media: {}\
        \nmagazine: {}\nall_live: {}\nvideo_game: {}\ntablet_owner: {}\
        \nPredicted Class:{}'.format(TV,social_media,magazine,all_live,video_game,tablet_owner,clf.predict(testing_df)[0]))

In [None]:
from sklearn.externals import joblib
joblib.dump(clf, 'scale_model_RF.pkl') 

In [None]:
# for i in range(0,6):
#     for j in range(0,6):
#         for k in range(0,6):
#             for l in range(0,6):
#                 for m in range(0,6):
#                     for n in range(0,6):
                        

In [27]:
x = {"tv":[1,2,3,4,5], "social_media":[1,2,3,4,5], "magazine":[1,2,3,4,5],\
     "all_live":[1,2,3,4,5], 'video_game':[1,2,3,4,5],'tablet_owner':[1,2,3,4,5]}

In [23]:
import itertools

In [29]:
input_df = pd.DataFrame(list(itertools.product(*x.values())), columns=x.keys())

In [None]:
# import pickle
# import math
# object_pi = math.pi
# file_pi = open('filename_pi.obj', 'w')
# pickle.dump(object_pi, file_pi)


In [32]:
# all possible result
Category = clf.predict(input_df[['tv','social_media','magazine','all_live','video_game','tablet_owner']])
len(Category)

15625

In [37]:
input_df['result']=Category
input_df['pref_ID']=''
input_df


Unnamed: 0,tv,social_media,magazine,all_live,video_game,tablet_owner,result,pref_ID
0,1,1,1,1,1,1,1,
1,1,1,1,1,1,2,1,
2,1,1,1,1,1,3,3,
3,1,1,1,1,1,4,3,
4,1,1,1,1,1,5,3,
5,1,1,1,1,2,1,1,
6,1,1,1,1,2,2,1,
7,1,1,1,1,2,3,1,
8,1,1,1,1,2,4,1,
9,1,1,1,1,2,5,1,


In [40]:
# for index, row in input_df.iterrows():
#     row[-1]=0
#     #row[-1]=str(row[0])+str(row[1])+str(row[2])+str(row[3])+str(row[4])+str(row[5])
# input_df

Unnamed: 0,tv,social_media,magazine,all_live,video_game,tablet_owner,result,pref_ID
0,1,1,1,1,1,1,1,
1,1,1,1,1,1,2,1,
2,1,1,1,1,1,3,3,
3,1,1,1,1,1,4,3,
4,1,1,1,1,1,5,3,
5,1,1,1,1,2,1,1,
6,1,1,1,1,2,2,1,
7,1,1,1,1,2,3,1,
8,1,1,1,1,2,4,1,
9,1,1,1,1,2,5,1,


In [42]:
input_df['pref_ID'] = input_df["tv"].map(str)+input_df["social_media"].map(str)+\
                        input_df["magazine"].map(str)+input_df["all_live"].map(str)+\
                        input_df["video_game"].map(str)+ input_df["tablet_owner"].map(str)
input_df

Unnamed: 0,tv,social_media,magazine,all_live,video_game,tablet_owner,result,pref_ID
0,1,1,1,1,1,1,1,111111
1,1,1,1,1,1,2,1,111112
2,1,1,1,1,1,3,3,111113
3,1,1,1,1,1,4,3,111114
4,1,1,1,1,1,5,3,111115
5,1,1,1,1,2,1,1,111121
6,1,1,1,1,2,2,1,111122
7,1,1,1,1,2,3,1,111123
8,1,1,1,1,2,4,1,111124
9,1,1,1,1,2,5,1,111125


In [43]:
input_df.to_csv('all_possible_prediction.csv',index=False)