In [1]:
# Widen the notebook container and restyle rendered markdown cells.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; } div.text_cell_render { font-family: 'Human BBY Office'; font-size: 12pt; line-height: 145%;}</style>"))

import random
import numpy as np
import pandas as pd
# Notebook-wide pandas display settings: wide cells, every column, up to 300 rows.
for _option_name, _option_value in (
    ('display.max_colwidth', 1000),
    ('display.max_columns', None),
    ('display.max_rows', 300),
):
    pd.set_option(_option_name, _option_value)

import sklearn as sk
from sklearn import model_selection

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [2]:
def mutate_mock_row(row, target_type, stat_df):
    """Replace every statistic column of ``row`` with a random value near a type's mean.

    For each top-level column name of ``stat_df`` the ``'mean'`` and ``'std'``
    entries for ``target_type`` are looked up, and ``row[name]`` is overwritten
    with a uniform draw from ``[max(0, mean - std), mean + std]``.

    Parameters
    ----------
    row : mapping-like (e.g. pandas.Series)
        Modified in place; entries not named in ``stat_df`` are left alone.
    target_type : hashable
        Index label of ``stat_df`` whose statistics drive the draw.
    stat_df : pandas.DataFrame
        Frame with two-level columns ``(name, 'mean' | 'std')``.

    Returns
    -------
    The same ``row`` object that was passed in.
    """
    stat_names = stat_df.columns.levels[0].tolist()

    for name in stat_names:
        center = stat_df[name]['mean'].get(target_type)
        spread = stat_df[name]['std'].get(target_type)
        # The lower bound is clamped at zero so a draw is never negative.
        lower_bound = max(0, center - spread)
        upper_bound = center + spread
        row[name] = random.uniform(lower_bound, upper_bound)

    return row

In [3]:
def makeContextualBanditDataFrame(df, random_state=None):
    """Build a shuffled, synthetic contextual-bandit log from a labelled data set.

    Every input row yields one "real" record (its own feature values,
    ``TypeShown`` equal to its ``Type``, ``Reward`` 1). For each *other* type a
    coin flip decides whether a mock record is added; a mock record has
    ``TypeShown`` set to that other type, ``Reward`` 1 with probability 1/10
    (otherwise 0), and feature values re-drawn by ``mutate_mock_row`` from the
    statistics of the row's own ``Type``.

    Finally each record gets ``LikelihoodShown``, the share of all records that
    carry the same ``TypeShown``.

    Relies on two module-level names: the feature list ``variables`` and the
    helper ``mutate_mock_row``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Type`` column and every column named in ``variables``.
    random_state : int, optional
        When given, Python's global ``random`` generator (which drives the coin
        flips, the rewards and ``mutate_mock_row``) is seeded with it, and it is
        also passed to the final shuffle, making the result reproducible.
        When ``None`` (default) nothing is seeded.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables + ['TypeShown', 'Reward', 'LikelihoodShown']`` in
        random row order; index values are the pre-shuffle record positions.
    """
    feature_columns = list(variables)
    mock_columns = feature_columns + ['TypeShown', 'Reward']

    if random_state is not None:
        random.seed(random_state)

    # Per-type mean and sample standard deviation of every feature. String
    # aggregator names are used because passing np.mean / np.std is deprecated.
    stat_df = df.groupby('Type')[feature_columns].agg(['mean', 'std'])

    types = df.Type.unique().tolist()

    # Records are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for _, row in df.iterrows():
        true_type = row['Type']

        real_row = row[feature_columns].copy()
        real_row['TypeShown'] = true_type
        real_row['Reward'] = 1
        records.append(real_row.to_dict())

        for atype in types:
            if atype == true_type:
                continue

            # Only about half of the alternative types get a mock record.
            if random.randint(0, 1) == 0:
                continue

            # reward = 0 except 10% of the time choose 1
            mock_row = row[feature_columns].copy()
            mock_row['TypeShown'] = atype
            mock_row['Reward'] = 1 if random.randint(1, 10) == 1 else 0

            # adjust each column by a random number bound by [mean-sd, mean+sd]
            mock_row = mutate_mock_row(mock_row, true_type, stat_df)
            records.append(mock_row.to_dict())

    mock_df = pd.DataFrame(records, columns=mock_columns)

    # Add the empirical probability of each shown type to every record.
    percentages = mock_df['TypeShown'].value_counts(normalize=True).to_dict()
    mock_df['LikelihoodShown'] = mock_df['TypeShown'].map(percentages)

    return mock_df.sample(frac=1, random_state=random_state)

In [4]:
# Load the headerless glass data file and name its columns.
df = pd.read_csv('glass/glass.data', sep=",", header=None, encoding="ISO-8859-1")
df.columns = ['ID','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type']
# Feature columns; makeContextualBanditDataFrame reads this module-level list.
variables = ['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe']
y = df[['Type']]
X = df.drop(columns=['ID', 'Type'])
# DataFrame.drop returns a new frame, so the result has to be assigned;
# the bare call previously left the ID column in place.
df = df.drop(columns=['ID'])

mock_df = makeContextualBanditDataFrame(df)
mock_df[['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','TypeShown','Reward']]

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,TypeShown,Reward
362,1.520784,13.714309,3.936327,1.402890,72.914540,0.675868,8.507204,0.224389,0.179530,A,0
316,1.515422,13.532184,3.418392,1.404975,73.048032,0.342012,9.698605,0.049188,0.154640,D,0
474,1.518825,13.081642,2.563471,1.243806,72.710322,0.479522,7.985073,0.314893,0.139057,D,0
678,0.638774,14.627663,0.000000,2.043592,74.519520,0.000000,8.375052,1.731712,2.082989,G,0
273,1.520005,13.475909,3.503763,1.358366,72.387720,0.308017,9.341953,0.037611,0.036988,G,0
788,1.517309,14.502847,0.077631,1.729264,72.420118,0.643375,7.610087,0.969620,0.013430,E,0
585,1.517346,13.051771,3.401698,1.435693,72.094157,0.278212,9.004904,0.031172,0.149596,F,0
205,1.520330,13.002035,3.403156,0.963867,73.047197,0.514296,8.353850,0.011431,0.059456,C,0
23,1.518144,13.150725,3.618186,1.024905,72.632774,0.473955,9.078264,0.078035,0.107625,F,0
300,1.519614,12.830521,3.426810,1.707000,72.338240,0.330778,8.102468,0.155558,0.160848,C,0


In [7]:
# Write mock_df to disk; the frame's index is stored under the column label 'ID'.
mock_df.to_csv(
    path_or_buf='glass/automatedPersonalization.data',
    index_label='ID',
)

In [7]:
# Disabled experiment, kept for reference: it would add a `costs` column equal to
# the negated reward divided by the row's LikelihoodShown (0 when the reward is 0).
# NOTE(review): the saved output under this cell shows a `weights` column rather
# than `costs`, so it appears to come from an earlier revision — confirm before
# re-enabling.
#mock_size = mock_df.shape[0]
#mock_df['costs'] = mock_df.apply(lambda row: row.Reward * -1.0 / row.LikelihoodShown if row.Reward != 0 else 0, axis=1)
#mock_df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,TypeShown,Reward,LikelihoodShown,weights
0,1.521010,13.640000,4.490000,1.100000,71.780000,0.060000,8.750000,0.000000,0.000000,A,1,0.169077,-5.914474
1,1.516779,12.994875,3.371144,0.992187,72.238868,0.612302,9.082342,0.093105,0.058779,B,0,0.173526,0.000000
2,1.519329,13.645154,3.347208,1.096363,72.468493,0.436246,8.349419,0.066603,0.000559,C,0,0.129032,0.000000
3,1.517671,13.428218,3.721861,1.324047,72.475083,0.258079,8.609824,0.028296,0.004015,F,0,0.126808,0.000000
4,1.517610,13.890000,3.600000,1.360000,72.730000,0.480000,7.830000,0.000000,0.000000,A,1,0.169077,-5.914474
5,1.517372,13.471059,3.760553,1.262290,72.701776,0.569941,8.865924,0.037711,0.061372,B,0,0.173526,0.000000
6,1.519062,13.656749,3.452648,1.422111,72.265370,0.512465,8.674903,0.085414,0.088809,C,0,0.129032,0.000000
7,1.520457,12.830342,3.711100,1.259785,72.369072,0.642056,8.916723,0.082846,0.051690,D,0,0.131257,0.000000
8,1.517284,12.955375,3.389746,0.994749,72.502170,0.624487,9.201865,0.087157,0.037455,F,0,0.126808,0.000000
9,1.516180,13.530000,3.550000,1.540000,72.990000,0.390000,7.780000,0.000000,0.000000,A,1,0.169077,-5.914474


In [34]:
# Class weights for the cost-sensitive classifier: for each shown type,
# (total number of records) / (number of records showing that type).
shown_counts = mock_df.groupby('TypeShown').agg({'TypeShown': 'count'})
agg_percentages = shown_counts.apply(lambda counts: float(counts.sum()) / counts)
weights = agg_percentages['TypeShown'].to_dict()
weights

{'A': 6.152777777777778,
 'B': 6.110344827586207,
 'C': 7.637931034482759,
 'D': 7.145161290322581,
 'E': 7.572649572649572,
 'F': 8.280373831775702,
 'G': 6.661654135338346}

In [24]:
# Keep only the records that earned a reward, then split them into features and
# label (the shown type) with a stratified 80/20 train/test split.
is_rewarded = mock_df.Reward == 1
reward_df = mock_df.loc[is_rewarded]
y = reward_df[['TypeShown']]
X = reward_df.drop(columns=['Reward', 'TypeShown', 'LikelihoodShown'])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [25]:
# Fit a linear-kernel SVC that uses `weights` as per-class weights, then report
# its accuracy on the held-out split.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()

costSensitiveClassifier = SVC(kernel='linear', class_weight=weights)
costSensitiveClassifier.fit(X_train, train_labels)

clf_score = costSensitiveClassifier.score(X_test, test_labels)
score_text = '{0:.1f}%'.format(clf_score * 100)
print("Cost Sensitive SV Classifier recipe correct percentage: " + score_text)

Cost Sensitive SV Classifier recipe correct percentage: 71.1%


In [26]:
# Try data with new inputs resembling Type F
# NOTE(review): the nine values are presumably in the same order as the training
# feature columns (RI, Na, Mg, Al, Si, K, Ca, Ba, Fe) — confirm against X_train.
X_random = np.array([[1.50,14.45,2.22,1.61,72.2,0.00,9.2,0.00,0.00]])
# The model queried here is the cost-sensitive SVC, so the message names it
# (it previously said "Decision Tree").
print("Cost Sensitive SV Classifier recipe predicts this data is from type: " + str(costSensitiveClassifier.predict(X_random)[0]))

Decision Tree recipe predicts this data is from type: F
