In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

RANDOM_STATE = 1

In [113]:
from sklearn.preprocessing import LabelEncoder
import numpy as np


class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [66]:
dataset = pd.read_csv('./datasets/attis-ticket-1k.csv')

In [67]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
account_id    10000 non-null int64
country       10000 non-null object
city          10000 non-null object
platform      10000 non-null object
os            10000 non-null object
browser       10000 non-null object
domain_id     10000 non-null int64
group_id      10000 non-null int64
case          10000 non-null int64
dtypes: int64(4), object(5)
memory usage: 703.2+ KB


In [68]:
X = dataset.iloc[:,0:-1].values
Y = dataset.iloc[:,-1].values

In [115]:
label_features = [0,1,2,3,4,5,6,7]
label_features_obj = []
onehot_features = [0,1,2,3,4,5,6,7]


for column in label_features:
    labelencoder = LabelEncoderExt()
    labelencoder.fit(X[:, column])
    X[:, column] = labelencoder.transform(X[:, column])
    label_features_obj.append(labelencoder)
    
X[0]

ejecutar todo denuevo!



array([2., 2., 2., ..., 0., 0., 1.])

In [70]:
encoder = OneHotEncoder(handle_unknown='ignore', categories='auto')
X = encoder.fit_transform(X).toarray()
X.shape

(10000, 2842)

In [71]:
encoder.categories_

[array([0, 1, 2, ..., 2113, 2114, 2115], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
        70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
        87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
        142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
        155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
        

In [48]:
#scaler_x = StandardScaler()
#X = scaler_x.fit_transform(X)



In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = RANDOM_STATE)

In [78]:
# RANDOMFOREST Example
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
parameters = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 3, 10],
    'random_state': [RANDOM_STATE]
}

grid_search = GridSearchCV(estimator = classifier, 
                           param_grid = parameters, 
                           scoring = 'accuracy', 
                           cv = 10,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best parameter:", best_parameters)
print("Best Accuracy:", best_accuracy)

Best parameter: {'max_depth': 3, 'n_estimators': 100, 'random_state': 1}
Best Accuracy: 0.84


In [108]:
test = np.array([["169980427","Brazil","São Paulo","Mobile","Android ","Chrome ",2,9]], dtype=object) # 1 success

best_model = grid_search.best_estimator_ 

In [109]:
test[:, 0]

array(['169980427'], dtype=object)

In [112]:
i=0
for column in label_features:
    try:
        test[:, column] = label_features_obj[i].transform(test[:, column])
        i+=1
    except Exception as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)

An exception of type ValueError occurred. Arguments:
("y contains previously unseen labels: '169980427'",)
An exception of type ValueError occurred. Arguments:
("y contains previously unseen labels: 'Brazil'",)
An exception of type ValueError occurred. Arguments:
("y contains previously unseen labels: 'São Paulo'",)
An exception of type ValueError occurred. Arguments:
("y contains previously unseen labels: 'Mobile'",)
An exception of type ValueError occurred. Arguments:
("y contains previously unseen labels: 'Android '",)
An exception of type ValueError occurred. Arguments:
("y contains previously unseen labels: 'Chrome '",)
An exception of type ValueError occurred. Arguments:
('y contains previously unseen labels: 2',)
An exception of type ValueError occurred. Arguments:
('y contains previously unseen labels: 9',)


In [103]:
i=0
for column in label_features:
    try:
        test[:, column] = label_features_obj[i].fit_transform(test[:, column])
        i+=1
    except:
        label_features_obj[i].classes_ = np.append(le.classes_, test[:, column])

print(1, test)

test = encoder.transform(test).toarray()

print(2, test)

y_pred = best_model.predict(test)
print(3, y_pred)


1 [[0 0 0 0 0 0 0 0]]
2 [[1. 0. 0. ... 0. 0. 0.]]
3 [0]
