<a href="https://colab.research.google.com/github/farisazizy/Artificial-Intelligence/blob/machine-learning/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Import Library
%tensorflow_version 2.x

# tensorflow hub
import tensorflow_hub as hub
# tensor flow module
import tensorflow as tf
import tensorflow_probability as tfp

# matplotlib
from matplotlib import colors
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np

## Data Cleaning

In [4]:
train = pd.read_csv("kendaraan_train.csv")
test = pd.read_csv("kendaraan_test.csv")

# Drop every row that contains a null value and renumber the remaining rows
# 0..n-1. The steps are chained (no inplace=True) so that each one works on a
# fresh frame; the in-place drop on the result of dropna() is what produced
# the SettingWithCopyWarning in this cell's output.
train = train.dropna().drop(columns=["id"]).reset_index(drop=True)
# The test frame is re-indexed as well: later cells look labels up by position
# (test_y[count]), which requires a contiguous 0..n-1 index after dropna().
# NOTE(review): no "id" column is dropped from test; the preview suggests the
# test CSV has none -- confirm.
test = test.dropna().reset_index(drop=True)

print(train.head())
print(test.head())

  Jenis_Kelamin  Umur  SIM  ...  Kanal_Penjualan  Lama_Berlangganan Tertarik
0        Wanita  30.0  1.0  ...            152.0               97.0      0.0
1          Pria  48.0  1.0  ...             29.0              158.0      0.0
2        Wanita  58.0  1.0  ...            124.0               63.0      0.0
3          Pria  21.0  1.0  ...            152.0              171.0      0.0
4        Wanita  20.0  1.0  ...            160.0               31.0      0.0

[5 rows x 11 columns]
  Jenis_Kelamin  Umur  SIM  ...  Kanal_Penjualan  Lama_Berlangganan Tertarik
0        Wanita    49    1  ...               26                145        0
1          Pria    22    1  ...              152                241        0
2          Pria    24    1  ...              152                 62        0
3          Pria    46    1  ...              124                 34        0
4          Pria    35    1  ...              152                229        0

[5 rows x 11 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
CATEGORICAL_COLUMNS = ['Jenis_Kelamin', 'Umur_Kendaraan', 'Kendaraan_Rusak']

# One vocabulary-list categorical column per string-valued feature; the
# vocabulary is the set of distinct values that feature takes in `train`.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, train[column_name].unique())
    for column_name in CATEGORICAL_COLUMNS
]

print(feature_columns)

[VocabularyListCategoricalColumn(key='Jenis_Kelamin', vocabulary_list=('Wanita', 'Pria'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='Umur_Kendaraan', vocabulary_list=('< 1 Tahun', '> 2 Tahun', '1-2 Tahun'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='Kendaraan_Rusak', vocabulary_list=('Tidak', 'Pernah'), dtype=tf.string, default_value=-1, num_oov_buckets=0)]


In [6]:
# Both column groups are selected by dtype before anything is converted, so
# `category_column` only contains columns that were already categorical.
category_column = train.select_dtypes(['category']).columns
object_column = train.select_dtypes(['object']).columns

# String columns: cast to pandas categoricals and keep only the integer codes.
train[object_column] = train[object_column].apply(
    lambda col: col.astype('category').cat.codes)
# Columns that were categorical from the start are replaced by their codes too.
train[category_column] = train[category_column].apply(lambda col: col.cat.codes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [7]:
# Same encoding as for the training frame, applied to `test`.
# NOTE(review): the codes are derived from the values present in `test` alone;
# they match the training codes only if both frames contain the same set of
# values in every encoded column -- confirm.
category_column = test.select_dtypes(['category']).columns
object_column = test.select_dtypes(['object']).columns

# String columns: cast to pandas categoricals and keep only the integer codes.
test[object_column] = test[object_column].apply(
    lambda col: col.astype('category').cat.codes)
# Columns that were categorical from the start are replaced by their codes too.
test[category_column] = test[category_column].apply(lambda col: col.cat.codes)

In [8]:
# Preview the training frame after encoding: the former string columns now
# hold integer category codes.
train.head()

Unnamed: 0,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
0,1,30.0,1.0,33.0,1.0,1,1,28029.0,152.0,97.0,0.0
1,0,48.0,1.0,39.0,0.0,2,0,25800.0,29.0,158.0,0.0
2,1,58.0,1.0,48.0,0.0,0,1,2630.0,124.0,63.0,0.0
3,0,21.0,1.0,35.0,1.0,1,1,22735.0,152.0,171.0,0.0
4,1,20.0,1.0,8.0,1.0,1,1,30786.0,160.0,31.0,0.0


In [9]:
# Separate the target from the features. pop() removes the 'Tertarik' column
# from each frame in place and returns it, so this cell cannot be run a second
# time without reloading the data.
train_y = train.pop('Tertarik').astype(int)
test_y = test.pop('Tertarik').astype(int)
train.head() # The 'Tertarik' column has been removed; only feature columns remain

Unnamed: 0,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan
0,1,30.0,1.0,33.0,1.0,1,1,28029.0,152.0,97.0
1,0,48.0,1.0,39.0,0.0,2,0,25800.0,29.0,158.0
2,1,58.0,1.0,48.0,0.0,0,1,2630.0,124.0,63.0
3,0,21.0,1.0,35.0,1.0,1,1,22735.0,152.0,171.0
4,1,20.0,1.0,8.0,1.0,1,1,30786.0,160.0,31.0


# Preparing Algorithm

### Input Function

In [10]:
def input_fn(features, labels, training=True, batch_size=256):
    """Build a batched ``tf.data.Dataset`` of (feature-dict, label) pairs.

    Args:
        features: Mapping-like object (e.g. a DataFrame) that ``dict()`` can
            turn into a ``{column name: values}`` dictionary.
        labels: Label values, one per example.
        training: When true, the examples are shuffled with a buffer of 1000
            and repeated indefinitely; otherwise they are emitted once, in order.
        batch_size: Number of examples per batch.

    Returns:
        The batched dataset.
    """
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Evaluation mode: a single ordered pass over the data.
    if not training:
        return examples.batch(batch_size)

    # Training mode: shuffle, then repeat without end before batching.
    return examples.shuffle(1000).repeat().batch(batch_size)

### Feature Columns

In [11]:
# Every column left in the training frame becomes a numeric feature column
# keyed by the column name.
my_feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in train.keys()
]
print(my_feature_columns)

[NumericColumn(key='Jenis_Kelamin', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Umur', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='SIM', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Kode_Daerah', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Sudah_Asuransi', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Umur_Kendaraan', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Kendaraan_Rusak', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Premi', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Kanal_Penjualan', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Lama_Berlangganan', shape=(1,), default_value=Non

# Building the Model
And now we are ready to choose a model. For classification tasks there is a variety of estimators/models that we can pick from. Some options are listed below.
- ```DNNClassifier``` (Deep Neural Network)
- ```LinearClassifier```

We can choose either model, but the DNN seems to be the better choice because we may not be able to find a linear correspondence in our data.

So let's build a model!

## DNN

In [12]:
# Build a DNN classifier with 2 hidden layers of 30 and 10 nodes.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 30 and 10 nodes respectively.
    hidden_units=[30, 10],
    # Binary target: the model must choose between 2 classes.
    n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpi_dwd2q1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Training
Now it's time to train the model!

In [13]:
# The lambda defers the input_fn call and binds its arguments, so no separate
# wrapper function has to be defined. With training=True the dataset repeats
# indefinitely, so `steps` is what bounds the length of training.
classifier.train(
    input_fn=lambda: input_fn(train, train_y, training=True),
    steps=5000)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpi_dwd2q1/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 3617.1353, step = 0
INFO:tensorflow:global_step/sec: 217.824
INFO:tensorflow:loss = 5.2252655, step = 100 (0.462 sec)
INFO:tensorflow:global_step/sec: 235.274
INFO:tensorflow:loss = 6.9221797, step = 200 (0.425 sec)
INFO:tensorflow:global_step/sec

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7ff639c53b10>

In [14]:
# Evaluate on the test frame. training=False makes input_fn skip shuffling and
# repeating, so the test examples are read once, in order.
eval_result = classifier.evaluate(
    input_fn=lambda: input_fn(test, test_y, training=False))

# Print the headline accuracy, then show the full metrics dict as cell output.
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
eval_result

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-12-10T09:26:38
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpi_dwd2q1/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.80066s
INFO:tensorflow:Finished evaluation at 2021-12-10-09:26:40
INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.84011, accuracy_baseline = 0.8769705, auc = 0.5640794, auc_precision_recall = 0.15830769, average_loss = 2.9093788, global_step = 5000, label/mean = 0.12302945, loss = 2.9203153, precision = 0.1997264, prediction/mean = 0.06554677, recall = 0.0996417
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /tmp/tmpi_dwd2q1/model.ckpt-5000

Test set accuracy: 0.840



{'accuracy': 0.84011,
 'accuracy_baseline': 0.8769705,
 'auc': 0.5640794,
 'auc_precision_recall': 0.15830769,
 'average_loss': 2.9093788,
 'global_step': 5000,
 'label/mean': 0.12302945,
 'loss': 2.9203153,
 'precision': 0.1997264,
 'prediction/mean': 0.06554677,
 'recall': 0.0996417}

### Prediction

In [15]:
# Display labels indexed by predicted class id: 0 -> 'Tidak Tertarik', 1 -> 'Tertarik'.
Tertarik = ['Tidak Tertarik', 'Tertarik']

In [16]:
def input_fn(features, batch_size=256):
    """Build a batched, label-free ``tf.data.Dataset`` for prediction.

    NOTE: this reuses the name of the training/evaluation ``input_fn`` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        features: Mapping-like object that ``dict()`` can turn into a
            ``{feature name: values}`` dictionary.
        batch_size: Number of examples per batch.

    Returns:
        The batched dataset of feature dictionaries.
    """
    unlabeled = tf.data.Dataset.from_tensor_slices(dict(features))
    return unlabeled.batch(batch_size)

features = ['Jenis_Kelamin', 'Umur', 'SIM', 'Kode_Daerah', 'Sudah_Asuransi',
            'Umur_Kendaraan', 'Kendaraan_Rusak', 'Premi', 'Kanal_Penjualan',
            'Lama_Berlangganan']
predict = {}


print("Please type numeric values as prompted.")
for feature in features:
    # Keep prompting until the entry parses as a number. The previous loop was
    # inverted: it re-prompted for as long as the input *was* numeric and only
    # stopped on a non-numeric entry, which then failed in float().
    while True:
        val = input(feature + ": ")
        try:
            # Each value is wrapped in a one-element list so that the dict
            # describes a single example.
            predict[feature] = [float(val)]
        except ValueError:
            print("'{}' is not a number, please try again.".format(val))
        else:
            break

predictions = classifier.predict(input_fn=lambda: input_fn(predict))
for pred_dict in predictions:
    # class_ids holds the predicted class; use it to pick the matching
    # probability and the human-readable label.
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]

    print('Prediction is "{}" ({:.1f}%)'.format(
        Tertarik[class_id], 100 * probability))


KeyboardInterrupt: ignored

## Logistic Regression from scratch

In [76]:
# Logistic Regression
class LogitRegression() :
    """Binary logistic regression fitted with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    iterations : int
        Number of gradient-descent updates performed by ``fit``.

    After ``fit`` the learned parameters are available as ``W`` (one weight
    per feature) and ``b`` (bias).
    """

    def __init__( self, learning_rate, iterations ) :
        self.learning_rate = learning_rate
        self.iterations = iterations

    @staticmethod
    def _sigmoid( z ) :
        """Return 1 / (1 + exp(-z)) element-wise without overflowing exp()."""
        z = np.asarray( z, dtype = float )
        # exp() only ever receives a non-positive argument, so its result lies
        # in (0, 1] and cannot overflow, however large |z| is.
        e = np.exp( - np.abs( z ) )
        # z >= 0 uses the textbook form; z < 0 uses the algebraically equal
        # exp(z) / (1 + exp(z)).
        return np.where( z >= 0, 1.0 / ( 1.0 + e ), e / ( 1.0 + e ) )

    # Function for model training
    def fit( self, X, Y ) :
        """Fit weights and bias to the training data.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Feature matrix (NumPy array or pandas DataFrame).
        Y : array-like with m elements
            Binary labels (0/1).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``Y`` does not contain exactly one label per row of ``X``.
        """
        # Convert once to plain arrays: rows and labels are then matched by
        # position (never by pandas index labels) and the per-iteration
        # DataFrame -> array conversions are avoided.
        self.X = np.asarray( X, dtype = float )
        self.Y = np.asarray( Y, dtype = float ).reshape( -1 )
        # no_of_training_examples, no_of_features
        self.m, self.n = self.X.shape
        if self.Y.shape[0] != self.m :
            raise ValueError( "X has {} rows but Y has {} labels".format(
                self.m, self.Y.shape[0] ) )
        # weight initialization
        self.W = np.zeros( self.n )
        self.b = 0

        # gradient descent learning
        for _ in range( self.iterations ) :
            self.update_weights()
        return self

    # Helper function to update weights in gradient descent
    def update_weights( self ) :
        """Perform one gradient-descent step on ``W`` and ``b``; return self."""
        A = self._sigmoid( self.X.dot( self.W ) + self.b )

        # calculate gradients of the mean log-loss
        tmp = np.reshape( A - self.Y, self.m )
        dW = np.dot( self.X.T, tmp ) / self.m
        db = np.sum( tmp ) / self.m

        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

        return self

    # Hypothetical function  h( x )
    def predict( self, X ) :
        """Return an array of 0/1 predictions, 1 where the probability exceeds 0.5."""
        Z = self._sigmoid( np.asarray( X, dtype = float ).dot( self.W ) + self.b )
        Y = np.where( Z > 0.5, 1, 0 )
        return Y
  
  
# Driver code
  
def main() :
    """Train the from-scratch model and print its accuracy on the test set (in percent)."""

    # Model training
    model = LogitRegression( learning_rate = 0.01, iterations = 1000 )
    model.fit( train, train_y )

    # Prediction on test set
    pred_y = model.predict( test )

    # measure performance
    # Labels and predictions are compared by position, so the result does not
    # depend on test_y carrying a contiguous 0..n-1 index (a label-based
    # test_y[i] lookup raises KeyError for any missing index value).
    correctly_classified = int( np.sum( np.asarray( test_y ) == pred_y ) )
    count = np.size( pred_y )

    print( "Accuracy on test set by our model       :  ", (
      correctly_classified / count ) * 100 )


if __name__ == "__main__" :
    main()

Accuracy on test set by our model       :   87.69705493398267


## Logistic Regression with Sklearn

In [79]:
import warnings
warnings.filterwarnings( "ignore" )
  
from sklearn.linear_model import LogisticRegression

In [81]:
def main() :
    """Train sklearn's LogisticRegression and print its accuracy on the test set (in percent)."""

    model1 = LogisticRegression()
    model1.fit( train, train_y )

    pred1_y = model1.predict( test )

    # Labels and predictions are compared by position, so the result does not
    # depend on test_y carrying a contiguous 0..n-1 index (a label-based
    # test_y[i] lookup raises KeyError for any missing index value).
    correctly_classified1 = int( np.sum( np.asarray( test_y ) == pred1_y ) )
    count = np.size( pred1_y )

    print( "Accuracy on test set by sklearn model   :  ", (
      correctly_classified1 / count ) * 100 )

if __name__ == "__main__" :
    main()

Accuracy on test set by sklearn model   :   87.30451940636873


## Logistic Regression with Keras

In [67]:
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [68]:
# One-hot encode the integer labels; the Keras model below is trained with a
# categorical cross-entropy loss on these matrices.
y_train = to_categorical(train_y)
y_test = to_categorical(test_y)

# The number of columns in the one-hot matrix is the number of classes.
count_classes = y_test.shape[1]
print(count_classes)

2


In [71]:
# NOTE: despite the section heading, this network is a multi-layer perceptron
# (three hidden ReLU layers followed by a 2-unit softmax), not a plain
# logistic regression.
model = Sequential()
# The input width is taken from the training frame instead of a hard-coded 10,
# so the model keeps matching the data passed to fit() if columns change.
model.add(Dense(500, activation='relu', input_dim=train.shape[1]))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
# One output unit per class; softmax turns the outputs into class probabilities.
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [72]:
# Train for 20 epochs on the feature frame against the one-hot encoded labels.
model.fit(train, y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff6379692d0>

In [83]:
# Model outputs for every test row; not used by the rest of this cell.
pred_test= model.predict(test)
# The model was compiled with metrics=['accuracy'], so scores2 is read as
# [loss, accuracy] and index 1 is the accuracy; the error is its complement.
scores2 = model.evaluate(test, y_test, verbose=0)
print('Accuracy on test data: {}% \n Error on test data: {}'.format(scores2[1] * 100, 1 - scores2[1]))    

Accuracy on test data: 87.69705295562744% 
 Error on test data: 0.12302947044372559


## Linear Regression Model

##  Linear Regression with Keras