# Fitting the final neural network 


In [2]:
#imports
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import preprocessing

In [5]:
# Load the train_50 feature file (first CSV column becomes the index)
# and list its column labels.
train = pd.read_csv(
    "ANN_features/train_50.csv",
    index_col=0,
)
train.columns

Index([' Ma', ' To', ' a ', ' an', ' au', ' be', ' ca', ' ch', ' co', ' da',
       ...
       'to ', 'ue ', 'un ', 'und', 'ur ', 'us ', 've ', 'you', 'ão ', 'lang'],
      dtype='object', length=198)

### Preparing data

In [3]:
def prepare_data(df):
    "Splits a feature frame into renamed feature columns and integer language labels"
    # Integer class id assigned to each language code found in the 'lang' column.
    language_ids = {"eng": 0, "deu": 1, "spa": 2, "fra": 3, "por": 4, "ita": 5}

    # Everything except the label column is a feature; the columns are renamed
    # positionally to trigram_0, trigram_1, ...
    features = df.drop(columns=['lang'])
    features.columns = ['trigram_{}'.format(position)
                        for position in range(features.shape[1])]

    # Languages missing from the mapping come out as NaN.
    labels = df['lang'].map(language_ids)
    return (features, labels)

def _load_split(split, feat_type):
    "Reads ANN_features/<split>_<feat_type>.csv using its first column as the index"
    return pd.read_csv("ANN_features/{}_{}.csv".format(split, feat_type), index_col=0)


def _scale_split(df, scaler):
    "Applies an already fitted scaler to every column except 'lang', keeping index and labels"
    features = df.drop('lang', axis=1)
    # Re-use the original index and column labels so the 'lang' column
    # assigned below lines up row by row with the scaled features.
    scaled = pd.DataFrame(scaler.transform(features),
                          index=features.index, columns=features.columns)
    scaled['lang'] = df['lang']
    return scaled


def get_data(feat_type, normalize=False):
    """Gets the training, valid and test data bases for a specific feature type

    Args:
        feat_type: suffix of the CSV files to read, i.e.
            ANN_features/train_<feat_type>.csv, valid_<feat_type>.csv and
            test_<feat_type>.csv. Each file needs a 'lang' column.
        normalize: when True, the features are min-max scaled with a
            MinMaxScaler fitted on the training split only and then applied
            to all three splits. The default (False) returns the raw feature
            values. That is what this function always returned: earlier
            versions computed scaled copies but never used them.

    Returns:
        (train_x, train_y), (valid_x, valid_y), (test_x, test_y) as produced
        by prepare_data for each split.
    """
    splits = [_load_split(name, feat_type) for name in ("train", "valid", "test")]

    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        # Fit on the training features only so that no information from the
        # validation/test splits leaks into the scaling.
        min_max_scaler.fit(splits[0].drop('lang', axis=1))
        splits = [_scale_split(split, min_max_scaler) for split in splits]

    train_xy, valid_xy, test_xy = [prepare_data(split) for split in splits]
    return train_xy, valid_xy, test_xy

In [4]:
#Input functions 
def train_input_fn(features, labels, batch_size =100, shuffle_buffer_size=1000):
    """An input function for training

    Args:
        features: mapping-like object (e.g. a DataFrame) of feature name ->
            values; it is converted with dict() before being sliced.
        labels: labels aligned with the rows of features.
        batch_size: number of examples per batch.
        shuffle_buffer_size: size of the shuffle buffer. The default of 1000
            keeps the previous behaviour. According to the tf.data
            documentation a buffer at least as large as the data set is
            needed for a uniform shuffle, so pass len(features) when the
            input rows are ordered.

    Returns:
        A shuffled, endlessly repeating, batched tf.data Dataset of
        (features, labels) slices.
    """
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(shuffle_buffer_size).repeat().batch(batch_size)
    # Return the dataset.
    return dataset

def eval_input_fn(features, labels, batch_size=100):
    """Builds the batched, unshuffled Dataset used for evaluation or prediction

    With labels=None only the features are sliced (prediction); otherwise
    the Dataset yields (features, labels) pairs (evaluation).
    """
    feature_dict = dict(features)
    inputs = feature_dict if labels is None else (feature_dict, labels)

    # Slice the inputs into a Dataset first, then validate and apply batching.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    assert batch_size is not None, "batch_size must not be None"
    return dataset.batch(batch_size)

# The input functions above are adapted from: TensorFlow (2016) An Example of a DNNClassifier for the Iris dataset. [Source code]. www.tensorflow.org

In [5]:
def fit_model(hidden, steps, seed=None):
    """Fits a DNNC with the desired hidden layers and returns its test predictions

    The data is not passed in: the function reads the notebook-level
    variables train_x, train_y and test_x, so get_data must have been run
    and unpacked into those names first.

    Args:
        hidden: list with the number of units of each hidden layer,
            e.g. [509] for a single hidden layer of 509 units.
        steps: number of training steps.
        seed: optional value forwarded to RunConfig as tf_random_seed to make
            runs repeatable. The default None leaves the run unseeded, as
            before.

    Returns:
        A list with the predicted class id for every row of test_x.
    """
    # Feature columns describe how to use the input: one numeric column per
    # feature present in the training data.
    my_feature_columns = [
        tf.feature_column.numeric_column(key=key) for key in train_x.keys()
    ]

    my_checkpointing_config = tf.estimator.RunConfig(
        tf_random_seed=seed,
        save_checkpoints_secs=30*60,  # Save checkpoints every 30 minutes.
        keep_checkpoint_max=2,        # Retain the 2 most recent checkpoints.
    )

    # Build a DNN.
    classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        # One hidden layer per entry of `hidden`.
        hidden_units=hidden,
        # 6 languages.
        n_classes=6,
        config=my_checkpointing_config,
        optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate=0.1,
            l1_regularization_strength=0.001
        ))

    # Train the Model.
    classifier.train(
        input_fn=lambda: train_input_fn(train_x, train_y),
        steps=steps)

    # Get predictions of test values; every prediction stores its class id
    # as the first element of 'class_ids'.
    predictions = classifier.predict(
        input_fn=lambda: eval_input_fn(test_x, labels=None))
    return [prediction['class_ids'][0] for prediction in predictions]

# Fitting model


In [7]:
# Load the '200' feature files into the notebook-level names that
# fit_model reads (train_x, train_y, test_x) and the later cells use.
(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = get_data('200')
# Number of rows in the train / valid / test splits.
print(*(len(split) for split in (train_x, valid_x, test_x)))
train_x.head()

  return self.partial_fit(X, y)


210000 60000 30000


Unnamed: 0,trigram_0,trigram_1,trigram_2,trigram_3,trigram_4,trigram_5,trigram_6,trigram_7,trigram_8,trigram_9,...,trigram_667,trigram_668,trigram_669,trigram_670,trigram_671,trigram_672,trigram_673,trigram_674,trigram_675,trigram_676
0,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Feature: 200, hidden layer: [509], steps: 1000
pred_y = fit_model(hidden=[509], steps=1000)
print(classification_report(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))

INFO:tensorflow:Using config: {'_model_dir': '/var/folders/s2/82vv6ll16mn27w7gcdbccp3m0000gn/T/tmp5itwjqn8', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 1800, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x125f77f28>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Grap

In [9]:
# Print the test-set report and confusion matrix again in their own cell.
print(classification_report(test_y, pred_y),
      confusion_matrix(test_y, pred_y),
      sep='\n')

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5094
           1       1.00      1.00      1.00      5071
           2       0.96      0.97      0.96      5004
           3       0.99      0.98      0.99      4944
           4       0.98      0.96      0.97      4960
           5       0.98      0.98      0.98      4927

   micro avg       0.98      0.98      0.98     30000
   macro avg       0.98      0.98      0.98     30000
weighted avg       0.98      0.98      0.98     30000

[[5058    4    9    7    6   10]
 [  10 5053    1    5    0    2]
 [  11    3 4867   14   72   37]
 [  22    5   17 4861   13   26]
 [   4    1  150    7 4772   26]
 [   6    3   42    9   15 4852]]


In [10]:
# Collect the predicted class ids in a one-column frame so they can be saved.
test_results = pd.DataFrame({'ann_test_result': pred_y})
print(len(pred_y))
test_results.head()

30000


Unnamed: 0,ann_test_result
0,0
1,2
2,0
3,0
4,5


In [11]:
# Store the test-set predictions next to the feature files.
test_results.to_csv(path_or_buf='ANN_features/ann_test_results_final.csv')

# References
TensorFlow (2016) An Example of a DNNClassifier for the Iris dataset. [Source code]. www.tensorflow.org