 # Fitting the neural network 

In [12]:
#Imports
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
def prepare_data(df):
    """Reformat a feature table so it is suitable for the TensorFlow DNNClassifier (DNNC).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'lang'`` column holding the language code; every other
        column is treated as a feature. ``df`` itself is not modified.

    Returns
    -------
    (x, y) : tuple
        ``x`` holds the feature columns, renamed ``trigram_0``, ``trigram_1``,
        ... in their original order. ``y`` is the ``'lang'`` column mapped to
        integer class ids (eng=0, deu=1, spa=2, fra=3, por=4, ita=5).

    Raises
    ------
    ValueError
        If ``'lang'`` contains a value with no class id. ``Series.map`` would
        otherwise turn such rows into NaN labels without any warning.
    """
    lang_to_id = {"eng": 0, "deu": 1, "spa": 2, "fra": 3, "por": 4, "ita": 5}

    # drop() returns a new frame, so renaming its columns leaves df untouched.
    x = df.drop(['lang'], axis=1)
    x.columns = ['trigram_' + str(i) for i in range(len(x.columns))]

    lang = df['lang']
    y = lang.map(lang_to_id)

    # Fail loudly on labels that the mapping does not cover.
    unmapped = y.isna()
    if unmapped.any():
        unknown = sorted(set(map(str, lang[unmapped])))
        raise ValueError(
            "Unknown language label(s) in 'lang' column: {}".format(unknown)
        )
    return (x, y)

In [3]:
def get_data(feat_type, n_train=50000, n_valid=5000):
    """Load the training and validation feature sets for one feature type.

    Reads ``ANN_features/train_<feat_type>.csv`` and
    ``ANN_features/valid_<feat_type>.csv`` (first CSV column used as the
    index), keeps the leading rows of each, and passes them through
    ``prepare_data``.

    Parameters
    ----------
    feat_type : str
        Suffix that selects the pair of CSV files, e.g. ``'50'``.
    n_train, n_valid : int or None
        Number of leading rows to keep from the training / validation file.
        The defaults are the reduced sizes used for quick test runs; pass
        ``None`` to keep every row.

    Returns
    -------
    ((train_x, train_y), (valid_x, valid_y))
        Features and labels of each split, as returned by ``prepare_data``.
    """
    train = pd.read_csv("ANN_features/train_{}.csv".format(feat_type), index_col=0)
    valid = pd.read_csv("ANN_features/valid_{}.csv".format(feat_type), index_col=0)

    # .iloc makes the row cut explicitly positional; a stop of None keeps all rows.
    train_red = train.iloc[:n_train]
    valid_red = valid.iloc[:n_valid]

    (train_x, train_y) = prepare_data(train_red)
    (valid_x, valid_y) = prepare_data(valid_red)
    return (train_x, train_y), (valid_x, valid_y)

In [4]:
# Load the '50' feature set as training and validation features/labels.
(train_x, train_y), (valid_x, valid_y) = get_data(feat_type='50')
# Show the row count of each split, then preview the training features.
print(train_x.shape[0], valid_x.shape[0])
train_x.head()

50000 5000


Unnamed: 0,trigram_0,trigram_1,trigram_2,trigram_3,trigram_4,trigram_5,trigram_6,trigram_7,trigram_8,trigram_9,...,trigram_187,trigram_188,trigram_189,trigram_190,trigram_191,trigram_192,trigram_193,trigram_194,trigram_195,trigram_196
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,1,0,0,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#Input functions 
def train_input_fn(features, labels, batch_size=100, shuffle_buffer=1000):
    """Build the shuffled, repeating, batched dataset used for training.

    Parameters
    ----------
    features : mapping or DataFrame-like
        Converted with ``dict(features)`` before being sliced into examples.
    labels : array-like
        Labels, sliced in step with the features.
    batch_size : int
        Number of examples per batch.
    shuffle_buffer : int
        Buffer size handed to ``Dataset.shuffle``. Examples are only mixed
        within this window, so if the input rows are ordered (for instance
        grouped by class) raise it towards the number of examples.

    Returns
    -------
    tf.data.Dataset
        Dataset of ``(features, labels)`` batches.
    """
    # Convert the inputs to a Dataset of (feature dict, label) examples.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle, repeat (no count given), and batch the examples.
    dataset = dataset.shuffle(shuffle_buffer).repeat().batch(batch_size)
    return dataset

def eval_input_fn(features, labels, batch_size=100):
    """Build the batched dataset used for evaluation or prediction.

    ``features`` is converted with ``dict(...)``. When ``labels`` is ``None``
    the dataset carries the features only; otherwise it carries
    ``(features, labels)`` pairs. The examples are batched as they come,
    without shuffling or repeating.
    """
    feature_map = dict(features)

    # Prediction passes no labels, so only the features are sliced then.
    tensors = feature_map if labels is None else (feature_map, labels)
    sliced = tf.data.Dataset.from_tensor_slices(tensors)

    assert batch_size is not None, "batch_size must not be None"
    return sliced.batch(batch_size)


# Source: TensorFlow (2016). An Example of a DNNClassifier for the Iris dataset [source code]. www.tensorflow.org
    

In [10]:
# Keep the first entry of 'class_ids' from every prediction as the predicted class.
pred_y = [p['class_ids'][0] for p in predictions]

In [15]:
# Per-class precision, recall and F1 for the validation labels, to four decimals.
print(classification_report(y_true=valid_y, y_pred=pred_y, digits=4))

              precision    recall  f1-score   support

           0     0.9783    0.9539    0.9659       802
           1     0.9883    0.9747    0.9814       868
           2     0.8477    0.8550    0.8514       814
           3     0.9285    0.9439    0.9361       784
           4     0.9079    0.9079    0.9079       858
           5     0.9244    0.9371    0.9307       874

   micro avg     0.9290    0.9290    0.9290      5000
   macro avg     0.9292    0.9287    0.9289      5000
weighted avg     0.9295    0.9290    0.9292      5000



In [17]:
# Confusion matrix of the true versus predicted class ids for the validation labels.
print(confusion_matrix(y_true=valid_y, y_pred=pred_y))

[[765   4  10   9   7   7]
 [  4 846   5  11   1   1]
 [  5   1 696  19  55  38]
 [  4   3  28 740   2   7]
 [  2   0  57   6 779  14]
 [  2   2  25  12  14 819]]
