In [42]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from itertools import chain

def load_dataset(csv_file):
    #import the file and convert it into a dataframe
    data = pd.read_csv(csv_file)
    return data

def unionise_train_test(train,test):
    train['Categorise'] = 'Training'
    test['Categorise'] = 'Test'
    unionised_data = pd.concat([train,test],ignore_index=True)
    display(unionised_data)
    return unionised_data
    

def split_words(data):
    res = data['article_words'].str.split(',').apply(pd.value_counts)
    res = res.fillna(0)
    #cols = [col for col in res.columns]
    #display(cols)
    return res

def get_word_list(data):
    #We will need to consider the most popular words. However this might mean it might not be applicable for some articles
    counter = pd.Series(np.concatenate([x.split(',') for x in data['article_words']])).value_counts()
    word_list = counter.index.tolist()
    return word_list

def get_popular_words(data):
    #We will need to consider the most popular words. However this might mean it might not be applicable for some articles
    counter = pd.Series(np.concatenate([x.split(',') for x in data['article_words']])).value_counts()[:10000]
    word_list = counter.index.tolist()
    return counter,word_list

def combine_with_target(data_left, data_right):

    merge_data = pd.merge(data_left, data_right, left_index=True, right_index=True)
    del merge_data['article_words']
    del merge_data['article_number']
    merge_data.rename(columns={"topic_x": "topic", "topic_y": "topic_target"},inplace = True)
    
    return merge_data

def split_training_set(unionised_set):
    train = unionised_set[unionised_set['Categorise'] == 'Training']
    test = unionised_set[unionised_set['Categorise'] == 'Test']
    del train['Categorise']
    del test['Categorise']
    return train, test
    
def accuracy(confusion_matrix):
    diagonal_sum = confusion_matrix.trace()
    sum_of_all_elements = confusion_matrix.sum()
    return diagonal_sum / sum_of_all_elements

#def implement_pca(x_set):
#    pca = PCA(.95)
#    pca.fit(x_set)
#    train_pca = pca.transform(train_img)
#    display(train_pca)

def min_max_normalisation(data,word_list):
    for i in range(len(word_list)):
        data[word_list[i]] = (data[word_list[i]] - data[word_list[i]].min())/(data[word_list[i]].max() - data[word_list[i]].min())
        
    return data
    



def build_model(x_train,y_train,x_test,y_test):
    clf = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=100, alpha=0.05,
                     solver='sgd', verbose=10,  random_state=21,activation = 'tanh')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    display(cm)
    print("Accuracy of MLPClassifier : " + str(accuracy(cm)))
    
    pred_labels = list(set(y_test))
    print(classification_report(y_test,y_pred, target_names=pred_labels))

if __name__ == "__main__":
    ##Load up the training set and test set
    #training = load_dataset('training.csv')
    #test = load_dataset('test.csv')
    #Unionise the dataset
    #unionised_dataset = unionise_train_test(training,test)
    ###get all words from the unionised dataset
    #all_words = get_word_list(unionised_dataset)
    #word_count_data = split_words(unionised_dataset)
    #word_count_norm = min_max_normalisation(word_count_data,all_words)
    ##Prepare the training set##
    word_count, word_list = get_popular_words(training)
    merged_data = combine_with_target(word_count_data,unionised_dataset) 
    training_set,test_set = split_training_set(merged_data)


    y = training_set['topic_target']
    x = training_set[word_list]
    display(training_set)
    x_test = test_set[word_list]
    y_test = test_set['topic_target'] 
    
 
    build_model(x,y,x_test,y_test)
    

Unnamed: 0,dollar,won,bank,deal,clos,wednesday,cent,foreign,suppl,move,...,morandin,mandy,mondes,hundley,alfonz,bankoa,caja,cooperativ,matthey,topic_target
0,8.0,5.0,5.0,4.0,3.0,3.0,3.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FOREX MARKETS
1,2.0,0.0,1.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MONEY MARKETS
2,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SPORTS
3,4.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FOREX MARKETS
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IRRELEVANT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9495,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DEFENCE
9496,2.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IRRELEVANT
9497,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FOREX MARKETS
9498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,IRRELEVANT


Iteration 1, loss = 1.95390266
Iteration 2, loss = 1.29809898
Iteration 3, loss = 1.11016456
Iteration 4, loss = 0.99049937
Iteration 5, loss = 0.90420571
Iteration 6, loss = 0.84010865
Iteration 7, loss = 0.79031684
Iteration 8, loss = 0.75127514
Iteration 9, loss = 0.71961362
Iteration 10, loss = 0.69332726
Iteration 11, loss = 0.67023850
Iteration 12, loss = 0.65049385
Iteration 13, loss = 0.63221546
Iteration 14, loss = 0.61594373
Iteration 15, loss = 0.60105817
Iteration 16, loss = 0.58699779
Iteration 17, loss = 0.57363634
Iteration 18, loss = 0.56170467
Iteration 19, loss = 0.54987216
Iteration 20, loss = 0.53922810
Iteration 21, loss = 0.52889175
Iteration 22, loss = 0.51895346
Iteration 23, loss = 0.50941697
Iteration 24, loss = 0.49989967
Iteration 25, loss = 0.49162475
Iteration 26, loss = 0.48306071
Iteration 27, loss = 0.47504293
Iteration 28, loss = 0.46684866
Iteration 29, loss = 0.45934225
Iteration 30, loss = 0.45220950
Iteration 31, loss = 0.44485427
Iteration 32, los



array([[  2,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0],
       [  4,   2,   1,   0,   0,   0,   8,   0,   0,   0,   0],
       [  0,   0,   8,   0,   0,   0,   4,   0,   1,   0,   0],
       [  0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,  20,   0,   4,  24,   0,   0,   0],
       [  0,   0,   0,   0,   0,  10,   2,   0,   2,   0,   0],
       [  0,   1,   3,   1,   3,   4, 228,  20,   0,   2,   4],
       [  0,   0,   0,   0,  18,   0,   9,  42,   0,   0,   0],
       [  1,   0,   0,   0,   0,   1,   1,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   4,   0,   0,   3,   0],
       [  0,   0,   0,   0,   0,   0,   5,   0,   0,   0,  55]],
      dtype=int64)

Accuracy of MLPClassifier : 0.744
                                  precision    recall  f1-score   support

                      IRRELEVANT       0.29      0.67      0.40         3
                   MONEY MARKETS       0.67      0.13      0.22        15
BIOGRAPHIES PERSONALITIES PEOPLE       0.67      0.62      0.64        13
      ARTS CULTURE ENTERTAINMENT       0.67      1.00      0.80         2
          SCIENCE AND TECHNOLOGY       0.49      0.42      0.45        48
                  SHARE LISTINGS       0.67      0.71      0.69        14
                          SPORTS       0.86      0.86      0.86       266
                          HEALTH       0.49      0.61      0.54        69
                   FOREX MARKETS       0.00      0.00      0.00         3
                DOMESTIC MARKETS       0.60      0.43      0.50         7
                         DEFENCE       0.93      0.92      0.92        60

                        accuracy                           0.74       500
  