# Importing Libraries 

In [1]:
import pandas
import glob # used for finding dir with corresponding pattern
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
import pickle
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Extracting Data and Cleaning Data

In [2]:
with open('news', 'r') as f:
    text = f.read() # read the file
    news = text.split("\n\n") # split news item
    
    # dict for counting each category and naming the files
    count = {'sport': 0, 'world': 0, "us": 0, "business": 0, "health": 0, "entertainment": 0, "sci_tech": 0}
    
    # create file of each news item and put it in data/'category' folder
    for news_item in news:
        lines = news_item.split("\n")
        file_to_write = open('data/' + lines[6] + '/' + str(count[lines[6]]) + '.txt', 'w+')
        count[lines[6]] = count[lines[6]] + 1
        file_to_write.write(news_item)
        file_to_write.close()

category_list = ["sport", "world", "us", "business", "health", "entertainment", "sci_tech"]
directory_list = ["data/sport/*.txt", "data/world/*.txt","data/us/*.txt","data/business/*.txt","data/health/*.txt","data/entertainment/*.txt","data/sci_tech/*.txt"]

text_files = list(map(lambda x: glob.glob(x), directory_list))
text_files = [item for sublist in text_files for item in sublist]

# create a list of news item (dict) with it's headline and description as data and category as flag
training_data = []

for t in text_files:
    f = open(t, 'r')
    f = f.read()
    t = f.split('\n')
    training_data.append({'data' : t[0] + ' ' + t[1], 'flag' : category_list.index(t[6])})

In [3]:
training_data = pandas.DataFrame(training_data, columns=['data', 'flag'])
training_data.to_csv("train_data.csv", sep=',', encoding='utf-8')
print(training_data.data.shape)

(32604,)


In [4]:
print(training_data)

                                                    data  flag
0      skinner, couture, grabner named rookie of year...     0
1      low-profile match provides tournament's bigges...     0
2      supporting cast could be difference for top-se...     0
3      lightning strike down caps; sharks win opener ...     0
4      power hitter alvaro quiros saves best for last...     0
...                                                  ...   ...
32599  charges dismissed against 2 broadcom executive...     6
32600  groupon president solomon leaving company: rep...     6
32601  the nook enters tablet territory the nook colo...     6
32602  bp oil spill offers clues on air pollution: st...     6
32603  acer enters the tablet market acer is selling ...     6

[32604 rows x 2 columns]


# Retrieving useful information from data 

In [5]:
# GET VECTOR COUNT
count_vect = CountVectorizer() # init CountVectorizer

X_train_counts = count_vect.fit_transform(training_data.data) # returns a document-term matrix

# SAVE WORD VECTOR
pickle.dump(count_vect.vocabulary_, open("count_vector.pkl","wb"))

In [6]:
# TRANSFORM WORD VECTOR TO TF IDF
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


# SAVE TF-IDF
pickle.dump(tfidf_transformer, open("tfidf.pkl","wb"))

# Applying SVM to the tf - idf of the corpus 

SVM (Support vector machines) are set of supervised learning methods for classification , regression and finding outliers  
In this case, we are using Linear Support vector machine.  

In [7]:
# SVM
clf_svm = svm.LinearSVC()

X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, training_data.flag, test_size=0.25, random_state=42)

clf_svm.fit(X_train_tfidf, training_data.flag)

pickle.dump(clf_svm, open("svm.pkl", "wb"))

In [8]:
predicted = clf_svm.predict(X_test)

result_svm = pandas.DataFrame( {'true_labels': y_test,'predicted_labels': predicted})

result_svm.to_csv('res_svm.csv', sep = ',')

print(result_svm.head())

       true_labels  predicted_labels
29157            5                 5
12299            1                 1
28306            5                 5
22716            3                 3
1398             0                 0


In [9]:
confusion_matrix(y_test,predicted)

array([[2032,    0,    2,    0,    0,    0,    0],
       [   0, 1556,    1,   15,    1,    2,    1],
       [   2,    5, 1162,   27,    3,    2,    6],
       [   2,    2,   14, 1227,    6,    5,   41],
       [   2,    4,   10,    3,  442,    5,    0],
       [   1,    0,    0,    0,    1,  842,    3],
       [   1,    3,    3,   18,    1,    2,  696]])

In [10]:
print(clf_svm.score(X_test,y_test))

0.976199239357134


# Multinomial Naive Bayes 

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [11]:
# Multinomial Naive Bayes

# divide the data set into test data and train data.
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, training_data.flag, test_size=0.25, random_state=42)

# Classifier is used to fit the data according to MNB algorithm and then we save the model
clf = MultinomialNB().fit(X_train, y_train)

# SAVE MODEL
pickle.dump(clf, open("nb_model.pkl", "wb"))

In [12]:
# To show model is working


category_list = ["sport", "world", "us", "business", "health", "entertainment", "sci_tech"]

# Write a normal news title
docs_new = "Messi joins other football team"
docs_new = [docs_new]

# Load the count_vector,tfidf and load the nb_model.pk 
loaded_vec = CountVectorizer(vocabulary=pickle.load(open("count_vector.pkl", "rb")))
loaded_tfidf = pickle.load(open("tfidf.pkl","rb"))
loaded_model = pickle.load(open("nb_model.pkl","rb"))

# Apply count vectorizer on news item then apply tfidf and then predict category using Multinomial NM
X_new_counts = loaded_vec.transform(docs_new)
X_new_tfidf = loaded_tfidf.transform(X_new_counts)
predicted = loaded_model.predict(X_new_tfidf)

print(category_list[predicted[0]])

sport


In [21]:
predicted = loaded_model.predict(X_test)
result_bayes = pandas.DataFrame( {'true_labels': y_test,'predicted_labels': predicted})
result_bayes.to_csv('res_bayes.csv', sep = ',')

In [22]:
confusion_matrix(y_test,predicted)

array([[2020,    6,    4,    4,    0,    0,    0],
       [  24, 1472,   20,   51,    0,    8,    1],
       [  98,  139,  831,  130,    0,    7,    2],
       [  62,   32,   51, 1110,    2,   11,   29],
       [  93,   52,   63,  106,  145,    5,    2],
       [ 246,   39,   28,   38,    0,  493,    3],
       [  96,   60,   57,  281,    2,    6,  222]])

In [23]:
print(clf.score(X_test,y_test))

0.7720525088946142


# Multilayer Perceptron 

Multilayer Perceptron belongs to a class of artificial neural network with feedforward.  
The MLPClassifier takes certain parameters. Here we explain some of the parameters that we have used in brief.

1) solver:- We need to provide some kind of optimization algorithm to our model. Limited-memory BFGS(lbfgs) is an advanced optimization algorithm which is one of the methods of quasi-Newton method for optimization. This method is better than gradient descent as it uses 2nd order partial derivative for optimization unlike gradient descent which uses 1st order partial derivative.  
2) hidden_layer_sizes :- Number of nueurons in each hidden layer. Ex: if hidden_layer_sizes=(1,2) , then we have two hidden layers with 1 and 2 neurons respectively.  
3) alpha :- penalizing parameter for the model.  

In [None]:
clf_neural = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)

X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, training_data.flag, test_size=0.25, random_state=42)

clf_neural.fit(X_train, y_train)

In [None]:
pickle.dump(clf_neural, open("softmax.pkl", "wb"))

In [18]:
# Predict the classes for test set.
predicted = clf_neural.predict(X_test)
result_softmax = pandas.DataFrame( {'true_labels': y_test,'predicted_labels': predicted})

# Store the dataframe into a csv file.
result_softmax.to_csv('res_softmax.csv', sep = ',')

# Print actual and predicted class for each data.
for predicted_item, result in zip(predicted, y_test):
    print(category_list[predicted_item], ' - ', category_list[result])


entertainment  -  entertainment
world  -  world
entertainment  -  entertainment
us  -  business
sport  -  sport
us  -  us
business  -  business
business  -  business
entertainment  -  entertainment
health  -  sci_tech
sport  -  sport
us  -  sport
world  -  world
world  -  world
us  -  sci_tech
health  -  health
health  -  sci_tech
sport  -  sport
us  -  us
sport  -  sport
business  -  business
sport  -  sport
sci_tech  -  sci_tech
us  -  us
business  -  us
health  -  entertainment
entertainment  -  entertainment
world  -  world
business  -  business
sport  -  sport
sport  -  sport
business  -  sci_tech
world  -  world
sport  -  us
world  -  world
entertainment  -  entertainment
sport  -  sport
world  -  world
world  -  world
world  -  world
sport  -  sport
sci_tech  -  sci_tech
sport  -  sport
us  -  world
world  -  business
entertainment  -  entertainment
sport  -  sport
world  -  world
world  -  world
us  -  us
sport  -  entertainment
sport  -  sport
health  -  health
world  -  us
he

business  -  business
us  -  us
world  -  world
world  -  world
us  -  us
us  -  us
entertainment  -  entertainment
world  -  world
world  -  world
world  -  world
world  -  world
sport  -  sport
business  -  business
health  -  health
world  -  us
health  -  health
business  -  sport
world  -  world
entertainment  -  us
business  -  business
sport  -  sport
business  -  world
world  -  sport
sci_tech  -  business
sport  -  sport
entertainment  -  entertainment
us  -  business
sci_tech  -  entertainment
sport  -  sport
us  -  business
world  -  world
health  -  health
world  -  world
entertainment  -  entertainment
sport  -  sport
sport  -  sport
world  -  world
sport  -  sport
entertainment  -  entertainment
entertainment  -  entertainment
sport  -  sport
sport  -  sport
sport  -  sport
sport  -  sport
health  -  health
business  -  business
world  -  world
sport  -  sport
entertainment  -  entertainment
sci_tech  -  sci_tech
business  -  sci_tech
business  -  entertainment
world  -  

business  -  sport
us  -  us
sport  -  sport
world  -  world
sport  -  sport
sport  -  sport
us  -  us
entertainment  -  entertainment
world  -  world
world  -  world
us  -  us
us  -  world
health  -  health
entertainment  -  entertainment
world  -  world
world  -  world
world  -  world
business  -  business
us  -  us
entertainment  -  entertainment
world  -  world
entertainment  -  entertainment
sci_tech  -  sci_tech
us  -  us
world  -  world
sport  -  sport
sport  -  sport
world  -  world
sport  -  us
business  -  business
business  -  business
business  -  business
world  -  world
world  -  world
us  -  world
health  -  health
business  -  business
business  -  business
world  -  world
sci_tech  -  sci_tech
us  -  sport
world  -  world
us  -  us
sport  -  sport
sci_tech  -  business
entertainment  -  entertainment
world  -  world
health  -  health
us  -  us
sport  -  sport
us  -  business
business  -  business
us  -  us
business  -  business
sport  -  sport
business  -  sci_tech
spo

sport  -  sport
us  -  us
sport  -  sport
sci_tech  -  business
world  -  world
world  -  entertainment
us  -  us
us  -  us
us  -  us
us  -  us
us  -  us
us  -  us
sport  -  sport
sci_tech  -  sci_tech
sport  -  sport
sport  -  sport
sport  -  sport
us  -  world
us  -  us
health  -  entertainment
world  -  world
sci_tech  -  sci_tech
us  -  us
sport  -  sport
world  -  world
sport  -  sport
business  -  business
us  -  us
world  -  world
world  -  world
business  -  business
sci_tech  -  sci_tech
sport  -  sport
us  -  us
sci_tech  -  us
sport  -  sport
world  -  world
sport  -  sport
entertainment  -  entertainment
business  -  world
health  -  health
business  -  sci_tech
sport  -  sport
business  -  world
sport  -  sport
world  -  us
sport  -  sport
sci_tech  -  sci_tech
sci_tech  -  business
sci_tech  -  business
entertainment  -  sport
sport  -  sport
world  -  world
us  -  us
sci_tech  -  sci_tech
sport  -  sport
sport  -  sport
entertainment  -  entertainment
sport  -  us
world 

business  -  business
entertainment  -  entertainment
sport  -  sport
health  -  business
sport  -  sport
world  -  world
sport  -  sport
sport  -  sport
sport  -  sport
world  -  world
business  -  us
health  -  health
sport  -  sport
sport  -  sport
us  -  us
entertainment  -  entertainment
entertainment  -  entertainment
world  -  world
sport  -  sport
sport  -  health
world  -  world
world  -  world
sport  -  sport
world  -  world
entertainment  -  world
health  -  health
world  -  world
business  -  sci_tech
sport  -  sport
sport  -  sport
world  -  world
sci_tech  -  sci_tech
world  -  world
business  -  business
business  -  business
entertainment  -  entertainment
sci_tech  -  sci_tech
entertainment  -  entertainment
business  -  business
sport  -  sport
business  -  business
sport  -  sport
world  -  world
business  -  entertainment
entertainment  -  entertainment
world  -  world
us  -  world
world  -  us
world  -  world
business  -  business
sport  -  sport
entertainment  -  

world  -  health
sport  -  sport
entertainment  -  entertainment
sport  -  sport
entertainment  -  entertainment
entertainment  -  entertainment
business  -  sci_tech
health  -  health
us  -  us
sport  -  sport
sport  -  sport
us  -  us
world  -  world
sci_tech  -  us
world  -  world
business  -  business
world  -  business
sport  -  sport
health  -  health
health  -  us
world  -  world
entertainment  -  entertainment
sport  -  us
us  -  us
health  -  health
sport  -  sport
world  -  world
sport  -  us
business  -  business
business  -  business
business  -  business
world  -  world
business  -  world
business  -  world
us  -  us
health  -  health
health  -  health
us  -  us
world  -  world
us  -  us
world  -  world
entertainment  -  entertainment
world  -  world
world  -  world
world  -  world
world  -  world
business  -  business
entertainment  -  entertainment
us  -  us
health  -  business
sci_tech  -  sci_tech
business  -  business
sport  -  sport
sport  -  sport
sport  -  sport
bu

world  -  world
sport  -  sport
entertainment  -  entertainment
sport  -  sport
business  -  us
sci_tech  -  sci_tech
world  -  world
sport  -  sport
us  -  us
sport  -  sport
world  -  health
sci_tech  -  sci_tech
business  -  sci_tech
world  -  entertainment
sport  -  sport
us  -  us
us  -  business
health  -  health
world  -  world
sport  -  sport
world  -  world
sport  -  health
us  -  us
sport  -  sport
world  -  us
sci_tech  -  business
business  -  business
world  -  health
us  -  sport
sport  -  sport
sport  -  sport
entertainment  -  entertainment
sport  -  sport
sport  -  sport
sport  -  sport
world  -  world
world  -  world
entertainment  -  entertainment
us  -  sci_tech
sport  -  sport
health  -  health
health  -  sci_tech
sport  -  sport
entertainment  -  entertainment
sci_tech  -  sci_tech
world  -  sport
sport  -  sport
world  -  world
world  -  world
sport  -  sport
sport  -  sport
business  -  entertainment
world  -  world
us  -  us
us  -  health
sport  -  sport
world 

us  -  us
business  -  business
us  -  us
world  -  world
business  -  sport
business  -  us
world  -  world
sport  -  sport
sport  -  world
business  -  entertainment
sport  -  sport
us  -  health
sport  -  sport
world  -  world
us  -  us
us  -  us
business  -  business
business  -  business
business  -  business
world  -  world
world  -  world
sport  -  sport
sci_tech  -  sci_tech
world  -  world
business  -  business
us  -  us
world  -  world
world  -  us
health  -  health
us  -  us
world  -  world
business  -  us
entertainment  -  entertainment
us  -  us
entertainment  -  entertainment
sport  -  sport
us  -  us
world  -  world
business  -  sci_tech
business  -  business
sci_tech  -  business
sci_tech  -  sci_tech
entertainment  -  entertainment
sport  -  us
world  -  world
us  -  us
sport  -  sport
sport  -  sport
entertainment  -  business
entertainment  -  entertainment
world  -  world
entertainment  -  entertainment
sci_tech  -  sport
sport  -  sport
business  -  business
busine

sci_tech  -  sci_tech
entertainment  -  entertainment
sport  -  sport
sport  -  sport
world  -  world
sci_tech  -  sci_tech
business  -  sci_tech
sport  -  sport
business  -  business
us  -  us
health  -  health
world  -  world
us  -  us
sport  -  sport
business  -  world
health  -  health
health  -  health
world  -  world
world  -  world
sport  -  sport
sport  -  sport
entertainment  -  entertainment
sport  -  sport
us  -  us
entertainment  -  entertainment
sport  -  sport
sci_tech  -  sci_tech
us  -  entertainment
sport  -  sport
sport  -  sport
us  -  business
sport  -  sport
world  -  world
sport  -  sport
entertainment  -  entertainment
sport  -  sport
world  -  world
sci_tech  -  sci_tech
sport  -  sport
us  -  sport
world  -  world
sci_tech  -  sci_tech
sport  -  sport
entertainment  -  business
sport  -  sport
sport  -  sport
business  -  business
us  -  us
business  -  business
sport  -  sport
sport  -  sport
health  -  health
entertainment  -  entertainment
business  -  busin

In [19]:
confusion_matrix(y_test,predicted)

array([[1936,   16,   28,   19,    3,   26,    6],
       [  14, 1389,   75,   62,   11,   18,    7],
       [  29,   60,  915,   93,   46,   33,   31],
       [  14,   49,   85, 1001,   21,   27,  100],
       [   7,   19,   39,   31,  347,   14,    9],
       [  46,   12,   30,   60,   15,  677,    7],
       [  29,   20,   51,  142,   18,    9,  455]])

In [20]:
# Print accuracy of the model
print(clf_neural.score(X_test,y_test))

0.8244387191755613
