### Imports  
*All the Python imports go here.*

In [1]:
import os, re, math, time, glob
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

np.random.seed(2019)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/galo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Ignoring warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

-------------------------


# Read data

## Training and Test 

In [3]:
df_model_training = pd.read_csv('../data/training.csv')
df_model_test = pd.read_csv('../data/test.csv')

In [4]:
df_model_training.category.value_counts(normalize=True)

polit       0.203806
health      0.185390
crime       0.170657
economi     0.169429
educ        0.132597
unemploy    0.082873
corrupt     0.055249
Name: category, dtype: float64

In [5]:
df_model_test.category.value_counts(normalize=True)

polit       0.217454
health      0.187411
economi     0.161660
crime       0.153076
educ        0.138770
unemploy    0.084406
corrupt     0.057225
Name: category, dtype: float64

# Prediction

Using the training and test dataset, we will predict the category of news.

### SVM Classifier

In [6]:
frames = [df_model_training, df_model_test]
df_model = pd.concat(frames)

In [7]:
# Create TFIDF matrix.
corpus = df_model['tokens-headline-stopwords-stemming']
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(corpus)
print(matrix.shape)

(2328, 4166)


In [8]:
# Split my data on training y test.
num_training = len(df_model_training)
X_train = matrix[:num_training,:]
X_test = matrix[num_training:,:]
y_train = df_model["category"].values[:num_training]
y_test = df_model['category'].values[num_training:]

In [9]:
# Create the SVM classifier
clf = SVC(probability=True, kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [10]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     corrupt       1.00      0.23      0.37        40
       crime       0.82      0.84      0.83       107
     economi       0.86      0.81      0.84       113
        educ       0.98      0.82      0.89        97
      health       0.80      0.87      0.83       131
       polit       0.62      0.82      0.71       152
    unemploy       0.98      0.80      0.88        59

   micro avg       0.80      0.80      0.80       699
   macro avg       0.86      0.74      0.76       699
weighted avg       0.82      0.80      0.79       699



### Any Traditional ML Classifier

### Any Neural Network Classifier