In [2]:
import math
import json
import numpy as np
from collections import Counter
import pandas as pd
import random
from random import shuffle
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import svm
from numpy import linalg 

In [2]:
def tfidf(articles):
    length = len(articles)  
    Word = {}
    bigdata = []
    for x in articles:
        for y in set(x):
            if y in Word:
                Word[y] += 1
            else:
                Word[y] = 1
                
            
    for x in articles: 
        dic = {}
        count =  Counter(x)
        for word in set(x):
            
            max_count = max(count.values())
            
            num = Word[word]
            
            idf = math.log(length/(1+num))
            
            tf = (0.5+(0.5*(count[word]/len(x))/max_count))
            
            dic[word] = idf*tf
            
        bigdata.append(dic)
        
    Data = pd.DataFrame(data=bigdata)
    Data = pd.DataFrame.as_matrix(Data.fillna(0))
    Data = np.matrix(Data)
    return Data

In [3]:
with open('../datas/bbc_preprocessed.json') as f:
    data=json.load(f)

articles = [x["content"] for x in data]
Data = tfidf(articles)

In [46]:
np.sum(Data,axis=0)

matrix([[  4.49437056,   4.49321471,   4.49325704, ...,   4.49485093,
          12.44217723,   4.49278261]])

In [4]:
def yconvert(y,num):
    Y = np.zeros((len(y),num))
    for i in range(len(y)):
        Y[i,y[i]] = 1
    return Y

In [5]:
cate = [x["category"] for x in data]
map = {'Technology':0,'Entertainment & Arts':1,'Business':2,'Health':3,'Science & Environment':4}
categ = [map[x] for x in cate]


random.seed(123)
sam = random.sample(range(Data.shape[0]), Data.shape[0]//2)

train = Data[sam,]
valid = np.delete(Data,sam,0)

train_categ = [categ[i] for i in range(len(categ)) if i in sam]
valid_categ = np.delete(categ, sam, 0)

In [6]:
category = yconvert(categ,5)
train_category = category[sam,]
valid_category = np.delete(category,sam,0)

In [7]:
#Softmax function
def softmax(x):
    sm = (np.exp(x)/np.sum(np.exp(x),axis = 1))
    return np.matrix(sm)

#Cross-entropy loss fucntion
def cross_entropy(prob, y, lam,w):
    loss = -np.sum(np.multiply(np.log(prob), y)) +  lam*np.sum(abs(w))
    return loss

#Batch gradient descent
def batch_gradient(x, y, prob, lam, w):
    grad = np.dot(x.T, (y - prob)) + lam*abs(w)
    return grad

#Stoch gradient descent
def stoch_gradient(x, y, prob, lam, w):
    ran = random.randint(1, x.shape[0])
    grad = np.dot(x[ran:(ran+49)%x.shape[0],].T, (y - prob)[ran:(ran+49)%x.shape[0],]) + lam*w
    return grad

def main(x, y, lam, alpha, e, stoch):
    w = np.zeros((x.shape[1],y.shape[1]))
    w = np.matrix(w)
    n = len(x[:,1])
    
    prob = softmax(x * w)
    loss = -(1/n) * cross_entropy(prob, y, lam, w)
    grad = -(1/n) * batch_gradient(x, y, prob, lam, w)
    w = w + (alpha * grad)
    j=0
    
    for i in range(1000):
        prob = softmax(x * w)
        
        loss0 = loss
        
        loss = -(1/n) * cross_entropy(prob, y, lam, w)
        
        if stoch:
            j= (j+1)%x.shape[0]
            grad = -(1/50) * stoch_gradient(x, y, prob, lam, w)
            
        else:
            grad = -(1/n) * batch_gradient(x, y, prob, lam, w)
        
        w = w - (alpha * grad)
        #print((abs(loss0-loss)), 0.05)
        if (abs(loss0-loss) < e):
            break
        
    return w

def prediction(x,w):
    
    probs = softmax(np.dot(x,w))
    preds = np.argmax(probs, axis = 1)
    return preds

def rate(x,w,y):
    pred = prediction(x,w)
    return np.sum(pred.T == y)/len(y)

In [8]:
#Cross-validation to choose lambda
def choose_lam(lam, Data, category, fold):
    best_lam = 0
    best_rate = 0
    datas = Data
    for l in lam:
        r = 0 
        for k in range(1,(datas.shape[0]//fold)+1):
            leave = range((k-1)*fold,(k-1)*fold+fold)
            aaa = main(np.delete(datas,leave,0), np.delete(category,leave,0), l, 0.1, False, 0.1)
            r = r + rate(datas[leave,], aaa, categ[(k-1)*1600:(k-1)*1600+1600])
            print (r)
        if r/(datas.shape[0]//fold) > best_rate:
            best_rate = r/9
            best_lam = l
    return best_rate, best_lam

In [9]:
lam = 0
W = main(train, train_category, lam, 0.1, 0.01, False)
r = rate(valid, W, valid_categ)

In [None]:
#Choose the number of principle components
def choose_k(Data):
    S= linalg.svd(Data, full_matrices=False, compute_uv= False) 
    summ = sum(S)
    for k in range(1,len(S)):
        if sum(S[:k])/summ > 0.99:
            break
    return k

In [15]:
pca = PCA(n_components = 100)
pca.fit(Data)

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [16]:
newdata = np.matrix(pca.transform(Data))
newtrain = newdata[sam,]
newvalid = np.delete(newdata,sam,0)

In [47]:
model = svm.SVC(kernel='linear',C=10, gamma=100, decision_function_shape='ovo')
model.fit(newvalid, valid_categ)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma=100, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
#plt.plot(pca.explained_variance_, linewidth=2)
pred = model.predict(newtrain)
np.sum(pred == train_categ)/len(train_categ)
pred

array([0, 2, 2, ..., 2, 4, 0])