In [1]:
import pandas as pd
import numpy as np
import sys, os
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem import *
from nltk.stem.porter import *

def readData(path="uci-news-aggregator.csv"):
    """Load the news-aggregator dataset into a DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to
            "uci-news-aggregator.csv" in the current working directory.

    Returns:
        pandas.DataFrame with the parsed contents of the CSV file.
    """
    return pd.read_csv(path)

def removeStopWords(x):
    """Strip English stop words from each headline and stem what remains.

    Every entry of ``x`` is split on single spaces, lower-cased, filtered
    against the English stop-word list and passed through a Porter stemmer;
    the surviving tokens are re-joined with single spaces.

    Args:
        x: Mutable, integer-indexable sequence of strings (for example a list
            or the array returned by a DataFrame column's ``.values``).

    Returns:
        The same object ``x``. Its entries are overwritten in place, so the
        caller's original strings are replaced by the cleaned versions.
    """
    stemmer = PorterStemmer()
    stopSet = set(stopwords.words('english'))

    for i in range(len(x)):
        # Lower-case BEFORE the stop-word lookup so that capitalised stop
        # words such as "The" or "Is" are filtered out as well.
        lowered = [word.lower() for word in x[i].split(" ")]
        x[i] = ' '.join(stemmer.stem(word) for word in lowered if word not in stopSet)

    return x
  
def splitData(news_df, test_size=0.10, random_state=42):
    """Split headlines and their category labels into train and test sets.

    Args:
        news_df: DataFrame providing a 'TITLE' column (headline text) and a
            'CATEGORY' column (label).
        test_size: Proportion (or absolute count) of rows held out for
            testing; forwarded to ``train_test_split``. Defaults to 0.10.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` exactly as returned by
        ``train_test_split`` for the title and category arrays.
    """
    titles = news_df['TITLE'].values
    labels = news_df['CATEGORY'].values

    return train_test_split(titles, labels,
                            test_size=test_size, random_state=random_state)

In [4]:
def _countWordsPerCategory(headlines, labels, categories):
    """Count how often each lower-cased token occurs in each category's headlines."""
    word_count = dict((category, {}) for category in categories)
    for headline, label in zip(headlines, labels):
        counts = word_count[label]  # KeyError if the label is not a known category
        for word in headline.lower().split():
            counts[word] = counts.get(word, 0) + 1
    return word_count


def _predictCategory(headline, word_count, categories):
    """Return the category of the headline word with the strongest category affinity."""
    # Fallback used for headlines with no tokens; it is also what a headline
    # made only of unseen words resolves to (all scores 0, first category wins).
    best_category = categories[0]
    best_score = -1.0
    for word in headline.lower().split():
        counts = [word_count[category].get(word, 0) for category in categories]
        # The +1 keeps the denominator non-zero for unseen words and damps the
        # score of words that were observed only a handful of times.
        denominator = 1 + sum(counts)
        for category, count in zip(categories, counts):
            score = 1.0 * count / denominator
            # Strict '>' keeps the first word / first category on ties.
            if score > best_score:
                best_category, best_score = category, score
    return best_category


def getProbabilityMethodAccuracy(x_train, x_test, y_train, y_test, categories=None):
    """Score a word-affinity headline classifier and return its accuracy in percent.

    Training counts, per category, how often each lower-cased whitespace
    token appears in that category's headlines. To classify a headline, each
    of its words is scored for every category as
    ``count_in_category / (1 + count_in_all_categories)``; the single
    (word, category) pair with the highest score decides the label of the
    whole headline. Ties go to the earliest word and, within a word, to the
    earliest category in ``categories``.

    Args:
        x_train: Iterable of training headline strings.
        x_test: Iterable of test headline strings.
        y_train: Iterable of training labels, aligned with ``x_train``.
        y_test: Iterable of true test labels, aligned with ``x_test``.
        categories: Optional ordered sequence of the labels to consider.
            Defaults to ['t', 'e', 'm', 'b'].

    Returns:
        Accuracy of the predictions on the test set, scaled to 0-100.

    Raises:
        ValueError: If ``categories`` is given but empty.
        KeyError: If ``y_train`` contains a label that is not in ``categories``.
    """
    if categories is None:
        categories = ['t', 'e', 'm', 'b']
    else:
        categories = list(categories)
    if not categories:
        raise ValueError("categories must contain at least one label")

    word_count = _countWordsPerCategory(x_train, y_train, categories)

    # A headline without any tokens gets the first category instead of
    # crashing on max() of an empty list.
    y_pred = [_predictCategory(headline, word_count, categories) for headline in x_test]

    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(list(y_test), y_pred) * 100


In [5]:
def main():
    """Load the dataset, split it, and print the probabilistic model's accuracy.

    Reads the news CSV via readData(), creates the train/test split via
    splitData(), evaluates getProbabilityMethodAccuracy() on it and writes the
    resulting percentage to standard output.
    """
    news_df = readData()
    x_train, x_test, y_train, y_test = splitData(news_df)
    accuracy = getProbabilityMethodAccuracy(x_train, x_test, y_train, y_test)
    # A single pre-formatted argument prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("the probabilistic model %s" % accuracy)

# Entry point: run the evaluation only when this code is executed directly
# (its module name is '__main__'), not when it is imported.
if __name__ == '__main__':
    main()

the probabilistic model 91.08470242886227
