In [1]:
from __future__ import print_function
import collections
import os
import string
import sklearn
from sklearn import datasets
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# explore the dataset:
from itertools import islice
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import pandas as pd
import numpy as np

In [2]:
# Shared corpus state: filled in by cleanData()/updateCategories(), read by makeTables().
documents = dict()    # file name -> list of tokens for that document
catDocs = dict()      # category  -> number of documents in the category
catTerms = dict()     # category  -> {term: frequency} over the category's documents
dicCategory = dict()  # file name -> list of categories the document belongs to
data = str()          # placeholder for the dataset contents; starts as an empty string

In [3]:
def makeDataSet(path):
    """Load a directory-per-category text corpus with ``sklearn.datasets.load_files``.

    Args:
        path: container folder passed straight to ``load_files``.

    Returns:
        Whatever ``load_files`` returns for ``path``. ``cleanData`` and
        ``makeTables`` read its ``filenames``, ``data`` and ``target_names``
        attributes.

    Notes:
        ``encoding=None`` is passed, and ``cleanData`` decodes each ``data``
        item as UTF-8 itself. ``shuffle=True`` with ``random_state=0`` is
        passed on every call.
    """
    # The original also assigned ``dataSet.data`` to a *local* named ``data``;
    # that never touched the module-level ``data`` and was dead code.
    return sklearn.datasets.load_files(
        path,
        description=None,
        categories=None,
        load_content=True,
        shuffle=True,
        encoding=None,
        random_state=0,
    )




In [4]:
# merge 2 dictionaries
def mergeDicts(dic1, dic2):
    """Add the counts of two frequency mappings and return the result as a ``Counter``.

    The mappings are folded onto an empty ``Counter`` one after the other with
    ``Counter`` addition, so any entry whose running total is not positive is
    dropped along the way.
    """
    merged = Counter()
    for mapping in (dic1, dic2):
        merged = merged + Counter(dict(mapping))
    return merged

# count dictionary's words frequency
def makeFrequencyDic(words):
    """Count how often each item occurs in ``words``.

    Args:
        words: iterable of hashable items (in this notebook, a list of tokens).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in first-occurrence order.
    """
    # One pass with Counter instead of calling words.count() once per token,
    # which was quadratic in the document length.
    return dict(Counter(words))

# update frequency for categories
def updateCategories(categories,words):
    """Fold the term frequencies of ``words`` into ``catTerms`` for each category.

    A category seen for the first time is given the frequency dict of
    ``words`` as-is; a known category has that dict merged into its existing
    entry with ``mergeDicts``.
    """
    termCounts = makeFrequencyDic(words)
    for cat in categories:
        if cat not in catTerms:
            catTerms[cat] = termCounts
            continue
        catTerms[cat] = mergeDicts(catTerms[cat], termCounts)
            
# sort frequency dic in descending order
def sortFreqDict(freqdict):
    """Return ``(frequency, key)`` tuples of ``freqdict`` in descending order.

    Tuples are ordered by frequency first and by key second, both descending.
    """
    ascending = sorted((freqdict[term], term) for term in freqdict)
    return ascending[::-1]

# "Return first n items of the iterable as a list"
def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list (fewer if it runs out)."""
    head = islice(iterable, n)
    return [item for item in head]


In [5]:
def cleanData(dataSet):
    """Tokenise, filter and stem the documents of ``dataSet`` and update the corpus globals.

    The category of each file is the name of its parent directory. The
    function updates these module-level dicts:

    * ``documents``   -- file name -> list of lower-cased, stemmed tokens
    * ``catDocs``     -- category  -> number of files seen in that category
    * ``dicCategory`` -- file name -> list of categories the file appeared under
    * ``catTerms``    -- category  -> term frequencies (via ``updateCategories``)

    Only the documents that belong to ``dataSet`` are filtered and counted.
    The original looped over every entry of the global ``documents``, so a
    second call would stem already-stemmed documents again and add their
    frequencies to ``catTerms`` a second time.

    Args:
        dataSet: object with parallel ``filenames`` and ``data`` sequences,
            where each ``data`` item is bytes decodable as UTF-8.
    """
    # file name -> categories seen for that file in *this* call, first-seen order
    loaded = collections.OrderedDict()

    for file, raw in zip(dataSet.filenames, dataSet.data):
        category = os.path.basename(os.path.dirname(file))
        fileName = os.path.basename(file)
        documents[fileName] = word_tokenize(raw.decode("utf-8"))
        # <category, num of docs>
        catDocs[category] = catDocs.get(category, 0) + 1
        # <doc, list of categories>
        dicCategory.setdefault(fileName, []).append(category)
        loaded.setdefault(fileName, []).append(category)

    # Stop-word filtering, lower-casing and stemming.
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(string.punctuation)
    stop_words.update(['p', '+/-', '-/+'])
    ps = PorterStemmer()  # one stemmer for the whole run

    for fileName, categories in loaded.items():
        filtered_sentence = []
        for w in documents[fileName]:
            w = w.lower()
            # drop stop words, punctuation and single-character tokens
            if w in stop_words or len(w) <= 1:
                continue
            try:
                filtered_sentence.append(ps.stem(w))
            except Exception:
                # best-effort: keep the unstemmed token if the stemmer raises
                filtered_sentence.append(w)
        documents[fileName] = filtered_sentence
        updateCategories(categories, filtered_sentence)


In [6]:
from IPython.display import display
def makeTables(dataSet):
    """Print the category count and display two summary tables for the corpus.

    1. ``Category`` / ``NumberOfFiles`` built from ``catDocs``, sorted by
       category name.
    2. One row per entry of ``catTerms`` with its ten most frequent terms as
       ``[term, frequency]`` pairs, in the order given by ``sortFreqDict``.

    A category with fewer than ten distinct terms is padded with ``None``.
    Without the padding the 11-column frame could not be built when no
    category reaches ten terms.

    Args:
        dataSet: object with a ``target_names`` sequence; only its length is used.
    """
    topN = 10

    # print number of categories
    print('Number of categories', len(dataSet.target_names))

    # categories and number of docs for each of them
    docsPerCategory = [[category, catDocs[category]] for category in sorted(catDocs)]
    display(pd.DataFrame(docsPerCategory, columns=['Category', 'NumberOfFiles']))

    # frequencies table: category followed by its top terms
    termColumns = ['term %d' % rank for rank in range(1, topN + 1)]
    rows = []
    for category, freqDic in catTerms.items():
        # sortFreqDict yields (frequency, term) tuples, most frequent first
        topTerms = [[word, freq] for freq, word in sortFreqDict(freqDic)[:topN]]
        topTerms.extend([None] * (topN - len(topTerms)))
        rows.append([category] + topTerms)

    display(pd.DataFrame(rows, columns=['Category'] + termColumns))

In [7]:
# Root folder of the extracted OHSUMED corpus. Set the OHSUMED_DIR environment
# variable to point at a different location; the default is the path that was
# previously hard-coded here.
DATA_ROOT = os.environ.get(
    "OHSUMED_DIR",
    "C:\\Users\\Estif\\Downloads\\ohsumed-first-20000-docs.tar\\ohsumed-first-20000-docs",
)

# Load the training split, build the corpus statistics, then show the tables.
dataSet = makeDataSet(os.path.join(DATA_ROOT, "training"))
cleanData(dataSet)
makeTables(dataSet)

Number of categories 23


Unnamed: 0,Category,NumberOfFiles
0,C01,423
1,C02,158
2,C03,65
3,C04,1163
4,C05,283
5,C06,588
6,C07,100
7,C08,473
8,C09,125
9,C10,621


Unnamed: 0,Category,term 1,term 2,term 3,term 4,term 5,term 6,term 7,term 8,term 9,term 10
0,C19,"[patient, 568]","[diabet, 264]","[level, 155]","[thyroid, 148]","[studi, 146]","[less, 146]","[glucos, 136]","[diseas, 123]","[subject, 107]","[insulin, 102]"
1,C04,"[patient, 2886]","[cell, 1614]","[tumor, 1450]","[cancer, 1000]","[carcinoma, 904]","[case, 700]","[studi, 695]","[use, 619]","[treatment, 566]","[diseas, 550]"
2,C20,"[patient, 1275]","[infect, 601]","[cell, 552]","[diseas, 388]","[hiv, 360]","[studi, 345]","[human, 328]","[use, 310]","[immunodefici, 306]","[aid, 289]"
3,C23,"[patient, 4887]","[group, 1207]","[studi, 1043]","[use, 1019]","[less, 861]","[case, 830]","[diseas, 789]","[result, 781]","[effect, 710]","[year, 688]"
4,C08,"[patient, 1172]","[lung, 465]","[pulmonari, 420]","[group, 314]","[studi, 297]","[diseas, 289]","[less, 264]","[use, 250]","[increas, 247]","[respiratori, 234]"
5,C14,"[patient, 3831]","[arteri, 1273]","[group, 1096]","[coronari, 1053]","[less, 1039]","[ventricular, 898]","[studi, 876]","[heart, 874]","[pressur, 825]","[use, 818]"
6,C21,"[patient, 1083]","[injuri, 589]","[use, 375]","[alcohol, 346]","[studi, 330]","[result, 264]","[fractur, 245]","[case, 242]","[effect, 220]","[trauma, 217]"
7,C07,"[patient, 151]","[cell, 64]","[periodont, 59]","[treatment, 57]","[case, 55]","[use, 47]","[tumor, 47]","[diseas, 43]","[studi, 40]","[primari, 40]"
8,C02,"[patient, 287]","[infect, 274]","[viru, 180]","[human, 139]","[cell, 113]","[immunodefici, 101]","[diseas, 98]","[studi, 88]","[hiv, 87]","[case, 83]"
9,C10,"[patient, 1531]","[studi, 396]","[group, 371]","[use, 324]","[pain, 270]","[diseas, 264]","[result, 248]","[clinic, 245]","[less, 243]","[case, 243]"


In [70]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB

In [85]:
# data_train = dataSet
# data_test = makeDataSet("C:\\Users\\Estif\\Downloads\\ohsumed-first-20000-docs.tar\\ohsumed-first-20000-docs"
#             "\\test")
# 
# y_train = data_train.target
# y_test = data_test.target
# 
# # jj = list(documents.values())
# # mergedlist = []
# # for hh in jj:
# #     mergedlist.append(' '.join(hh))


TypeError: 'list' object is not callable

In [84]:
# print("Extracting features from training")
# vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
# X_train = vectorizer.fit_transform(mergedlist)


Extracting features from training
  (0, 12656)	1.0
  (1, 5791)	1.0
  (3, 20607)	1.0
  (4, 19117)	1.0
  (5, 1837)	1.0
  (6, 1782)	1.0
  (7, 17164)	1.0
  (8, 18359)	1.0
  (9, 6634)	1.0
  (10, 7932)	1.0
  (11, 21728)	1.0
  (12, 15962)	1.0
  (13, 21994)	1.0
  (14, 1840)	1.0
  (15, 17164)	1.0
  (16, 21506)	1.0
  (17, 5791)	1.0
  (19, 20607)	1.0
  (20, 19117)	1.0
  (21, 1837)	1.0
  (22, 1782)	1.0
  (23, 17294)	1.0
  (24, 8402)	1.0
  (25, 15962)	1.0
  (26, 4971)	1.0
  :	:
  (704770, 9467)	1.0
  (704771, 6016)	1.0
  (704772, 10811)	1.0
  (704773, 2151)	1.0
  (704774, 1615)	1.0
  (704775, 15486)	1.0
  (704776, 13085)	1.0
  (704777, 20835)	1.0
  (704778, 15552)	1.0
  (704779, 9467)	1.0
  (704780, 2200)	1.0
  (704781, 16272)	1.0
  (704782, 19161)	0.6602359700475633
  (704782, 16128)	0.7510582293373484
  (704783, 9561)	1.0
  (704784, 9467)	1.0
  (704785, 21903)	1.0
  (704786, 9467)	1.0
  (704787, 2200)	1.0
  (704788, 16272)	1.0
  (704789, 19441)	1.0
  (704790, 19161)	0.6602359700475633
  (704790, 