In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import json
from sklearn.feature_extraction.text import TfidfVectorizer

## Coarse Data Breakdown

In [9]:
# Load the pickled object stored at data/20news/coarse/df.pkl.
# The context manager closes the file handle as soon as loading finishes,
# instead of leaving an anonymous open() result for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/20news/coarse/df.pkl', 'rb') as pkl_file:
    twenty_data_coarse = pickle.load(pkl_file)
twenty_data_coarse.head()

Unnamed: 0,sentence,label
0,from: (where's my thing)\nsubject: what car i...,rec
1,from: (guy kuo)\nsubject: si clock poll - fin...,comp
2,from: (thomas e willis)\nsubject: pb question...,comp
3,from: jgreen@amber (joe green)\nsubject: re: w...,comp
4,from: (jonathan mcdowell)\nsubject: re: shutt...,sci


In [12]:
# Read the seed-word JSON for the coarse 20news labels and display it.
with open('data/20news/coarse/seedwords.json') as seed_file:
    twenty_data_coarse_seed = json.loads(seed_file.read())
twenty_data_coarse_seed

{'alt': ['atheism', 'atheists', 'religion', 'objective'],
 'comp': ['graphics', 'windows', 'scsi', 'mac'],
 'misc': ['sale', 'offer', 'shipping', 'forsale'],
 'rec': ['car', 'bike', 'game', 'team'],
 'sci': ['encryption', 'circuit', 'candida', 'space'],
 'talk': ['turkish', 'gun', 'jews', 'armenian'],
 'soc': ['church', 'jesus', 'christ', 'christians']}

In [14]:
# Load the pickled object stored at data/nyt/coarse/df.pkl.
# The context manager closes the file handle as soon as loading finishes,
# instead of leaving an anonymous open() result for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/nyt/coarse/df.pkl', 'rb') as pkl_file:
    nyt_data_coarse = pickle.load(pkl_file)
nyt_data_coarse.head()

Unnamed: 0,sentence,label
0,"nasa, in preparation for a spacewalk on saturd...",science
1,if professional pride and strong defiance can ...,sports
2,"admittedly, the language is reconstructed and ...",arts
3,"palermo, sicily — roberta vinci beat top-seede...",sports
4,the argentine soccer club san lorenzo complete...,sports


In [15]:
# Read the seed-word JSON for the coarse NYT labels and display it.
with open('data/nyt/coarse/seedwords.json') as seed_file:
    nyt_data_coarse_seed = json.loads(seed_file.read())
nyt_data_coarse_seed

{'arts': ['music', 'orchestra', 'album', 'opera', 'ballet'],
 'business': ['companies', 'euro', 'economy', 'batteries', 'sales'],
 'science': ['space', 'researchers', 'scientists', 'research', 'science'],
 'sports': ['yankees', 'cup', 'league', 'basketball', 'golf'],
 'politics': ['republicans', 'senator', 'senate', 'democrats', 'election']}

In [22]:
# Empty token -> document-count tables, one per corpus; filled by a later cell.
docFreq_twenty, docFreq_nyt = {}, {}

# Raw text of every document, pulled out of the `sentence` column as arrays.
twenty_sentences = twenty_data_coarse['sentence'].values
nyt_sentences = nyt_data_coarse['sentence'].values

In [26]:
def _count_document_frequency(sentences):
    """Return a dict mapping each token to the number of documents containing it.

    Parameters
    ----------
    sentences : iterable of str
        One string per document. Tokens are obtained by whitespace splitting.

    Returns
    -------
    dict
        ``{token: n_documents_containing_token}``. A token is counted at most
        once per document, however many times it occurs in that document.
    """
    doc_freq = {}
    for sentence in sentences:
        # split() with no arguments already ignores leading/trailing whitespace;
        # set() collapses repeats so each document contributes at most 1.
        for token in set(sentence.split()):
            # dict.get replaces the former bare `except:`, which would also have
            # swallowed unrelated errors (and KeyboardInterrupt).
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq


# Rebinding (rather than accumulating into pre-existing dicts) keeps this cell
# idempotent: re-running it no longer double-counts every token.
docFreq_twenty = _count_document_frequency(twenty_sentences)
docFreq_nyt = _count_document_frequency(nyt_sentences)

In [28]:
# Corpus sizes (number of rows in each frame), used as N in log(N / df).
twenty_N, nyt_N = len(twenty_data_coarse), len(nyt_data_coarse)

# Empty token -> inverse-document-frequency tables; filled by a later cell.
inv_docFreq_twenty, inv_docFreq_nyt = {}, {}

In [31]:
# Inverse document frequency per token: log(N / df), where N is the corpus
# size and df is the number of documents containing the token.
# The existing dicts are filled in place.
inv_docFreq_twenty.update(
    (token, np.log(twenty_N / doc_count))
    for token, doc_count in docFreq_twenty.items()
)
inv_docFreq_nyt.update(
    (token, np.log(nyt_N / doc_count))
    for token, doc_count in docFreq_nyt.items()
)

In [10]:
# Load the pickled object stored at data/20news/fine/df.pkl.
# The context manager closes the file handle as soon as loading finishes,
# instead of leaving an anonymous open() result for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/20news/fine/df.pkl', 'rb') as pkl_file:
    twenty_data_fine = pickle.load(pkl_file)
twenty_data_fine.head()

Unnamed: 0,sentence,label
0,from: (where's my thing)\nsubject: what car i...,rec.autos
1,from: (guy kuo)\nsubject: si clock poll - fin...,comp.sys.mac.hardware
2,from: (thomas e willis)\nsubject: pb question...,comp.sys.mac.hardware
3,from: jgreen@amber (joe green)\nsubject: re: w...,comp.graphics
4,from: (jonathan mcdowell)\nsubject: re: shutt...,sci.space


In [13]:
# Read the seed-word JSON for the fine-grained 20news labels and display it.
with open('data/20news/fine/seedwords.json') as seed_file:
    twenty_data_fine_seed = json.loads(seed_file.read())
twenty_data_fine_seed

{'alt.atheism': ['atheism', 'atheists', 'rushdie'],
 'comp.graphics': ['jpeg', 'gif', 'images', 'graphics'],
 'comp.os.ms-windows.misc': ['driver', 'microsoft', 'dos', 'windows'],
 'comp.sys.ibm.pc.hardware': ['scsi', 'ide', 'controller', 'drive'],
 'comp.sys.mac.hardware': ['apple', 'mac', 'centris', 'powerbook'],
 'comp.windows.x': ['motif', 'x11r5', 'xterm', 'window'],
 'misc.forsale': ['sale', 'offer', 'shipping', 'forsale'],
 'rec.autos': ['car', 'mustang'],
 'rec.motorcycles': ['bike', 'dod', 'bikes'],
 'rec.sport.baseball': ['baseball'],
 'rec.sport.hockey': ['hockey', 'nhl'],
 'sci.crypt': ['encryption', 'clipper', 'key', 'chip'],
 'sci.electronics': ['circuit', 'current', 'wire', 'voltage'],
 'sci.med': ['candida', 'msg', 'vitamin'],
 'sci.space': ['space', 'shuttle', 'orbit'],
 'soc.religion.christian': ['jesus', 'christ', 'church', 'bible'],
 'talk.politics.guns': ['weapons', 'gun', 'guns', 'firearms'],
 'talk.politics.mideast': ['armenian', 'israeli', 'turkish', 'arab'],
 '