Get List of top Bigrams from Training Set

In [1]:
import os

os.chdir('../data')

trainfolder = "ReutersC50\\C50train"  
testfolder = "ReutersC50\\C50test"
Names = [name for name in os.listdir(trainfolder)]
trainAuthorFolder = [trainfolder + '\\' + name + "\\" for name in Names]
testAuthorFolder = [testfolder + '\\' + name + "\\" for name in Names]

In [2]:
import glob

for author in trainAuthorFolder:
    read_files = glob.glob(author + "*.txt")

    with open("ReutersC50\\combined\\" + author.split('\\')[-2] + ".txt", "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                outfile.write(infile.read())

In [3]:
combinedGlob = glob.glob("ReutersC50\\combined\\" + '*.txt')

In [4]:
import re
import nltk
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
topBigrams = []
ps = PorterStemmer()

for files in combinedGlob:
    f = open(files)
    raw = f.read()

    #keep only alphanumeric and spaces
    pattern = re.compile('([^\s\w]|_)+')
    raw = pattern.sub('', raw).lower()

    #split into words "tokens"
    tokens = nltk.word_tokenize(raw)

    #remove stop words
    stop_words = set(stopwords.words('english')) 
    tokens = [w for w in tokens if not w in stop_words] 
    tokens = [ps.stem(w) for w in tokens]
    
    #bigrams
    bgs = nltk.bigrams(tokens)
    fdist = nltk.FreqDist(bgs)
    
    sortBigram = sorted(fdist.items(), key = itemgetter(1), reverse = True)
    topBigrams.append(sortBigram[0:10])

In [6]:
topBigrams = list(set().union(*topBigrams))

In [7]:
len(topBigrams)

479

Now let's get a data frame where each row is a document, sans stopwords and stems

In [8]:
train_frame = []

for author in trainAuthorFolder:
    folderGlob = glob.glob(author + "*.txt")
    
    for files in folderGlob:
        f = open(files)
        raw = f.read()

        #keep only alphanumeric and spaces
        pattern = re.compile('([^\s\w]|_)+')
        raw = pattern.sub('', raw).lower()

        #split into words "tokens"
        tokens = nltk.word_tokenize(raw)

        #remove stop words
        stop_words = set(stopwords.words('english')) 
        tokens = [w for w in tokens if not w in stop_words] 
        tokens = [ps.stem(w) for w in tokens]

        train_frame.append((tokens, author))

In [9]:
import pandas as pd

In [10]:
dfTrain = pd.DataFrame(columns=['document', 'author', 'test/train'])

for count, document in enumerate(train_frame):
    dfTrain.loc[count] = [' '.join(document[0]), document[1].split('\\')[2], document[1].split('\\')[1]]

In [11]:
dfTrain

Unnamed: 0,document,author,test/train
0,internet may overflow new technolog crime cybe...,AaronPressman,C50train
1,us postal servic announc wednesday plan boost ...,AaronPressman,C50train
2,elementari school student access internet lear...,AaronPressman,C50train
3,influenti internet organis back away propos dr...,AaronPressman,C50train
4,influenti internet organis back away propos dr...,AaronPressman,C50train
5,group lead trademark specialist plan releas re...,AaronPressman,C50train
6,compani california sell book consum canada web...,AaronPressman,C50train
7,us law govern trillion dollar futur market cou...,AaronPressman,C50train
8,suprem court justic wednesday sharpli question...,AaronPressman,C50train
9,internet continu grow leap bound year onlin se...,AaronPressman,C50train


That was training, we need test too:

In [12]:
test_frame = []

for author in testAuthorFolder:
    folderGlob = glob.glob(author + "*.txt")
    
    for files in folderGlob:
        f = open(files)
        raw = f.read()

        #keep only alphanumeric and spaces
        pattern = re.compile('([^\s\w]|_)+')
        raw = pattern.sub('', raw).lower()

        #split into words "tokens"
        tokens = nltk.word_tokenize(raw)

        #remove stop words
        stop_words = set(stopwords.words('english')) 
        tokens = [w for w in tokens if not w in stop_words] 
        tokens = [ps.stem(w) for w in tokens]

        test_frame.append((tokens, author))

dfTest = pd.DataFrame(columns=['document', 'author', 'test/train'])

for count, document in enumerate(test_frame):
    dfTest.loc[count] = [' '.join(document[0]), document[1].split('\\')[2], document[1].split('\\')[1]]

In [13]:
dfTest

Unnamed: 0,document,author,test/train
0,us senat tuesday sharpli critic new secur exch...,AaronPressman,C50test
1,two member congress criticis feder reserv thur...,AaronPressman,C50test
2,commut stuck traffic leesburg pike northern vi...,AaronPressman,C50test
3,broad coalit corpor went capitol hill tuesday ...,AaronPressman,C50test
4,internet new product come go blink eye time sa...,AaronPressman,C50test
5,legisl continu debat wednesday one difficult i...,AaronPressman,C50test
6,top feder regul thursday urg bank care issu cr...,AaronPressman,C50test
7,congress reviv debat encrypt export polici wee...,AaronPressman,C50test
8,congress reviv debat encrypt export polici wee...,AaronPressman,C50test
9,feder bank regul begun prod us financi institu...,AaronPressman,C50test


Combine the two for our full sample:

In [14]:
dfFull = pd.concat([dfTrain, dfTest])

In [15]:
dfFull

Unnamed: 0,document,author,test/train
0,internet may overflow new technolog crime cybe...,AaronPressman,C50train
1,us postal servic announc wednesday plan boost ...,AaronPressman,C50train
2,elementari school student access internet lear...,AaronPressman,C50train
3,influenti internet organis back away propos dr...,AaronPressman,C50train
4,influenti internet organis back away propos dr...,AaronPressman,C50train
5,group lead trademark specialist plan releas re...,AaronPressman,C50train
6,compani california sell book consum canada web...,AaronPressman,C50train
7,us law govern trillion dollar futur market cou...,AaronPressman,C50train
8,suprem court justic wednesday sharpli question...,AaronPressman,C50train
9,internet continu grow leap bound year onlin se...,AaronPressman,C50train


Now we want to count the times each common bigram appears in each document:

In [16]:
topBigrams

[(('big', 'three'), 27),
 (('state', u'regul'), 40),
 ((u'analyst', 'said'), 29),
 (('russian', 'oil'), 27),
 (('ford', 'motor'), 45),
 ((u'select', u'committe'), 39),
 (('michael', u'andrea'), 27),
 ((u'jardin', u'fleme'), 44),
 ((u'newspap', 'said'), 32),
 ((u'juli', '1'), 27),
 ((u'financ', u'ministri'), 41),
 ((u'execut', 'said'), 22),
 (('phone', u'compani'), 21),
 (('human', u'right'), 19),
 (('told', u'reuter'), 44),
 ((u'analyst', 'said'), 60),
 ((u'analyst', 'said'), 26),
 (('pay', u'televis'), 21),
 (('microsoft', 'corp'), 32),
 (('told', u'reuter'), 32),
 ((u'unit', u'state'), 32),
 ((u'assembl', 'plant'), 53),
 ((u'genet', u'test'), 24),
 (('scottish', u'amic'), 48),
 ((u'chemic', u'busi'), 24),
 (('air', u'franc'), 90),
 (('fund', u'manag'), 77),
 (('21', '90'), 38),
 (('last', 'year'), 15),
 (('billion', u'pound'), 29),
 (('last', 'week'), 15),
 (('told', u'reuter'), 37),
 ((u'analyst', 'said'), 53),
 (('gold', u'price'), 27),
 ((u'compani', 'said'), 22),
 (('air', u'fran

In [17]:
' '.join(topBigrams[0][0])

'big three'

In [18]:
stringBigram = [' '.join(bigram[0]) for bigram in topBigrams]

In [19]:
stringBigram

['big three',
 u'state regul',
 u'analyst said',
 'russian oil',
 'ford motor',
 u'select committe',
 u'michael andrea',
 u'jardin fleme',
 u'newspap said',
 u'juli 1',
 u'financ ministri',
 u'execut said',
 u'phone compani',
 u'human right',
 u'told reuter',
 u'analyst said',
 u'analyst said',
 u'pay televis',
 'microsoft corp',
 u'told reuter',
 u'unit state',
 u'assembl plant',
 u'genet test',
 u'scottish amic',
 u'chemic busi',
 u'air franc',
 u'fund manag',
 '21 90',
 'last year',
 u'billion pound',
 'last week',
 u'told reuter',
 u'analyst said',
 u'gold price',
 u'compani said',
 u'air franc',
 u'motor corp',
 u'told reuter',
 u'provision legislatur',
 u'industri sourc',
 u'rail franchis',
 'british telecom',
 'third quarter',
 'first half',
 u'industri expert',
 u'sun microsystem',
 u'gener motor',
 u'analyst said',
 'silver king',
 u'select committe',
 'newsdesk 3124088787',
 u'premier leagu',
 'one analyst',
 u'dwayn andrea',
 'said one',
 u'governor chri',
 u'foreign bank',


In [20]:
for bigram in stringBigram:
    dfTrain[bigram] = dfTrain['document'].str.count(bigram)

for bigram in stringBigram:
    dfTest[bigram] = dfTest['document'].str.count(bigram)    

In [21]:
dfTrain

Unnamed: 0,document,author,test/train,big three,state regul,analyst said,russian oil,ford motor,select committe,michael andrea,...,half year,barrick gold,british aerospac,trillion yen,171 542,ivori coast,china said,silicon valley,world trade,vice presid
0,internet may overflow new technolog crime cybe...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,us postal servic announc wednesday plan boost ...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,elementari school student access internet lear...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,influenti internet organis back away propos dr...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,influenti internet organis back away propos dr...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,group lead trademark specialist plan releas re...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,compani california sell book consum canada web...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,us law govern trillion dollar futur market cou...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,suprem court justic wednesday sharpli question...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,internet continu grow leap bound year onlin se...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
trainBigramCount = dfTrain.iloc[:,3:]
#trainBigramCount = trainBigramCount.div(trainBigramCount.sum(axis=1), axis=0)
#trainBigramCount = trainBigramCount.dropna()
trainBigramCount

Unnamed: 0,big three,state regul,analyst said,russian oil,ford motor,select committe,michael andrea,jardin fleme,newspap said,juli 1,...,half year,barrick gold,british aerospac,trillion yen,171 542,ivori coast,china said,silicon valley,world trade,vice presid
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
testBigramCount = dfTest.iloc[:,3:]
#testBigramCount = testBigramCount.div(testBigramCount.sum(axis=1), axis=0)
#testBigramCount = testBigramCount.dropna()
testBigramCount

Unnamed: 0,big three,state regul,analyst said,russian oil,ford motor,select committe,michael andrea,jardin fleme,newspap said,juli 1,...,half year,barrick gold,british aerospac,trillion yen,171 542,ivori coast,china said,silicon valley,world trade,vice presid
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
train_document_frame = dfTrain.iloc[:,1:3]
train_document_frame

Unnamed: 0,author,test/train
0,AaronPressman,C50train
1,AaronPressman,C50train
2,AaronPressman,C50train
3,AaronPressman,C50train
4,AaronPressman,C50train
5,AaronPressman,C50train
6,AaronPressman,C50train
7,AaronPressman,C50train
8,AaronPressman,C50train
9,AaronPressman,C50train


In [25]:
test_document_frame = dfTest.iloc[:,1:3]
test_document_frame

Unnamed: 0,author,test/train
0,AaronPressman,C50test
1,AaronPressman,C50test
2,AaronPressman,C50test
3,AaronPressman,C50test
4,AaronPressman,C50test
5,AaronPressman,C50test
6,AaronPressman,C50test
7,AaronPressman,C50test
8,AaronPressman,C50test
9,AaronPressman,C50test


In [29]:
os.chdir("ReutersC50")
print os.getcwd()

WindowsError: [Error 2] The system cannot find the file specified: 'ReutersC50'

In [34]:
test_document_frame.to_csv("test_documents.csv")
train_document_frame.to_csv("train_documents.csv")

testBigramCount.to_csv("test_bigrams.csv")
trainBigramCount.to_csv("train_bigrams.csv")

C:\Users\chenson\Documents\GitHub\STA380\data\ReutersC50
