Get List of top Bigrams from Training Set

In [1]:
import os

os.chdir('../data')

trainfolder = "ReutersC50\\C50train"  
testfolder = "ReutersC50\\C50test"
Names = [name for name in os.listdir(trainfolder)]
trainAuthorFolder = [trainfolder + '\\' + name + "\\" for name in Names]
testAuthorFolder = [testfolder + '\\' + name + "\\" for name in Names]

In [2]:
import glob

for author in trainAuthorFolder:
    read_files = glob.glob(author + "*.txt")

    with open("ReutersC50\\combined\\" + author.split('\\')[-2] + ".txt", "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                outfile.write(infile.read())

In [3]:
combinedGlob = glob.glob("ReutersC50\\combined\\" + '*.txt')

In [4]:
import re
import nltk
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
topBigrams = []
ps = PorterStemmer()

for files in combinedGlob:
    f = open(files)
    raw = f.read()

    #keep only alphanumeric and spaces
    pattern = re.compile('([^\s\w]|_)+')
    raw = pattern.sub('', raw).lower()

    #split into words "tokens"
    tokens = nltk.word_tokenize(raw)

    #remove stop words
    stop_words = set(stopwords.words('english')) 
    tokens = [w for w in tokens if not w in stop_words] 
    tokens = [ps.stem(w) for w in tokens]
    
    #bigrams
    bgs = nltk.bigrams(tokens)
    fdist = nltk.FreqDist(bgs)
    
    sortBigram = sorted(fdist.items(), key = itemgetter(1), reverse = True)
    topBigrams.append(sortBigram[0:100])

In [6]:
topBigrams = list(set().union(*topBigrams))

In [7]:
len(topBigrams)

4617

Now let's get a data frame where each row is a document, sans stopwords and stems

In [8]:
train_frame = []

for author in trainAuthorFolder:
    folderGlob = glob.glob(author + "*.txt")
    
    for files in folderGlob:
        f = open(files)
        raw = f.read()

        #keep only alphanumeric and spaces
        pattern = re.compile('([^\s\w]|_)+')
        raw = pattern.sub('', raw).lower()

        #split into words "tokens"
        tokens = nltk.word_tokenize(raw)

        #remove stop words
        stop_words = set(stopwords.words('english')) 
        tokens = [w for w in tokens if not w in stop_words] 
        tokens = [ps.stem(w) for w in tokens]

        train_frame.append((tokens, author))

In [9]:
import pandas as pd

In [10]:
dfTrain = pd.DataFrame(columns=['document', 'author', 'test/train'])

for count, document in enumerate(train_frame):
    dfTrain.loc[count] = [' '.join(document[0]), document[1].split('\\')[2], document[1].split('\\')[1]]

In [11]:
dfTrain

Unnamed: 0,document,author,test/train
0,internet may overflow new technolog crime cybe...,AaronPressman,C50train
1,us postal servic announc wednesday plan boost ...,AaronPressman,C50train
2,elementari school student access internet lear...,AaronPressman,C50train
3,influenti internet organis back away propos dr...,AaronPressman,C50train
4,influenti internet organis back away propos dr...,AaronPressman,C50train
5,group lead trademark specialist plan releas re...,AaronPressman,C50train
6,compani california sell book consum canada web...,AaronPressman,C50train
7,us law govern trillion dollar futur market cou...,AaronPressman,C50train
8,suprem court justic wednesday sharpli question...,AaronPressman,C50train
9,internet continu grow leap bound year onlin se...,AaronPressman,C50train


That was training, we need test too:

In [12]:
test_frame = []

for author in testAuthorFolder:
    folderGlob = glob.glob(author + "*.txt")
    
    for files in folderGlob:
        f = open(files)
        raw = f.read()

        #keep only alphanumeric and spaces
        pattern = re.compile('([^\s\w]|_)+')
        raw = pattern.sub('', raw).lower()

        #split into words "tokens"
        tokens = nltk.word_tokenize(raw)

        #remove stop words
        stop_words = set(stopwords.words('english')) 
        tokens = [w for w in tokens if not w in stop_words] 
        tokens = [ps.stem(w) for w in tokens]

        test_frame.append((tokens, author))

dfTest = pd.DataFrame(columns=['document', 'author', 'test/train'])

for count, document in enumerate(test_frame):
    dfTest.loc[count] = [' '.join(document[0]), document[1].split('\\')[2], document[1].split('\\')[1]]

In [13]:
dfTest

Unnamed: 0,document,author,test/train
0,us senat tuesday sharpli critic new secur exch...,AaronPressman,C50test
1,two member congress criticis feder reserv thur...,AaronPressman,C50test
2,commut stuck traffic leesburg pike northern vi...,AaronPressman,C50test
3,broad coalit corpor went capitol hill tuesday ...,AaronPressman,C50test
4,internet new product come go blink eye time sa...,AaronPressman,C50test
5,legisl continu debat wednesday one difficult i...,AaronPressman,C50test
6,top feder regul thursday urg bank care issu cr...,AaronPressman,C50test
7,congress reviv debat encrypt export polici wee...,AaronPressman,C50test
8,congress reviv debat encrypt export polici wee...,AaronPressman,C50test
9,feder bank regul begun prod us financi institu...,AaronPressman,C50test


Combine the two for our full sample:

In [14]:
dfFull = pd.concat([dfTrain, dfTest])

In [15]:
dfFull

Unnamed: 0,document,author,test/train
0,internet may overflow new technolog crime cybe...,AaronPressman,C50train
1,us postal servic announc wednesday plan boost ...,AaronPressman,C50train
2,elementari school student access internet lear...,AaronPressman,C50train
3,influenti internet organis back away propos dr...,AaronPressman,C50train
4,influenti internet organis back away propos dr...,AaronPressman,C50train
5,group lead trademark specialist plan releas re...,AaronPressman,C50train
6,compani california sell book consum canada web...,AaronPressman,C50train
7,us law govern trillion dollar futur market cou...,AaronPressman,C50train
8,suprem court justic wednesday sharpli question...,AaronPressman,C50train
9,internet continu grow leap bound year onlin se...,AaronPressman,C50train


Now we want to count the times each common bigram appears in each document:

In [16]:
topBigrams

[(('palm', 'beach'), 8),
 (('chicken', 'feet'), 16),
 ((u'transact', u'valu'), 15),
 (('25', 'percent'), 8),
 (('new', u'legislatur'), 9),
 (('earlier', 'year'), 10),
 ((u'de', 'eaux'), 13),
 (('inc', 'said'), 6),
 (('north', 'west'), 14),
 ((u'involv', u'vehicl'), 7),
 (('oil', u'pipelin'), 10),
 (('said', 'china'), 29),
 (('ford', 'motor'), 45),
 (('newsroom', '42224230003'), 15),
 ((u'industri', u'sourc'), 19),
 ((u'properti', 'fund'), 7),
 ((u'justic', u'minist'), 8),
 ((u'newspap', 'said'), 32),
 (('london', 'metal'), 7),
 ((u'onlin', u'bank'), 10),
 (('40', 'percent'), 8),
 (('tuckey', 'said'), 15),
 (('guerrilla', u'movement'), 6),
 (('rule', 'law'), 8),
 (('852', '2843'), 16),
 ((u'sourc', 'close'), 8),
 (('first', 'half'), 9),
 (('said', 'analyst'), 18),
 (('last', 'month'), 20),
 ((u'statist', u'committe'), 7),
 (('british', u'newspap'), 9),
 ((u'china', u'administr'), 10),
 ((u'electron', 'group'), 14),
 (('guotai', u'secur'), 8),
 ((u'point', 'close'), 15),
 (('said', u'rec

In [17]:
' '.join(topBigrams[0][0])

'palm beach'

In [18]:
stringBigram = [' '.join(bigram[0]) for bigram in topBigrams]

In [19]:
stringBigram

['palm beach',
 'chicken feet',
 u'transact valu',
 '25 percent',
 u'new legislatur',
 'earlier year',
 u'de eaux',
 'inc said',
 'north west',
 u'involv vehicl',
 u'oil pipelin',
 'said china',
 'ford motor',
 'newsroom 42224230003',
 u'industri sourc',
 u'properti fund',
 u'justic minist',
 u'newspap said',
 'london metal',
 u'onlin bank',
 '40 percent',
 'tuckey said',
 u'guerrilla movement',
 'rule law',
 '852 2843',
 u'sourc close',
 'first half',
 'said analyst',
 'last month',
 u'statist committe',
 u'british newspap',
 u'china administr',
 u'electron group',
 u'guotai secur',
 u'point close',
 u'said receiv',
 u'tonn year',
 u'minist qian',
 u'told reuter',
 u'foreign ministri',
 u'reuter interview',
 u'feder credit',
 'second quarter',
 'said zhang',
 u'media leisur',
 u'gener manag',
 u'defens space',
 u'oil execut',
 'said stewart',
 u'five pound',
 u'oil pipelin',
 u'system inc',
 u'local telephon',
 'liu xiaobo',
 u'unit state',
 'jiang zemin',
 u'said refer',
 u'state pla

In [20]:
for bigram in stringBigram:
    dfTrain[bigram] = dfTrain['document'].str.count(bigram)

for bigram in stringBigram:
    dfTest[bigram] = dfTest['document'].str.count(bigram)    

In [21]:
dfTrain

Unnamed: 0,document,author,test/train,palm beach,chicken feet,transact valu,25 percent,new legislatur,earlier year,de eaux,...,remain independ,trade profit,csx buy,commerci court,british insur,ti liang,lynch analyst,475000 czech,govern decis,flag carrier
0,internet may overflow new technolog crime cybe...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,us postal servic announc wednesday plan boost ...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,elementari school student access internet lear...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,influenti internet organis back away propos dr...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,influenti internet organis back away propos dr...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,group lead trademark specialist plan releas re...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,compani california sell book consum canada web...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,us law govern trillion dollar futur market cou...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,suprem court justic wednesday sharpli question...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,internet continu grow leap bound year onlin se...,AaronPressman,C50train,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
trainBigramCount = dfTrain.iloc[:,3:]
#trainBigramCount = trainBigramCount.div(trainBigramCount.sum(axis=1), axis=0)
#trainBigramCount = trainBigramCount.dropna()
trainBigramCount

Unnamed: 0,palm beach,chicken feet,transact valu,25 percent,new legislatur,earlier year,de eaux,inc said,north west,involv vehicl,...,remain independ,trade profit,csx buy,commerci court,british insur,ti liang,lynch analyst,475000 czech,govern decis,flag carrier
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
testBigramCount = dfTest.iloc[:,3:]
#testBigramCount = testBigramCount.div(testBigramCount.sum(axis=1), axis=0)
#testBigramCount = testBigramCount.dropna()
testBigramCount

Unnamed: 0,palm beach,chicken feet,transact valu,25 percent,new legislatur,earlier year,de eaux,inc said,north west,involv vehicl,...,remain independ,trade profit,csx buy,commerci court,british insur,ti liang,lynch analyst,475000 czech,govern decis,flag carrier
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
train_document_frame = dfTrain.iloc[:,1:3]
train_document_frame

Unnamed: 0,author,test/train
0,AaronPressman,C50train
1,AaronPressman,C50train
2,AaronPressman,C50train
3,AaronPressman,C50train
4,AaronPressman,C50train
5,AaronPressman,C50train
6,AaronPressman,C50train
7,AaronPressman,C50train
8,AaronPressman,C50train
9,AaronPressman,C50train


In [25]:
test_document_frame = dfTest.iloc[:,1:3]
test_document_frame

Unnamed: 0,author,test/train
0,AaronPressman,C50test
1,AaronPressman,C50test
2,AaronPressman,C50test
3,AaronPressman,C50test
4,AaronPressman,C50test
5,AaronPressman,C50test
6,AaronPressman,C50test
7,AaronPressman,C50test
8,AaronPressman,C50test
9,AaronPressman,C50test


In [26]:
os.chdir("ReutersC50")
print os.getcwd()

C:\Users\chenson\Documents\GitHub\STA380\data\ReutersC50


In [27]:
test_document_frame.to_csv("test_documents3.csv")
train_document_frame.to_csv("train_documents3.csv")

testBigramCount.to_csv("test_bigrams3.csv")
trainBigramCount.to_csv("train_bigrams3.csv")