Get List of top Bigrams from Training Set

In [1]:
import os

# All corpus paths below are relative to the shared data directory.
os.chdir('../data')

trainfolder = "ReutersC50\\C50train"
testfolder = "ReutersC50\\C50test"

# One sub-folder per author; the training split defines the author list
# that is reused for the test split.
Names = list(os.listdir(trainfolder))


def _author_dirs(root, authors):
    # Build "<root>\<author>\" for every author (trailing separator kept so
    # a glob pattern can be appended directly).
    return [root + '\\' + who + "\\" for who in authors]


trainAuthorFolder = _author_dirs(trainfolder, Names)
testAuthorFolder = _author_dirs(testfolder, Names)

In [2]:
import glob

# Concatenate every training article of an author into a single file
# "ReutersC50\combined\<author>.txt" so bigram frequencies can be computed
# per author.
combinedFolder = "ReutersC50\\combined\\"
if not os.path.isdir(combinedFolder):
    # The output folder is not part of the raw corpus; create it on first run.
    os.makedirs(combinedFolder)

for author in trainAuthorFolder:
    read_files = glob.glob(author + "*.txt")

    # author looks like "...\<name>\", so the second-to-last piece is the name.
    with open(combinedFolder + author.split('\\')[-2] + ".txt", "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                outfile.write(infile.read())
            # Separate articles explicitly: without this, an article that
            # does not end in a newline would have its last word fused with
            # the first word of the next article.
            outfile.write(b"\n")

In [3]:
# Collect the paths of all per-author combined text files.
combinedPattern = "ReutersC50\\combined\\" + '*.txt'
combinedGlob = glob.glob(combinedPattern)

In [4]:
import re
import nltk
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
# For each author's combined file, find the ten most frequent bigrams of
# stemmed, stop-word-free tokens. topBigrams ends up as a list (one entry per
# file) of lists of ((word1, word2), count) pairs.
topBigrams = []
ps = PorterStemmer()

# Loop-invariant helpers, built once instead of once per file.
# The pattern matches runs of characters that are neither whitespace nor
# word characters, plus underscores; they are deleted (not replaced by a
# space), so "U.S." becomes "us".
pattern = re.compile(r'([^\s\w]|_)+')
stop_words = set(stopwords.words('english'))

for files in combinedGlob:
    # Context manager guarantees the handle is closed after reading.
    with open(files) as f:
        raw = f.read()

    # keep only alphanumeric and spaces
    raw = pattern.sub('', raw).lower()

    # split into words "tokens"
    tokens = nltk.word_tokenize(raw)

    # remove stop words, then stem what is left
    tokens = [ps.stem(w) for w in tokens if w not in stop_words]

    # bigrams
    bgs = nltk.bigrams(tokens)
    fdist = nltk.FreqDist(bgs)

    # Most frequent first; keep the top ten for this author.
    sortBigram = sorted(fdist.items(), key=itemgetter(1), reverse=True)
    topBigrams.append(sortBigram[0:10])

In [6]:
# Pool the per-author top-ten lists into one flat list without exact
# repeats. Entries are ((word1, word2), count) pairs, so the same bigram can
# still occur more than once when its count differs between authors.
pooledEntries = set().union(*topBigrams)
topBigrams = list(pooledEntries)

In [7]:
# Number of pooled ((word1, word2), count) entries; this counts entries, not
# distinct bigrams, because entries differing only in count are kept apart.
len(topBigrams)

479

Now let's build a data frame where each row is one document, with stop words removed and the remaining words stemmed

In [None]:
# Pre-process every training document into a list of stemmed, stop-word-free
# tokens. train_frame collects (tokens, author_folder_path) pairs.
train_frame = []

# Loop-invariant helpers, built once instead of once per document.
pattern = re.compile(r'([^\s\w]|_)+')
stop_words = set(stopwords.words('english'))

for author in trainAuthorFolder:
    folderGlob = glob.glob(author + "*.txt")

    for files in folderGlob:
        # Context manager guarantees the handle is closed after reading;
        # the loop opens one file per document.
        with open(files) as f:
            raw = f.read()

        # keep only alphanumeric and spaces
        raw = pattern.sub('', raw).lower()

        # split into words "tokens"
        tokens = nltk.word_tokenize(raw)

        # remove stop words, then stem with the PorterStemmer `ps`
        tokens = [ps.stem(w) for w in tokens if w not in stop_words]

        # The folder path is kept so author and split can be derived later.
        train_frame.append((tokens, author))

In [None]:
import pandas as pd

In [None]:
# One row per training document: the space-joined tokens, the author name and
# the split folder name. Each path looks like "ReutersC50\<split>\<author>\",
# so piece [2] is the author and piece [1] is the split folder.
# The frame is built in a single call; appending rows one at a time with
# .loc re-allocates the frame on every insertion.
dfTrain = pd.DataFrame(
    [(' '.join(tokens), path.split('\\')[2], path.split('\\')[1])
     for tokens, path in train_frame],
    columns=['document', 'author', 'test/train'])

In [None]:
# Display the training frame (document text, author, split).
dfTrain

That was the training set; we need the same for the test set:

In [None]:
# Pre-process every test document exactly like the training documents, then
# build the matching data frame. test_frame collects
# (tokens, author_folder_path) pairs.
test_frame = []

# Loop-invariant helpers, built once instead of once per document.
pattern = re.compile(r'([^\s\w]|_)+')
stop_words = set(stopwords.words('english'))

for author in testAuthorFolder:
    folderGlob = glob.glob(author + "*.txt")

    for files in folderGlob:
        # Context manager guarantees the handle is closed after reading.
        with open(files) as f:
            raw = f.read()

        # keep only alphanumeric and spaces
        raw = pattern.sub('', raw).lower()

        # split into words "tokens"
        tokens = nltk.word_tokenize(raw)

        # remove stop words, then stem with the PorterStemmer `ps`
        tokens = [ps.stem(w) for w in tokens if w not in stop_words]

        test_frame.append((tokens, author))

# One row per test document. Each path looks like
# "ReutersC50\<split>\<author>\", so piece [2] is the author and piece [1]
# is the split folder. Built in one call rather than row by row with .loc,
# which re-allocates the frame on every insertion.
dfTest = pd.DataFrame(
    [(' '.join(tokens), path.split('\\')[2], path.split('\\')[1])
     for tokens, path in test_frame],
    columns=['document', 'author', 'test/train'])

In [None]:
# Display the test frame (document text, author, split).
dfTest

Combine the two for our full sample:

In [None]:
# Stack the training rows on top of the test rows.
# NOTE(review): concat keeps each frame's own row labels, and both frames are
# numbered from 0, so index labels repeat in dfFull; pass ignore_index=True
# if unique labels are needed.
dfFull = pd.concat([dfTrain, dfTest])

In [None]:
# Display the combined train + test frame.
dfFull

Now we want to count the times each common bigram appears in each document:

In [None]:
# Inspect the pooled entries; each one is a ((word1, word2), count) pair.
topBigrams

In [None]:
# Sanity check on one entry: element [0] is the (word1, word2) tuple, which
# joins into the "word1 word2" form the documents are stored in.
' '.join(topBigrams[0][0])

In [None]:
# Turn every pooled bigram into its "word1 word2" string form.
# topBigrams holds ((word1, word2), count) pairs that were de-duplicated as
# whole pairs, so the same bigram may appear several times with different
# counts. Keep only the first occurrence of each string (order preserved) so
# every bigram is counted, and becomes a column, exactly once.
stringBigram = []
_seenBigrams = set()
for bigram in topBigrams:
    joined = ' '.join(bigram[0])
    if joined not in _seenBigrams:
        _seenBigrams.add(joined)
        stringBigram.append(joined)

In [None]:
# Display the bigram strings that will become count columns.
stringBigram

In [None]:
# Add one column per bigram to both frames, holding how often that bigram
# occurs in each document.
# Series.str.count treats its argument as a regular expression and matches
# anywhere in the text, so a bare "per cent" would also be counted inside
# "paper cent". Anchoring on word boundaries restricts matches to whole
# tokens; re.escape keeps the bigram text literal.
bigramPatterns = [(bigram, r'\b' + re.escape(bigram) + r'\b')
                  for bigram in stringBigram]

for frame in (dfTrain, dfTest):
    for bigram, bigramPattern in bigramPatterns:
        frame[bigram] = frame['document'].str.count(bigramPattern)

In [None]:
# Display the training frame again, now with the bigram-count columns.
dfTrain

In [None]:
# Keep only the bigram-count columns: positions 0-2 are 'document', 'author'
# and 'test/train', everything from position 3 on was added per bigram.
trainBigramCount = dfTrain.iloc[:,3:]
# Disabled: would turn counts into per-document relative frequencies (each
# row divided by its row sum) and then drop rows containing NaN.
#trainBigramCount = trainBigramCount.div(trainBigramCount.sum(axis=1), axis=0)
#trainBigramCount = trainBigramCount.dropna()
trainBigramCount

In [None]:
# Keep only the bigram-count columns: positions 0-2 are 'document', 'author'
# and 'test/train', everything from position 3 on was added per bigram.
testBigramCount = dfTest.iloc[:,3:]
# Disabled: would turn counts into per-document relative frequencies (each
# row divided by its row sum) and then drop rows containing NaN.
#testBigramCount = testBigramCount.div(testBigramCount.sum(axis=1), axis=0)
#testBigramCount = testBigramCount.dropna()
testBigramCount

In [None]:
# Label columns only: positions 1 and 2 are 'author' and 'test/train'
# (position 0, the document text, is left out).
train_document_frame = dfTrain.iloc[:,1:3]
train_document_frame

In [None]:
# Label columns only: positions 1 and 2 are 'author' and 'test/train'
# (position 0, the document text, is left out).
test_document_frame = dfTest.iloc[:,1:3]
test_document_frame

In [None]:
# Write the output files next to the corpus: move into the ReutersC50 folder
# and show the resulting working directory as a check.
os.chdir("ReutersC50")
# Function-call form prints the same text on Python 2 and also parses on
# Python 3, where the bare print statement is a syntax error.
print(os.getcwd())

In [None]:
# Persist the label frames and the bigram-count frames as CSV files in the
# current working directory, in the same order as before.
csvExports = [
    (test_document_frame, "test_documents.csv"),
    (train_document_frame, "train_documents.csv"),
    (testBigramCount, "test_bigrams.csv"),
    (trainBigramCount, "train_bigrams.csv"),
]

for exportFrame, exportName in csvExports:
    exportFrame.to_csv(exportName)