In [1]:
import pandas as pd
df=pd.read_csv('nf_complete.csv')

# Pre-processing text

In [2]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [3]:
text = " ".join(review for review in df.abstract)
print ("There are {} words in the combination of all abstracts.".format(len(text)))


There are 274779 words in the combination of all abstracts.


In [4]:
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    sum = 0
    for word, value in wordList.items():
        sum += value
    return sum

def retrieveRandomWord(wordList):
    randIndex = randint(1, wordListSum(wordList))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    # Remove newlines and quotes
    text = text.replace('\n', ' ');
    text = text.replace('"', '');

    # Make sure punctuation marks are treated as their own "words,"
    # so that they will be included in the Markov chain
    punctuation = [',','.',';',':']
    for symbol in punctuation:
        text = text.replace(symbol, ' {} '.format(symbol));

    words = text.split(' ')
    # Filter out empty words
    words = [word for word in words if word != '']

    wordDict = {}
    for i in range(1, len(words)):
        if words[i-1] not in wordDict:
                # Create a new dictionary for this word
            wordDict[words[i-1]] = {}
        if words[i] not in wordDict[words[i-1]]:
            wordDict[words[i-1]][words[i]] = 0
        wordDict[words[i-1]][words[i]] += 1
    return wordDict

wordDict = buildWordDict(text)

#Generate a Markov chain of length 100
length = 100
chain = ['Vietnam']
for i in range(0, length):
    newWord = retrieveRandomWord(wordDict[chain[-1]])
    chain.append(newWord)

print(' '.join(chain))

Vietnam War 2003 . Furthermore I examine how , diplomacy as well as against three stages by authorities to that simulates a broad assertion that this dissertation investigates the region trying and the varying degrees of methodological tools to disadvantage a norm protecting states’ territorial threats , the fact predated his powers take in territorial conquest are believed that are evaluated , political objectives , such an awareness of foreign policy debate surrounding significant influence of citizenship and lead in Afghanistan beginning of southern towns actively . e . S . Documents from 1946-2003 . I use what conditions under which


In [5]:
def getFirstSentenceContaining(ngram, text):
    #print(ngram)
    sentences = text.upper().split(". ")
    for sentence in sentences: 
        if ngram in sentence:
            return sentence+'\n'
    return ""


print(getFirstSentenceContaining('I', text))




CIVIL-MILITARY RELATIONS ARE FREQUENTLY STUDIED AS IF THEY OPERATE ON TWO DISTINCT LEVELS OF ANALYSIS



In [6]:
text

'Civil-military relations are frequently studied as if they operate on two distinct levels of analysis. Some scholars emphasize an internal game concerned with leadership survival and regime transition.  Others focus on an external dimension of international conflict and war-fighting.  This project explores linkages across these two levels of analysis. I find that the armed forces\' role in a state\'s internal politics affects the development of its military institutions, sometimes in ways counterproductive to the state\'s international relations. Among this study\'s most provocative claims is that those states most prone to war, are often those least prepared to fight.\n        At the project\'s core is a theory about how states develop different military institutions, and how those institutions affect their propensities to fight and win wars on the inter-state battlefield. Drawing on insights from the new-institutionalism and building-on extant principal-agent approaches, I reframe t

In [7]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)


content = str(text)

ngrams = getNgrams(content, 3)
print(ngrams)

Counter({'THE UNITED STATES': 53, 'I ARGUE THAT': 48, 'AS WELL AS': 37, 'THE COLD WAR': 26, 'OF THE WAR': 26, 'MORE LIKELY TO': 18, 'THE USE OF': 15, 'WORLD WAR II': 14, 'A NUMBER OF': 14, 'THE ROLE OF': 14, 'THE NATURE OF': 13, 'NATURE OF THE': 13, 'CONDITIONS UNDER WHICH': 13, 'END OF THE': 13, 'ARGUE THAT THE': 12, 'IN THE INTERNATIONAL': 12, 'THE INTERNATIONAL SYSTEM': 12, 'THE LIKELIHOOD OF': 12, 'THE IMPACT OF': 12, 'THIS DISSERTATION EXAMINES': 12, 'THE DEVELOPMENT OF': 11, 'THE GREAT WAR': 11, 'THE END OF': 11, 'THE RELATIONSHIP BETWEEN': 11, 'A FUNCTION OF': 11, 'I FIND THAT': 10, 'WELL AS THE': 10, 'OF THE MILITARY': 10, 'IN INTERNATIONAL POLITICS': 10, 'UNDERSTANDING OF THE': 10, 'AS A RESULT': 10, 'THE CONTEXT OF': 10, 'OF THE COLD': 10, 'A THEORY OF': 10, 'UNITED STATES AND': 10, 'I TEST MY': 9, 'THE FIRST WORLD': 9, 'FIRST WORLD WAR': 9, 'THE CONDITIONS UNDER': 9, 'THE SOVIET UNION': 9, 'ANALYSIS OF THE': 9, 'THE HISTORY OF': 9, 'IN ORDER TO': 9, 'USE OF FORCE': 9, 'FOCUS

In [8]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter

def isCommon(ngram):
    commonWords = ['THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY', 'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'THAT', 'NOT', 'BY', 'SHE', 'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'AS', 'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW', 'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO', 'NEW', 'BECAUSE', 'DAY', 'MORE', 'USE', 'NO', 'MAN', 'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL']
    for word in ngram:
        if word in commonWords:
            return True
    return False

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        if not isCommon(content[i:i+n]):
            output.append(content[i:i+n])
    return output

ngrams = getNgrams(content, 3)
print(ngrams)


Counter({'WORLD WAR II': 14, 'OVERT COLLECTIVE CHALLENGES': 7, 'COLD WAR ARMY': 6, 'LEADERS PROJECT POWER': 6, "THIRD PARTY'S DECISION": 4, 'GUATEMALAN NATIONAL POLICE': 4, 'NUCLEAR NONPROLIFERATION REGIME': 4, 'INTERNATIONAL RELATIONS SCHOLARS': 3, 'SETTLE TERRITORIAL DISPUTES': 3, 'ARMS CONTROL AGREEMENTS': 3, 'THIRD PARTY STATES': 3, '2ND VIETNAM WAR': 3, 'AMERICAN FOREIGN POLICY': 3, 'RELATIONSHIP BETWEEN POWER': 3, 'SECOND WORLD WAR': 3, 'DURING WORLD WAR': 3, 'HISTORICAL CASE STUDIES': 3, 'NUCLEAR WEAPONS ACQUISITION': 3, 'UNDERMINE OVERT COLLECTIVE': 3, 'PRECEDE AND/OR SUPPORT': 3, 'AND/OR SUPPORT SUCH': 3, 'SUPPORT SUCH BEHAVIOR': 3, 'CULPABLE LEADERS ARE': 3, 'BETWEEN KURDISH LEADERS': 3, 'SHARED SOCIAL IDENTITIES': 3, 'SOCIAL IDENTITIES ARE': 3, 'CIVIL-MILITARY RELATIONS ARE': 2, 'DIFFERENT INSTITUTIONAL SETTINGS': 2, 'EGYPTIAN-ISRAELI PEACE TREATY': 2, 'FOREIGN POLICY MAKING': 2, 'NORM PROTECTING STATES': 2, 'PROTECTING STATES TERRITORIAL': 2, 'STATES TERRITORIAL SOVEREIGNTY

In [9]:
def getFirstSentenceContaining(ngram, content):
    #print(ngram)
    sentences = content.upper().split(". ")
    for sentence in sentences: 
        if ngram in sentence:
            return sentence+'\n'
    return ""


print(getFirstSentenceContaining('SINO-JAPANESE WAR 1894-1895', content))
print(getFirstSentenceContaining('2ND VIETNAM WAR', content))
print(getFirstSentenceContaining('COLD WAR ARMY', content))
print(getFirstSentenceContaining('WORLD WAR II', content))
print(getFirstSentenceContaining('ARMS CONTROL AGREEMENTS', content))



   THE HISTORIOGRAPHY ON THE 2ND VIETNAM WAR HAS FOCUSED MOSTLY ON THE AMERICAN SIDE, WHILE THE ‘OTHER SIDE,’ ESPECIALLY FOR THE EARLY VIETNAM WAR, 1964-1966, HAS NOT ATTRACTED MUCH ATTENTION

COLD WAR ARMY DURING THE PERIOD 1949 AND 1953 BY EXAMINING HOW SENIOR ARMY LEADERS WERE ABLE TO FUNDAMENTALLY BROADEN THE INSTITUTION’S INTELLECTUAL AND HISTORICAL FRAMEWORK OF “PREPAREDNESS” TO DESIGN A BLUEPRINT FOR A NEW TYPE OF GROUND FORCE THAT WOULD BE MORE ADEPT TO MEET THE CHALLENGES OF THE NEW NATURE OF WAR IMPOSED BY THE COLD WAR

 I ARGUE THAT A NORM PROTECTING STATES’ TERRITORIAL SOVEREIGNTY IS ONLY ENTRENCHED AFTER WORLD WAR II, ALTHOUGH IT CAN BE TRACED AT LEAST AS FAR BACK AS THE FOUNDING OF THE LEAGUE OF NATIONS

 IN EACH CASE I USE RIGOROUS ANALYSIS ON ORIGINAL DATA TO EXPLAIN THE WHY, WHEN, AND HOW OF THEIR DECISIONS ON THE BOMB, AS WELL AS OF THEIR DECISIONS ON RELATED ISSUES SUCH AS WHETHER TO BUILD UP NUCLEAR TECHNOLOGY, TO SEEK NUCLEAR SECURITY GUARANTEES, AND TO SIGN INTE

In [10]:
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    sum = 0
    for word, value in wordList.items():
        sum += value
    return sum

def retrieveRandomWord(wordList):
    randIndex = randint(1, wordListSum(wordList))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    # Remove newlines and quotes
    text = text.replace('\n', ' ');
    text = text.replace('"', '');

    # Make sure punctuation marks are treated as their own "words,"
    # so that they will be included in the Markov chain
    punctuation = [',','.',';',':']
    for symbol in punctuation:
        text = text.replace(symbol, ' {} '.format(symbol));

    words = text.split(' ')
    # Filter out empty words
    words = [word for word in words if word != '']

    wordDict = {}
    for i in range(1, len(words)):
        if words[i-1] not in wordDict:
                # Create a new dictionary for this word
            wordDict[words[i-1]] = {}
        if words[i] not in wordDict[words[i-1]]:
            wordDict[words[i-1]][words[i]] = 0
        wordDict[words[i-1]][words[i]] += 1
    return wordDict

wordDict = buildWordDict(text)

#Generate a Markov chain of length 100
length = 100
chain = ['I']
for i in range(0, length):
    newWord = retrieveRandomWord(wordDict[chain[-1]])
    chain.append(newWord)

print(' '.join(chain))

I also intra-state conflict against their countervailing purposes of the agrarian reform . Rosato answers this context , some groups lacking with volunteer or lead China' s opportunity costs , violence has not conceptualized as “the military exercises at the interaction of violence? To investigate volatility is enhanced by the state behavior . 3) exhaustive definition of 290 incidents of their dynamics with them to policymakers and early 1990s , if certain organizations . e . Using a longitudinal study certainly facilitates research on the rational , the British India , and culture and participated in shaping nationalist sentiments have benefitted


In [11]:
import re

def getNgrams(content, n):
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

In [12]:
from collections import Counter

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)

In [13]:
print(getNgrams(content, 2))

Counter({'UNITED STATES': 55, 'FOREIGN POLICY': 40, 'COLD WAR': 34, 'WORLD WAR': 31, 'INTERNATIONAL RELATIONS': 25, 'NUCLEAR WEAPONS': 25, 'CASE STUDIES': 22, 'DISSERTATION EXAMINES': 17, 'RELATIONSHIP BETWEEN': 17, 'DECISION MAKERS': 15, 'WAR II': 14, 'CONDITIONS UNDER': 13, 'INTERNATIONAL POLITICS': 13, 'MILITARY INTERVENTIONS': 13, 'INTERNATIONAL SYSTEM': 12, 'CASE STUDY': 12, 'CIVIL WAR': 12, 'CIVIL-MILITARY RELATIONS': 11, 'GREAT WAR': 11, 'TERRITORIAL DISPUTES': 11, 'VIETNAM WAR': 11, 'MILITARY EFFECTIVENESS': 10, 'SOVIET UNION': 10, 'INTERNATIONAL LAW': 10, 'AUDIENCE COSTS': 10, 'JET AIRCRAFT': 10, 'CIVILIAN CONTROL': 9, 'STATES ARE': 9, 'TWENTIETH CENTURY': 9, 'LEADERS ARE': 9, 'LEADERSHIP DECAPITATION': 9, 'KOREAN WAR': 8, 'UNITED NATIONS': 8, 'THIRD PARTY': 8, 'GREAT POWER': 8, 'THIRD WORLD': 8, 'OVERT COLLECTIVE': 8, 'COLLECTIVE CHALLENGES': 8, 'REBEL GROUPS': 8, 'MILITARY INSTITUTIONS': 7, 'MILITARY CAPABILITIES': 7, 'MIDDLE EAST': 7, 'STATE DEATH': 7, 'NATIONAL INTERESTS':

In [14]:
def isCommon(ngram):
    commonWords = ['THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY', 'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'THAT', 'NOT', 'BY', 'SHE', 'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'AS', 'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW', 'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO', 'NEW', 'BECAUSE', 'DAY', 'MORE', 'USE', 'NO', 'MAN', 'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL']
    for word in ngram:
        if word in commonWords:
            return True
    return False

def getNgramsFromSentence(text, n):
    output = []
    for i in range(len(text)-n+1):
        if not isCommon(text[i:i+n]):
            output.append(text[i:i+n])
    return output

ngrams = getNgrams(text, 3)
print(ngrams)


Counter({'WORLD WAR II': 14, 'OVERT COLLECTIVE CHALLENGES': 7, 'COLD WAR ARMY': 6, 'LEADERS PROJECT POWER': 6, "THIRD PARTY'S DECISION": 4, 'GUATEMALAN NATIONAL POLICE': 4, 'NUCLEAR NONPROLIFERATION REGIME': 4, 'INTERNATIONAL RELATIONS SCHOLARS': 3, 'SETTLE TERRITORIAL DISPUTES': 3, 'ARMS CONTROL AGREEMENTS': 3, 'THIRD PARTY STATES': 3, '2ND VIETNAM WAR': 3, 'AMERICAN FOREIGN POLICY': 3, 'RELATIONSHIP BETWEEN POWER': 3, 'SECOND WORLD WAR': 3, 'DURING WORLD WAR': 3, 'HISTORICAL CASE STUDIES': 3, 'NUCLEAR WEAPONS ACQUISITION': 3, 'UNDERMINE OVERT COLLECTIVE': 3, 'PRECEDE AND/OR SUPPORT': 3, 'AND/OR SUPPORT SUCH': 3, 'SUPPORT SUCH BEHAVIOR': 3, 'CULPABLE LEADERS ARE': 3, 'BETWEEN KURDISH LEADERS': 3, 'SHARED SOCIAL IDENTITIES': 3, 'SOCIAL IDENTITIES ARE': 3, 'CIVIL-MILITARY RELATIONS ARE': 2, 'DIFFERENT INSTITUTIONAL SETTINGS': 2, 'EGYPTIAN-ISRAELI PEACE TREATY': 2, 'FOREIGN POLICY MAKING': 2, 'NORM PROTECTING STATES': 2, 'PROTECTING STATES TERRITORIAL': 2, 'STATES TERRITORIAL SOVEREIGNTY