In [2]:
#Import required modules
import requests
import csv
from bs4 import BeautifulSoup

First, read in file search_terms.csv, which has rebel groups in the
first column, and the governments they were fighting in the second.
This is for Google searching.  Most of the rebel groups names alone
will not bring up their Wikipedia pages because they tend to have
similar names to each other and/or be listed by acronyms.

The value for government is usually where the conflict takes place,
but not always (such as anti-colonial conflicts).  I tested this and
even if the government isn't the same as the location of conflict,
the first page to come up was always the correct one, at least for
the ones that I tried.

There are sometimes multiple entries for the same conflict.  I need
this in the dataset for other reasons; it's not important for this
project.  I will remove the duplicates for the visualizations later.

In [2]:
# Each CSV row becomes one dict keyed by the header names; collect them all.
search = []
with open("search_terms.csv", "rU") as csvfile:
    search.extend(csv.DictReader(csvfile))

In [3]:
#Look at a chunk to make sure it's working:
search[0:5] #So far so good.

[{'govt': 'Bolivia', 'group': 'Popular Revolutionary Movement'},
 {'govt': 'Bolivia', 'group': 'MNR'},
 {'govt': 'Bolivia', 'group': 'ELN'},
 {'govt': 'France', 'group': 'Khmer Issarak'},
 {'govt': 'China', 'group': 'Peoples Liberation Army'}]

In [4]:
#Spaces in the names would break the search URL.  Google joins the words of a
#query with "+", so do the same for both fields of every pair.

for pair in search:
    for field in ("govt", "group"):
        pair[field] = pair[field].replace(" ", "+")

search[0:5]

[{'govt': 'Bolivia', 'group': 'Popular+Revolutionary+Movement'},
 {'govt': 'Bolivia', 'group': 'MNR'},
 {'govt': 'Bolivia', 'group': 'ELN'},
 {'govt': 'France', 'group': 'Khmer+Issarak'},
 {'govt': 'China', 'group': 'Peoples+Liberation+Army'}]

Now we need to loop through the dictionaries and get the URLs to Google each pair,
plus "Wikipedia" to get the Wikipedia links.

In [5]:
#Build one Google search URL per (group, govt) pair.

#The query-string parameters in front of "q=" were copied from an example found
#online; their meaning is unknown to the author.  The shorter
#"https://www.google.com/#q=" form from class does not work here.
base = "https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q="

urls = [
    base + str(pair["group"]) + "+" + str(pair["govt"]) + "+Wikipedia"
    for pair in search
]

#Test:
urls[0:5]

['https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=Popular+Revolutionary+Movement+Bolivia+Wikipedia',
 'https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=MNR+Bolivia+Wikipedia',
 'https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=ELN+Bolivia+Wikipedia',
 'https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=Khmer+Issarak+France+Wikipedia',
 'https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=Peoples+Liberation+Army+China+Wikipedia']

When I first tried looping through the URLs and saving the first response, there were 5-10 of them that were breaking the loop because of strange characters in the URL.  It turned out that in the dataset I was using, accent marks had been replaced by blank spaces, deleting the letter as well as the accent mark.  I changed these by hand in the csv file to the names that they were supposed to be.  I couldn't think of any way to code this: because the original letter was deleted, you had to know what the group was actually called in order to correct it (you couldn't just remove accent marks).

In [259]:
#Now, loop through the URLs and save the server responses, as text and parsed.

In [6]:
# One parsed results page per search URL, in the same order as `urls`.
gRes = []

for url in urls:
    page = requests.get(url)  # GET the Google results page
    gRes.append(BeautifulSoup(page.text, "html.parser"))  # parse its text and keep the soup


In [7]:
len(gRes) #Hooray!

578

But how to get just the first link out of this?  It looks like a mess.
Luckily Google uses the tag "cite" for links, so we can just look for it.

In [8]:
first_links = [] #one entry per results page, in the same order as gRes

for result in gRes:
    cites = result.select("cite") #Google puts each hit's address in a <cite> tag
    #The first <cite> is the top hit.  A page with no <cite> at all used to
    #raise IndexError and abort the whole loop; store an empty string instead
    #so this list stays the same length and order as the search terms.
    first_links.append(cites[0].text if cites else u"")

first_links[0:5] #Awesome!

[u'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement',
 u'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement',
 u'https://en.wikipedia.org/wiki/ELN',
 u'https://en.wikipedia.org/wiki/Khmer_Issarak',
 u"https://en.wikipedia.org/wiki/People's_Liberation_Army"]

In [9]:
len(first_links)

578

In [10]:
#Added later: without this UTF-8 encoding step the CSV would not save.
Links = [link.encode("utf8") for link in first_links]

len(Links)

578

I need to make a list of dictionaries, in which each dictionary has the group name, the government, and first link.  Right now, the first links and the list of dictionaries I used to search for them are the same length and in the same order.

In [12]:
bigList = []

# Links and search are position-aligned, so the index pairs each link with
# the search terms that produced it.
for i, link in enumerate(Links):
    bigList.append({
        "wiki_link": link,             # first Google hit for the pair
        "group": search[i]["group"],   # group name
        "govt": search[i]["govt"],     # government
    })


bigList[100:110] #yes it's working

[{'govt': 'United+Kingdom',
  'group': 'Mau+Mau',
  'wiki_link': 'https://en.wikipedia.org/wiki/Mau_Mau_Uprising'},
 {'govt': 'Cuba',
  'group': 'Military+Faction+-+26th+of+July+Movement',
  'wiki_link': 'https://en.wikipedia.org/wiki/26th_of_July_Movement'},
 {'govt': 'Cuba',
  'group': 'Military+Faction+-+26th+of+July+Movement',
  'wiki_link': 'https://en.wikipedia.org/wiki/26th_of_July_Movement'},
 {'govt': 'Cuba',
  'group': 'National+Revolutionary+Council',
  'wiki_link': 'https://en.wikipedia.org/wiki/Cuban_Revolutionary_Council'},
 {'govt': 'Indonesia',
  'group': 'Darul+Islam+Movement',
  'wiki_link': 'https://en.wikipedia.org/wiki/Darul_Islam_(Indonesia)'},
 {'govt': 'Indonesia',
  'group': 'Darul+Islam+Movement',
  'wiki_link': 'https://en.wikipedia.org/wiki/Darul_Islam_(Indonesia)'},
 {'govt': 'Indonesia',
  'group': 'PRRI',
  'wiki_link': 'https://id.wikipedia.org/wiki/Pemerintahan_Revolusioner_Republik_ Indonesia'},
 {'govt': 'Indonesia',
  'group': 'Permesta+movement',
  

In [None]:
#Let's save this so that I don't have to run this code anymore.

In [8]:
# The first record's keys are used as the CSV column names.
keys = bigList[0].keys()
with open('BigList.csv', 'wb') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=keys)
    writer.writeheader()
    for record in bigList:
        writer.writerow(record)

In [None]:
#Now we'll open the CSV as bigList again so that if anything goes wrong, I can restart from this point.

In [9]:
# Rebuild bigList from the checkpoint file: one dict per CSV row.
bigList = []

with open("BigList.csv", "rU") as csvfile:
    bigList.extend(csv.DictReader(csvfile))

In [10]:
bigList[1:5] #good

[{'govt': 'Bolivia',
  'group': 'MNR',
  'ideologies': "[u'Nationalism', u'Populism']",
  'wiki_link': 'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement'},
 {'govt': 'Bolivia',
  'group': 'ELN',
  'ideologies': '',
  'wiki_link': 'https://en.wikipedia.org/wiki/ELN'},
 {'govt': 'France',
  'group': 'Khmer+Issarak',
  'ideologies': '',
  'wiki_link': 'https://en.wikipedia.org/wiki/Khmer_Issarak'},
 {'govt': 'China',
  'group': 'Peoples+Liberation+Army',
  'ideologies': '',
  'wiki_link': "https://en.wikipedia.org/wiki/People's_Liberation_Army"}]

Now we need a function that will take each of the URLs in the list as inputs and then take the contents of the box in the upper right hand corner of the Wiki page.

In [11]:
def wiki_ideo(URL):
    """Scrape the "Ideology" entries from a Wikipedia page's infobox.

    URL -- address of the page to fetch.

    Returns a list with the text of every link in the infobox row that
    mentions "Ideology", or None when the URL is not an English Wikipedia
    link, the page has no "infobox vcard" table, or that table has no
    ideology row with a "category" cell.
    """
    if "https://en.wikipedia.org/wiki" not in URL: #Make sure it's a Wikipedia link.
        return None

    #Name the parser explicitly (the same one used for the Google results) so
    #the parse does not depend on which optional parsers are installed.
    soup = BeautifulSoup(requests.get(URL).text, "html.parser")

    if "infobox vcard" not in str(soup): #No box in the right corner.
        return None
    tables = soup.select("table.infobox.vcard") #The box in the right corner
    if not tables: #The text matched something that is not the infobox table.
        return None

    #Look for the row that mentions "Ideology" and carries a "category" cell;
    #the links inside that cell are the ideologies.
    for row in tables[0].select("tr"):
        if "Ideology" in row.text and "category" in str(row):
            keywords = row.select("td.category")
            if not keywords: #"category" appeared somewhere other than a td class
                continue
            return [link.text for link in keywords[0].select("a")]

    return None #Infobox present, but no ideology row found.

#The if statements are because it just kept breaking when it ran across pages that were
#formatted differently and/or were the wrong pages.

In [12]:
wiki_ideo(bigList[0]["wiki_link"])
#Okay, it works for one of them...

[u'Nationalism', u'Populism']

In [13]:
#Now loop through all the dictionaries in the list.

#The same Wikipedia link shows up for several entries, so remember the result
#for each link and only fetch every distinct page once.
ideologies_by_page = {}

for dictionary in bigList: #loop through dictionaries for each group
    page = dictionary["wiki_link"] #use Wiki page for group
    if page not in ideologies_by_page:
        #wiki_ideo returns a list of ideologies, or None if it found none
        ideologies_by_page[page] = wiki_ideo(page)
    #Entries that share a link also share the same result object.
    dictionary["ideologies"] = ideologies_by_page[page]

In [14]:
print len(bigList)
bigList[0]

578


{'govt': 'Bolivia',
 'group': 'Popular+Revolutionary+Movement',
 'ideologies': [u'Nationalism', u'Populism'],
 'wiki_link': 'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement'}

Right.  So the good news is - it works!  And the bad news is... as we'll soon see, most of the Wikipedia pages don't have the box in the corner with ideology in it, so it didn't work for a lot of them.


In [15]:
#I wrote another CSV and opened it again as a checkpoint here because
#I kept accidentally overwriting things and having to start over.

# Header row = keys of the first record; then one CSV row per record.
keys = bigList[0].keys()
with open('ideo-data.csv', 'wb') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=keys)
    writer.writeheader()
    for record in bigList:
        writer.writerow(record)

In [16]:
# Reload the checkpoint: every CSV row comes back as a dict of strings.
bigList = []

with open("ideo-data.csv", "rU") as csvfile:
    bigList.extend(csv.DictReader(csvfile))

bigList[0] #good

{'govt': 'Bolivia',
 'group': 'Popular+Revolutionary+Movement',
 'ideologies': "[u'Nationalism', u'Populism']",
 'wiki_link': 'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement'}

So, now we have a bunch of lists of ideologies, but many of them are too specific to be useful for analysis.  For example, we want "Kurdish Nationalism" to be just "Nationalism," and we want to group "Marxism," "Leninism," and "Communism" together.

<u> Coding rules </u>

Major ideologies:

-Any group/country/region nationalism, independence, separatism, or self-determination will get a 1 for nationalism

-Anything with Marxism, Leninism, Maoism, or Communism will get a 1 for communist.

-Anything with Islamism, sect of Islam, or clericalism gets a 1 for Islamist.

-Anything with democracy gets a 1 for democracy.

Ignoring the rest for now, as there are not many observations.

In [17]:
#Coding table: each column gets a 1 when any of its markers occurs in the
#group's ideologies, otherwise 0.  Markers drop the first letter so that both
#capitalisations match, and drop the ending to allow for variants
#("ationalism", "eparatis", ...).  Proper nouns are kept whole.
#"elf-determin" replaces the earlier "self-determin", which was the only
#marker that missed the capitalised spelling.
ideology_markers = [
    ("Nationalist", ("ationalism", "ndependence", "eparatis", "elf-determin")),
    ("Communist", ("ommunis", "arxis", "eninis", "Mao")),
    ("Islamist", ("slamis", "Sunni", "Shia", "lericalis")),
    ("Democracy", ("emocra",)),
]

for dictionary in bigList:
    #After the CSV reload above this value is text (see the output of the
    #previous cell), so "in" below is a substring test.
    ideologies = dictionary["ideologies"]

    for column, markers in ideology_markers:
        dictionary[column] = int(any(marker in ideologies for marker in markers))

bigList[0]

{'Communist': 0,
 'Democracy': 0,
 'Islamist': 0,
 'Nationalist': 1,
 'govt': 'Bolivia',
 'group': 'Popular+Revolutionary+Movement',
 'ideologies': "[u'Nationalism', u'Populism']",
 'wiki_link': 'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement'}

In [20]:
#Update the CSV.

# Re-derive the column names, since the coding columns were added above.
keys = bigList[0].keys()
with open('ideo-data.csv', 'wb') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=keys)
    writer.writeheader()
    for record in bigList:
        writer.writerow(record)

Because I didn't wind up with enough cases to do the analysis, I am instead going to webscrape the content of the Wikipedia pages for the cases that did work, and do some text analysis on that.

In [23]:
#Restarted the kernel again and had to open the data again.

# Read the coded data back in, one dict per CSV row.
bigList = []

with open("ideo-data.csv", "rU") as csvfile:
    bigList.extend(csv.DictReader(csvfile))

bigList[1:3] #good

[{'Communist': '0',
  'Democracy': '0',
  'Islamist': '0',
  'Nationalist': '1',
  'govt': 'Bolivia',
  'group': 'MNR',
  'ideologies': "[u'Nationalism', u'Populism']",
  'wiki_link': 'https://en.wikipedia.org/wiki/Revolutionary_Nationalist_Movement'},
 {'Communist': '0',
  'Democracy': '0',
  'Islamist': '0',
  'Nationalist': '0',
  'govt': 'Bolivia',
  'group': 'ELN',
  'ideologies': '',
  'wiki_link': 'https://en.wikipedia.org/wiki/ELN'}]

In [25]:
#Now, I only want to use the ones with the ideology field, so let's put those in their own list.

# Keep only the groups whose ideology field is non-empty.
shortList = [group for group in bigList if group["ideologies"] != ""]


#shortList[0:15] #good
len(shortList)

#Only 126 of 578 :(

126

In [27]:
#Now I'm going to webscrape again, but this time for the content of the Wikipedia page.

def wiki_content(URL):
    """Return the article text of an English Wikipedia page.

    URL -- address of the page to fetch.

    Returns the text of the page's first div.mw-content-ltr element, or None
    when the URL is not an English Wikipedia link or the page has no such
    element.
    """
    if "https://en.wikipedia.org/wiki" not in URL: #Make sure it's a Wikipedia link.
        return None

    #Name the parser explicitly (the same one used for the Google results) so
    #the parse does not depend on which optional parsers are installed.
    soup = BeautifulSoup(requests.get(URL).text, "html.parser")

    if "mw-content" not in str(soup): #makes sure the page is set up in the same way
        return None
    content_divs = soup.select("div.mw-content-ltr")
    if not content_divs: #"mw-content" appeared, but not as the expected div
        return None
    return content_divs[0].text #gets the content


In [29]:
#wiki_content(shortList[0]["wiki_link"])
#Okay, so it works for one of them.  Now to loop through them all and save the content.


In [30]:
#I don't know why this happened, but I have to get rid of the links with these
#weird series of characters because otherwise they break the function.
#
#The list is filtered in a single pass rather than by calling
#shortList.remove() inside a loop over shortList: removing while iterating
#skips the element right after each removed one, so matching entries could be
#left behind (which is why one of the loops used to be run twice).

bad_fragments = ("\xe4\xf3\xf1", "Mart\xed", "C\xed\xc7te")

#Slice assignment keeps the same list object and just replaces its contents.
shortList[:] = [
    dictionary for dictionary in shortList
    if not any(fragment in dictionary["wiki_link"] for fragment in bad_fragments)
]
        

In [38]:
#len(shortList)

In [33]:
#Fetch the article text for every remaining group and store it as UTF-8.
for dictionary in shortList:
    page = dictionary["wiki_link"]
    contents = wiki_content(page)
    #wiki_content gives back None when it does not find the article body;
    #calling .encode on that would raise AttributeError and stop the loop.
    #Store an empty string instead so the later clean-up and export still run.
    dictionary["content"] = contents.encode("utf8") if contents is not None else ""


In [35]:
#shortList[0:2] #works

In [36]:
#Remove \n and \r, which are messing with CSV formatting.


# Strip both kinds of line break from every page's text in a single pass.
for dictionary in shortList:
    flattened = dictionary["content"].replace("\n", "").replace("\r", "")
    dictionary["content"] = flattened

In [37]:
#Export to CSV.

# Column names come from the first record of the short list.
keys = shortList[0].keys()
with open('content-ideo.csv', 'wb') as output_file:
    writer = csv.DictWriter(output_file, fieldnames=keys)
    writer.writeheader()
    for record in shortList:
        writer.writerow(record)

In [None]:
#The end!