In [2]:
%matplotlib inline
import os
import re
import glob
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd

import file_handling as fh



In [3]:
# Input files (paths are relative to the notebook's working directory) and output folder.
spreadsheet_file = 'Stanford_MSA/Stanford_MSA_Database.csv'
articles_file = 'data/shared/articles.csv'
output_dir = 'data/output'

# Articles table: the first CSV column becomes the row index; the file is read as Latin-1.
articles_df = pd.read_csv(
    articles_file,
    encoding='Latin-1',
    index_col=0,
    header=0,
)

# MSA spreadsheet: first line of the file is the header row.
msa_df = pd.read_csv(spreadsheet_file, header=0)

In [4]:
# Preview the first three rows of the MSA spreadsheet.
msa_df.head(3)

Unnamed: 0,CaseID,Title,Location,City,State,Latitude,Longitude,Number of Civilian Fatalities,Number of Civilian Injured,Number of Enforcement Fatalities,...,Data Source 3,Data Source 4,Data Source 5,Data Source 6,Data Source 7,Military Experience,Class,Depreciation,Notes,Edit Date
0,1,University of Texas at Austin,"Austin, Texas",Austin,Texas,30.198887,-97.844159,15,32,1,...,http://news.google.com/newspapers?id=lkk0AAAAI...,http://news.google.com/newspapers?id=PPUjAAAAI...,http://books.google.com/books?id=ClYEAAAAMBAJ&...,,,Yes,SPK,1,,6/8/2016
1,2,Rose-Mar College of Beauty,"Mesa, Arizona",Mesa,Arizona,33.422687,-111.81632,5,1,0,...,http://www.nydailynews.com/news/crime/beauty-s...,http://books.google.com/books?id=Cre7qsswRiwC&...,,,,Unknown,MS,1,,6/8/2016
2,3,New Orleans Police Shootings,"New Orleans, Louisiana",New Orleans,Louisiana,30.068724,-89.931474,4,8,5,...,http://www.trutv.com/library/crime/notorious_m...,http://books.google.com/books?id=TfEDmROcZwEC&...,,,,Unknown,SPK,1,,6/22/2016


In [5]:
# Preview the first three rows of the articles table.
articles_df.head(3)

Unnamed: 0,id,caseid,title,city,n_fatalities,n_victims,n_shooters,name,description,article,article_name,shooter_names,prop_names_found,age_found,city_found,matching,age,order
0,31,32,GMAC Loan Office,Jacksonville,12,17,1,James Edward Pough,After fatally shooting a man and woman and wou...,Stunned city seeks answers. The voice on the t...,32_James_Edward_Pough-12_short.txt,James Edward Pough,1.0,0,1,1,42,5416
1,31,32,GMAC Loan Office,Jacksonville,12,17,1,James Edward Pough,After fatally shooting a man and woman and wou...,TAKING AIM AT ITS RIVALS. RoboCop 2 +++ Direct...,32_James_Edward_Pough-13_short.txt,James Edward Pough,0.0,0,0,0,42,828
2,31,32,GMAC Loan Office,Jacksonville,12,17,1,James Edward Pough,After fatally shooting a man and woman and wou...,Man gets 75 years for slaying. Brent W. Freiho...,32_James_Edward_Pough-10_short.txt,James Edward Pough,0.0,0,0,0,42,2269


In [6]:
# Inspect the distinct raw codings used in selected columns.
# 'Place Type': show the distinct values followed by how many there are.
place_type_values = set(msa_df['Place Type'].values)
print(place_type_values, len(place_type_values))

# 'Shooter Race': show the distinct values only.
shooter_race_values = set(msa_df['Shooter Race'].values)
print(shooter_race_values)
# (repeat for other columns as needed)


{'Restaurant/Cafe', 'Unknown', 'Public transportation', 'Secondary school', 'Medical/Care', 'Retail/ Wholesale/Services facility', 'Retail/Wholesale/Services facility', 'Military facility', 'Restaurant/Cafe?', 'Residential Home/Neighborhood', 'Retail/Wholesale/Services facility\n/Residential home/Neighborhood', 'Park/Wilderness', 'Street/Highway', 'Place of worship', 'Residential home/Neighborhood \nand Street/Highway', 'Residential home/Neighborhood', 'Government facility', 'Residential home/Neighborhood,\nRetail/ Wholesale/Services facility', 'Entertainment venue', 'Secondary School', 'Primary school', 'Park/Wildness', 'Restaurant/Cafeé', 'College/University/Adult education', 'Entertainment Venue', 'Company/Factory/Office', 'Restaurant/cafe', 'Residential home', 'Public Transportation', 'Retail/ Wholesale/Services facility\nand Primary school'} 30
{'Asian American/Some other race', 'Two or more races', 'White American or European American/Some other Race', 'Black American or African 

In [10]:
import re
import string

# Pattern matching any single ASCII punctuation character. Punctuation is replaced by a
# space before searching, so that e.g. "anti-Muslim" or "terrorist." match as whole words.
replace = re.compile('[%s]' % re.escape(string.punctuation))

# Make an independent copy of the original dataframe. DataFrame.copy() preserves each
# column's dtype; rebuilding the frame from msa_df.values would turn every column of a
# mixed-type frame into dtype object.
msa_df_copy = msa_df.copy()

# Searches: each name is also the name of the output column. Alternative terms are
# joined with '_or_'; an article counts once for a search if ANY of its terms occurs.
search_terms = ['terrorism_or_terrorist', 'muslim_or_islam', 'mental']

# Split every search into its space-padded terms once, rather than once per article.
# Padding with spaces makes the substring test a whole-word match.
padded_terms = {
    search: [' ' + term + ' ' for term in search.split('_or_')]
    for search in search_terms
}

# initialize the count columns to zero
msa_df_copy['n_articles'] = 0
for search in search_terms:
    msa_df_copy[search] = 0

# process each article in turn
for i, row in articles_df.iterrows():
    # 'id' is used as the row label of the matching incident in the MSA dataframe
    # (note: this is the dataframe index, not the CaseID column)
    msa_index = row['id']

    # only include articles that mention the person's name, XX-year-old of the right age,
    # or the city
    if not (row['prop_names_found'] > 0 or row['age_found'] or row['city_found']):
        continue

    # count the total number of articles
    msa_df_copy.loc[msa_index, 'n_articles'] += 1

    # get the text of the article; a missing article (NaN) is treated as empty text
    # instead of raising AttributeError on .lower()
    article = row['article']
    text = article.lower() if isinstance(article, str) else ''
    text = replace.sub(' ', text)
    text = ' ' + text + ' '

    # add one for every search that has at least one of its terms in the text
    for search in search_terms:
        if any(term in text for term in padded_terms[search]):
            msa_df_copy.loc[msa_index, search] += 1

# total number of included articles (printed before the Cape Coral correction below)
print(msa_df_copy['n_articles'].sum())

# zero the counts for the Cape Coral incident, because they are all about the Orlando event
cape_coral_index = 331
msa_df_copy.loc[cape_coral_index, ['n_articles'] + search_terms] = 0

# save the data (exist_ok avoids a separate existence check)
os.makedirs(output_dir, exist_ok=True)
msa_df_copy.to_csv(os.path.join(output_dir, 'word_counts.csv'))

# display a few rows
msa_df_copy.tail(n=4)


4204


Unnamed: 0,CaseID,Title,Location,City,State,Latitude,Longitude,Number of Civilian Fatalities,Number of Civilian Injured,Number of Enforcement Fatalities,...,Data Source 7,Military Experience,Class,Depreciation,Notes,Edit Date,n_articles,terrorism_or_terrorist,muslim_or_islam,mental
331,341,Cape Coral Shooting Spree,"Cape Coral, Florida",Cape Coral,Florida,26.5629,-81.9495,2,3,0,...,,Unknown,MS,1,,6/29/2016,0,0,0,0
332,342,"Webster, MN party shooting","Webster, Minnesota",Webster,Minnesota,44.5297,-93.3527,0,4,0,...,http://www.gunviolencearchive.org/incident/578238,Unknown,MS,1,,7/6/2016,0,0,0,0
333,343,Orlando Nightclub Massacre,"Orlando, Florida",Orlando,Florida,28.5383,-81.3792,49,52,0,...,http://www.gunviolencearchive.org/incident/577157,No,MS,1,,7/6/2016,394,203,97,30
334,344,Nightclub Disput,"Lyman, South Carolina",Lyman,South Carolina,34.9482,-82.1273,0,3,0,...,,Unknown,MS,1,,7/28/2016,0,0,0,0


In [51]:
# Print the included articles for one particular case (this one is a false positive).

for i, row in articles_df.iterrows():
    # 'id' is the row label of the corresponding incident in the MSA dataframe
    msa_index = row['id']

    # skip every article that does not belong to incident 292
    if msa_index != 292:
        continue

    # same inclusion rule as for the counts: the article mentions the person's name,
    # XX-year-old of the right age, or the city
    if row['prop_names_found'] > 0 or row['age_found'] or row['city_found']:
        print(i)
        print(row['title'])
        print(row['article'])

5060
Louisville, KY Family Murder-Suicide
Muslims shoulder outbreak of anger, fear. Cities across the USA are preparing for the phase that inevitably follows a terror attack: anti-Muslim backlash. Across social media, in public forums on college campuses and in political rhetoric by presidential candidates, anger over the deadly terror attacks in Brussels spawned discontent and suspicion directed at Muslim groups. After the Islamic State claimed responsibility for the attacks, leaders in Ohio, Kentucky, New York, New Jersey and California spoke out quickly to dissuade anti-Muslim sentiment. The aftermath of an attack "is always a difficult time for Muslims in the United States," says Nabil Shaikh, a leader of the Muslim Students Association at Princeton University. "On Princeton's campus, students took to anonymous forums  ... to comment that there are Muslims at Princeton who are radical and would therefore condone yesterday's attacks," Shaikh said. "These comments ... threaten the we