In [2]:
import pandas as pd
import bz2
import json
from os import listdir

In [3]:
from empath import Empath
lexicon = Empath()

In [4]:
# Generate a lexicon of keywords that relate to gun violence
lexicon.create_category("gun_violence", ["gun_violence","mass_shooting", "firearm", "firearm_shooting"],
                        model="reddit")
# We found that the empath libary is not effective for detecting quotes that pertain to gun violence using this
# library, we therefore chose to save the keywords generated by this function and manually query each line of our 
# dataset. 

["mass_shooting", "gun_violence", "mass_shootings", "gun_crime", "gun_ownership", "gun_crimes", "firearms", "firearm", "CCW_holders", "legal_gun_owners", "concealed_carriers", "violent_crime", "gun_owner", "accidental_shootings", "fire_arms", "gun_owners", "gun_death", "shootings", "gun_use", "police_shootings", "gun_control_laws", "home_invasions", "gun_deaths", "fire_arm", "US_police", "strict_gun_control", "active_shooters", "concealed_carry", "firearm_owners", "homicide", "home_invasion", "firearm_ownership", "mass_shooter", "violent_crimes", "accidental_shooting", "gun_ban", "handguns", "most_gun_owners", "school_shooting", "armed_criminals", "legal_guns", "responsible_gun_owner", "school_shootings", "assault_weapons", "illegal_guns", "mass_shooters", "law_enforcement_officers", "conceal_carry", "gun_possession", "legal_gun_ownership", "homicides", "related_crime", "strict_gun_laws", "CCW", "fully_automatic_weapons", "related_homicides", "gun_culture", "responsible_gun_ownership",

In [5]:
# Store relevant keywords in list
keywords = ["mass shooting", "gun violence", "gun crime", "gun owner", "firearm", "gun possession",
            "accidental shooting", "gun death", "gun law", "gun control", "firearm ownership", 
            "mass shooter", "school shooting", "gun death", "firearm owners", "gun ban", "handguns", 
            "most gun owners", "assault weapon", "legal gun", "fully automatic weapon", "gun culture",
            "gun ownership", "gun accidents", "armed citizen", "concealed carry", "concealed carrier",
            "legal firearm", "legal gun", "gun homicide", "gun advocate", "negligent discharge", 
            "accidental discharge", "firearm violence", "firearm-related violence"]


In [6]:
def keyword(row):
    '''
    Filter the quotation of a particular row of a dataframe with a set of keywords.
    :param row: dataframe object
    :return: list of comma separated matching keywords
    '''
    keywords = ["mass shooting", "gun violence", "gun crime", "gun owner", "firearm", "gun possession",
            "accidental shooting", "gun death", "gun law", "gun control", "firearm ownership", 
            "mass shooter", "school shooting", "gun death", "firearm owners", "gun ban", "handguns", 
            "most gun owners", "assault weapon", "legal gun", "fully automatic weapon", "gun culture",
            "gun ownership", "gun accidents", "armed citizen", "concealed carry", "concealed carrier",
            "legal firearm", "legal gun", "gun homicide", "gun advocate", "negligent discharge", 
            "accidental discharge", "firearm violence", "firearm-related violence"]    
    
    strings = row['quotation']
    key_word = [key for key in keywords if key.upper() in strings.upper()]
    return ', '.join(key_word)

In [7]:
# Processes one chunk of data: filters rows to keep only those containing at least one keyword, and saves the chunk
# to hard drive. 
def process_chunk(year, chunk, idx):
    print(f'Processing chunk number: {idx} from year: {year}')
    
    # Select only columns of interest from data
    selected_columns = chunk[["speaker", "qids", "date", "quotation"]]
    filtered_chunk = selected_columns.copy()
    
    # Apply filter to keep only columns containing at least a keyword from the list defined above
    filtered_chunk['keyword'] = filtered_chunk.apply(keyword, axis=1)
    filtered_chunk = filtered_chunk[filtered_chunk.keyword != '']
    
    # Save the chunk to hard drive to free up active memory
    filtered_chunk.to_pickle("/Users/Justin/Desktop/ADA_project/processed_dfs/" + str(year) + "/" + str(idx) + 
                             "chunk_with_keyword.pkl", compression='infer', protocol=4)

    

In [8]:
# Iterate over all years of data instantly
years = [2015, 2016, 2017, 2018, 2019]

for year in years:
    # Open the entire dataset for a year and split into chunks of size chunksize: 
    df_reader = pd.read_json('/Users/Justin/Desktop/ADA_project/Quote_Bank/quotes-' + str(year) + '.json.bz2',
                         lines=True, compression='bz2', chunksize=1500000)
    
    # idx refers to chunk number: 
    idx = 0
    
    # Iterate over all chunks of a year of data:
    for chunk in df_reader:
        idx += 1
        # Apply function to filter out data, and keep only quotes with keywords
        # Function also saves each chunk to hard drive once done to speed up execution
        process_chunk(year, chunk, idx)
    

Processing chunk number: 1 from year: 2015
Processing chunk number: 2 from year: 2015
Processing chunk number: 3 from year: 2015
Processing chunk number: 4 from year: 2015
Processing chunk number: 5 from year: 2015
Processing chunk number: 6 from year: 2015
Processing chunk number: 7 from year: 2015
Processing chunk number: 8 from year: 2015
Processing chunk number: 9 from year: 2015
Processing chunk number: 10 from year: 2015
Processing chunk number: 11 from year: 2015
Processing chunk number: 12 from year: 2015
Processing chunk number: 13 from year: 2015
Processing chunk number: 14 from year: 2015
Processing chunk number: 1 from year: 2016
Processing chunk number: 2 from year: 2016
Processing chunk number: 3 from year: 2016
Processing chunk number: 4 from year: 2016
Processing chunk number: 5 from year: 2016
Processing chunk number: 6 from year: 2016
Processing chunk number: 7 from year: 2016
Processing chunk number: 8 from year: 2016
Processing chunk number: 9 from year: 2016
Proces

In [38]:
# Function which sequentially opens all pre-processed chunks and saves them to a single list
def open_chunks(year, list_dfs):
    common_path = '/Users/Justin/Desktop/ADA_project/processed_dfs/'
    file_name = 'chunk_with_keyword.pkl'
    
    for file in listdir(common_path + str(year) + '/'):
        
        # The 2015 folder has an invisible file named ".DS_Store" which we don't wont want
        # to iterate over (otherwise the function fails)
        if file == '.DS_Store':
            continue
            
        small_df = pd.read_pickle(common_path + str(year) + '/' + file, compression='infer')
        list_dfs.append(small_df)
    
    return list_dfs


In [39]:
# Iterate over all years of data instantly
years = [2015, 2016, 2017, 2018, 2019]

# The final dataframe to store the filtered Quote Bank data:
final_df = pd.DataFrame()

# execute the open_chunks function for all years of dataset and save the result to final_df: 
for year in years: 
    list_dfs = []
    temp_df = pd.concat(open_chunks(year, list_dfs), ignore_index=True)
    final_df = pd.concat([final_df, temp_df])
    

In [40]:
final_df.head(30)

Unnamed: 0,speaker,qids,date,quotation,keyword
0,Dan Bongino,[Q16200445],2015-04-14 15:37:28,I think you were almost setting yourself up fo...,gun crime
1,,[],2015-06-17 16:00:00,It is truly unfortunate that legislators spent...,firearm
2,Mark Howell,"[Q16229080, Q6768121]",2015-05-17 22:11:17,99.99% of the time where someone brings a fire...,firearm
3,,[],2015-02-03 21:21:45,At this point we are investigating the possibi...,firearm
4,Peter Doocy,[Q7173707],2015-12-04 02:14:07,So the president thinks that when there are tw...,gun law
5,Greg Abbott,"[Q5605224, Q5605225]",2015-12-31 19:21:35,"symbolic in retaining some liberty, similar to...","gun owner, gun ownership"
6,,[],2015-09-11 07:42:30,"conceal carr handguns, big handguns. it doesn'...",handguns
7,Michael Moore,"[Q10430120, Q174908, Q1752903, Q1928646, Q2005...",2015-12-30 22:03:39,"During the Vietnam War, you saw what was happe...",mass shooting
8,Adam Winkler,"[Q21288375, Q4679989]",2015-06-22 12:53:07,gun rights and gun control are not only compat...,gun control
9,Budi Waseso,[Q19753012],2015-02-17 12:22:18,The possession of illegal guns is very dangero...,"legal gun, legal gun"
