## Police Data Cleaning

In [1]:
# imports
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Mapping Police Violence-Grid view.csv')
mpv = df[['cause_of_death','race','date','officer_charged','allegedly_armed']]
#renaming "allegedly_armed" to "armed"
mpv = mpv.rename(columns={'allegedly_armed': 'armed'})
mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
0,,Black,3/14/2022,,Allegedly Armed
1,,White,3/12/2022,,Allegedly Armed
2,Gunshot,White,3/12/2022,No Known Charges,Allegedly Armed
3,Gunshot,,3/12/2022,No Known Charges,Allegedly Armed
4,Gunshot,White,3/11/2022,No Known Charges,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,1/1/2013,No Known Charges,Unclear
10071,Gunshot,Black,1/1/2013,No Known Charges,Allegedly Armed
10072,Gunshot,White,1/1/2013,No Known Charges,Allegedly Armed
10073,Gunshot,Hispanic,1/1/2013,No Known Charges,Allegedly Armed


In [3]:
print(mpv['race'].unique())
print(mpv['cause_of_death'].unique())
print(mpv['armed'].unique())

['Black' 'White' nan 'Hispanic' 'Non-white' 'Unknown race'
 'Pacific Islander' 'Native American' 'Asian']
[nan 'Gunshot' 'Vehicle' 'Gunshot,Taser' 'Other'
 'Physical Restraint,Beaten' 'Taser' 'Beaten' 'Physical Restraint'
 'Taser,Physical Restraint' 'Gunshot,Vehicle' 'Chemical Agent' 'Bean bag'
 'Pepper Spray' 'Asphyxiated' 'Bomb']
['Allegedly Armed' 'Unarmed/Did Not Have Actual Weapon' nan 'Unclear'
 'Vehicle']


In [4]:
print(mpv['officer_charged'].unique())

[nan 'No Known Charges' 'Charged with vehicular homicide'
 'Charged with a crime' '"Charged, Charges Dropped"'
 '"Charged, Convicted of Manslaughter"'
 '"Charged, Convicted, Sentenced to 22.5 years in prison"'
 '"Charged, Convicted"'
 '"Charged, Convicted, Sentenced to life in prison"'
 '"Charged, Convicted, Sentenced to 10 years in prison"'
 '"Charged, Convicted, Sentenced to 7 years in prison"'
 '"Charged, Acquitted"'
 '"Charged, Plead Guilty to Voluntary Manslaughter"'
 '"Charged, Convicted, Sentenced to 1 year in prison and 4 years probation"'
 '"Charged, Convicted, Sentenced to 25 years in prison"'
 '"Charged, Convicted, Sentenced to 2 years suspension and 3 years probation"'
 '"Charged, Convicted, Sentenced to 5-15 years in prison"'
 '"Charged, Convicted, Sentenced to 4.75 years in prison"'
 '"Charged, Mistrial"'
 '"Charged, Convicted, Sentenced to 15 years in prison"'
 '"Charged, Plead No Contest, Sentenced to 10 years probation"'
 '"Charged, Convicted, Sentenced to 6 years in p

In [5]:
#cleaning "officer_charged"
mpv.loc[mpv['officer_charged'].str.contains('Charged') == True, "officer_charged"] = 1
mpv.loc[mpv['officer_charged'] == 'No Known Charges', 'officer_charged'] = 0
mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
0,,Black,3/14/2022,,Allegedly Armed
1,,White,3/12/2022,,Allegedly Armed
2,Gunshot,White,3/12/2022,0,Allegedly Armed
3,Gunshot,,3/12/2022,0,Allegedly Armed
4,Gunshot,White,3/11/2022,0,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,1/1/2013,0,Unclear
10071,Gunshot,Black,1/1/2013,0,Allegedly Armed
10072,Gunshot,White,1/1/2013,0,Allegedly Armed
10073,Gunshot,Hispanic,1/1/2013,0,Allegedly Armed


In [6]:
print(mpv['officer_charged'].unique())

[nan 0 1]


In [7]:
mpv.dtypes

cause_of_death     object
race               object
date               object
officer_charged    object
armed              object
dtype: object

In [8]:
#why the heck is it object and not int

## Variables

### cause_of_death

Data type: string

Lists cause of death of police victims. Includes: NaN, gunshot, vehicle, taser, beaten, physical restraint, chemical agent, bean bag, pepper spray, asphyxiated, bomb, or multiple causes.

### race

Data type: string

Race of police victim, includes NaN, Black, White, Hispanic, Non-white, Unknown race, Pacific Islander, Native American, or Asian

### date

Data type: datetype

Dates from 1/1/2013 to 10/1/2020 (MM/DD/YY)

### officer_charged

Data type: bool

Whether or not the police officer responsible for violence was crimnally or civilly charged. 
If not charged, False or 0. If charged, True or 1. Otherwise, NaN.

### armed

Originally, allegedly_armed
Data type: int

Whether or not victim was armed.
If unarmed (0), armed (1), unclear (3) otherwise (NaN).

## Twitter Data

In [9]:
import os 
import shutil

We need to combine the multiple csv folders of tweet ids into one big one to then randomlly select tweets to hydrate.

In [10]:
def flatten(directory):
    for dirpath, _, filenames in os.walk(directory, topdown=False):
        for filename in filenames:
            i = 0
            source = os.path.join(dirpath, filename)
            target = os.path.join(directory, filename)

            while os.path.exists(target):
                i += 1
                file_parts = os.path.splitext(os.path.basename(filename))

                target = os.path.join(
                    directory,
                    file_parts[0] + "_" + str(i) + file_parts[1],
                )

            shutil.move(source, target)

            print("Moved ", source, " to ", target)

        if dirpath != directory:
            os.rmdir(dirpath)

            print("Deleted ", dirpath)

#credit to: https://amitd.co/code/python/flatten-a-directory

In [11]:
flatten(os.path.dirname("/Users/dmusa/Documents/UCSD/2021-2022/DDS_BLM/BLM_Tweets/twitter"))

In [12]:
tweets_path = "/Users/dmusa/Documents/UCSD/2021-2022/DDS_BLM/BLM_Tweets/"
file_list = [tweets_path + f for f in os.listdir(tweets_path)]
csv_list = []

FileNotFoundError: [Errno 2] No such file or directory: '/Users/dmusa/Documents/UCSD/2021-2022/DDS_BLM/BLM_Tweets/'

In [None]:
for file in sorted(file_list):
    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))

csv_merged = pd.concat(csv_list, ignore_index=True)
csv_merged.to_csv(tweets_path + 'tweets_id_full.csv', index=False)

Tweet IDs successfully merged into one big CSV ready to be permutated.