## Police Data Cleaning

In [2]:
# imports
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('Mapping Police Violence-Grid view.csv')
mpv = df[['cause_of_death','race','date','officer_charged','allegedly_armed']]
#renaming "allegedly_armed" to "armed"
mpv = mpv.rename(columns={'allegedly_armed': 'armed'})
mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
0,,Black,3/14/2022,,Allegedly Armed
1,,White,3/12/2022,,Allegedly Armed
2,Gunshot,White,3/12/2022,No Known Charges,Allegedly Armed
3,Gunshot,,3/12/2022,No Known Charges,Allegedly Armed
4,Gunshot,White,3/11/2022,No Known Charges,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,1/1/2013,No Known Charges,Unclear
10071,Gunshot,Black,1/1/2013,No Known Charges,Allegedly Armed
10072,Gunshot,White,1/1/2013,No Known Charges,Allegedly Armed
10073,Gunshot,Hispanic,1/1/2013,No Known Charges,Allegedly Armed


In [4]:
# Converting 'date' column into datatype 'datetime' belonging to pandas

mpv['date'] = pd.to_datetime(mpv['date'], infer_datetime_format=True)

# Can't seem to convert the column dtype to the desired type but the values have been converted successfully
# df['date'] = df['date'].astype('pd._libs.tslibs.timestamps.Timestamp[ns]')

mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
0,,Black,2022-03-14,,Allegedly Armed
1,,White,2022-03-12,,Allegedly Armed
2,Gunshot,White,2022-03-12,No Known Charges,Allegedly Armed
3,Gunshot,,2022-03-12,No Known Charges,Allegedly Armed
4,Gunshot,White,2022-03-11,No Known Charges,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,2013-01-01,No Known Charges,Unclear
10071,Gunshot,Black,2013-01-01,No Known Charges,Allegedly Armed
10072,Gunshot,White,2013-01-01,No Known Charges,Allegedly Armed
10073,Gunshot,Hispanic,2013-01-01,No Known Charges,Allegedly Armed


In [5]:
type(mpv['date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [17]:
# Removing incidents occuring after 2020-09 as we do not have tweets after that period
cutoff_date = pd.Timestamp(2020, 10, 1)

mpv = mpv[mpv.date < cutoff_date]

# Setting date as index
mpv.set_index('date')

Unnamed: 0_level_0,cause_of_death,race,officer_charged,armed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-09-30,Gunshot,White,0,Allegedly Armed
2020-09-30,Gunshot,Black,0,Unclear
2020-09-29,Gunshot,Unknown race,0,Allegedly Armed
2020-09-28,Gunshot,Black,0,Allegedly Armed
2020-09-28,Gunshot,Hispanic,0,Allegedly Armed
...,...,...,...,...
2013-01-01,Gunshot,Hispanic,0,Unclear
2013-01-01,Gunshot,Black,0,Allegedly Armed
2013-01-01,Gunshot,White,0,Allegedly Armed
2013-01-01,Gunshot,Hispanic,0,Allegedly Armed


In [7]:
print(mpv['race'].unique())
print(mpv['cause_of_death'].unique())
print(mpv['armed'].unique())

['Unknown race' 'White' 'Black' 'Hispanic' 'Asian' 'Native American' nan
 'Pacific Islander']
['Gunshot' 'Gunshot,Taser' 'Taser' 'Vehicle' 'Physical Restraint'
 'Chemical Agent' 'Beaten' 'Other' 'Bean bag' 'Pepper Spray' 'Asphyxiated'
 'Bomb' 'Taser,Physical Restraint' nan 'Gunshot,Vehicle']
['Allegedly Armed' 'Unclear' 'Unarmed/Did Not Have Actual Weapon'
 'Vehicle' nan]


In [8]:
print(mpv['officer_charged'].unique())

['No Known Charges' '"Charged, Charges Dropped"' 'Charged with a crime'
 nan '"Charged, Convicted, Sentenced to 22.5 years in prison"'
 '"Charged, Convicted"'
 '"Charged, Convicted, Sentenced to life in prison"'
 '"Charged, Convicted, Sentenced to 10 years in prison"'
 '"Charged, Convicted, Sentenced to 7 years in prison"'
 '"Charged, Acquitted"'
 '"Charged, Plead Guilty to Voluntary Manslaughter"'
 '"Charged, Convicted, Sentenced to 1 year in prison and 4 years probation"'
 '"Charged, Convicted, Sentenced to 25 years in prison"'
 '"Charged, Convicted, Sentenced to 2 years suspension and 3 years probation"'
 '"Charged, Convicted, Sentenced to 5-15 years in prison"'
 '"Charged, Convicted, Sentenced to 4.75 years in prison"'
 '"Charged, Mistrial"'
 '"Charged, Convicted, Sentenced to 15 years in prison"'
 '"Charged, Plead No Contest, Sentenced to 10 years probation"'
 '"Charged, Convicted, Sentenced to 6 years in prison"'
 '"Charged, Convicted, Sentenced to 10 Years of Prison"'
 '"Charged

In [9]:
#cleaning "officer_charged"
mpv.loc[mpv['officer_charged'].str.contains('Charged') == True, "officer_charged"] = 1
mpv.loc[mpv['officer_charged'] == 'No Known Charges', 'officer_charged'] = 0
mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
1601,Gunshot,Unknown race,2020-10-01,0,Allegedly Armed
1602,Gunshot,White,2020-09-30,0,Allegedly Armed
1603,Gunshot,Black,2020-09-30,0,Unclear
1604,Gunshot,Unknown race,2020-09-29,0,Allegedly Armed
1605,Gunshot,Black,2020-09-28,0,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,2013-01-01,0,Unclear
10071,Gunshot,Black,2013-01-01,0,Allegedly Armed
10072,Gunshot,White,2013-01-01,0,Allegedly Armed
10073,Gunshot,Hispanic,2013-01-01,0,Allegedly Armed


In [10]:
print(mpv['officer_charged'].unique())

[0 1 nan]


In [11]:
mpv.dtypes

cause_of_death             object
race                       object
date               datetime64[ns]
officer_charged            object
armed                      object
dtype: object

In [12]:
#why the heck is it object and not int

## Variables

### Victim's Cause of Death

Variable name: cause_of_death

Data type: string

Lists cause of death of police victims. Includes: NaN, gunshot, vehicle, taser, beaten, physical restraint, chemical agent, bean bag, pepper spray, asphyxiated, bomb, or multiple causes.

### Race of Victim

Variable name: race

Data type: string

Race of police victim, includes NaN, Black, White, Hispanic, Non-white, Unknown race, Pacific Islander, Native American, or Asian

### Date of Encounter

Variable name: date

Data type: pandas Timestamp

Dates from 2013-01-01 to 2020-09-30 (YYYY-MM-DD)

### Officer Charged

Variable name: officer_charged

Data type: bool

Whether or not the police officer responsible for violence was crimnally or civilly charged. 
If not charged, False or 0. If charged, True or 1. Otherwise, NaN.

### Victim Armed

Variable name (in original data): allegedly_armed

Variable name (in our dataframe): armed

Data type: int

Whether or not victim was armed.
If unarmed (0), armed (1), unclear (3) otherwise (NaN).

## Twitter Data

In [13]:
import os 
import shutil

We need to combine the multiple csv folders of tweet ids into one big one to then randomlly select tweets to hydrate.

In [14]:
def flatten(directory):
    for dirpath, _, filenames in os.walk(directory, topdown=False):
        for filename in filenames:
            i = 0
            source = os.path.join(dirpath, filename)
            target = os.path.join(directory, filename)

            while os.path.exists(target):
                i += 1
                file_parts = os.path.splitext(os.path.basename(filename))

                target = os.path.join(
                    directory,
                    file_parts[0] + "_" + str(i) + file_parts[1],
                )

            shutil.move(source, target)

            print("Moved ", source, " to ", target)

        if dirpath != directory:
            os.rmdir(dirpath)

            print("Deleted ", dirpath)

#credit to: https://amitd.co/code/python/flatten-a-directory

In [15]:
flatten(os.path.dirname("/Users/dmusa/Documents/UCSD/2021-2022/DDS_BLM/BLM_Tweets/twitter"))

In [16]:
tweets_path = "/Users/dmusa/Documents/UCSD/2021-2022/DDS_BLM/BLM_Tweets/"
file_list = [tweets_path + f for f in os.listdir(tweets_path)]
csv_list = []

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/Users/dmusa/Documents/UCSD/2021-2022/DDS_BLM/BLM_Tweets/'

In [None]:
for file in sorted(file_list):
    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))

csv_merged = pd.concat(csv_list, ignore_index=True)
csv_merged.to_csv(tweets_path + 'tweets_id_full.csv', index=False)

Tweet IDs successfully merged into one big CSV ready to be permutated.