## Police Data Cleaning

In [2]:
# imports
import numpy as np
import pandas as pd

In [7]:
# Read the raw Mapping Police Violence export; `df` keeps every original column.
df = pd.read_csv('Mapping Police Violence-Grid view.csv')

# Narrow the frame to the five analysis columns and, in the same step,
# shorten "allegedly_armed" to "armed".
mpv = (
    df[['cause_of_death', 'race', 'date', 'officer_charged', 'allegedly_armed']]
    .rename(columns={'allegedly_armed': 'armed'})
)
mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
0,,Black,3/14/2022,,Allegedly Armed
1,,White,3/12/2022,,Allegedly Armed
2,Gunshot,White,3/12/2022,No Known Charges,Allegedly Armed
3,Gunshot,,3/12/2022,No Known Charges,Allegedly Armed
4,Gunshot,White,3/11/2022,No Known Charges,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,1/1/2013,No Known Charges,Unclear
10071,Gunshot,Black,1/1/2013,No Known Charges,Allegedly Armed
10072,Gunshot,White,1/1/2013,No Known Charges,Allegedly Armed
10073,Gunshot,Hispanic,1/1/2013,No Known Charges,Allegedly Armed


In [23]:
# Print the distinct labels of each categorical column, one array per line,
# in the order race -> cause_of_death -> armed.
print(
    *(mpv[column].unique() for column in ('race', 'cause_of_death', 'armed')),
    sep='\n',
)

['Black' 'White' nan 'Hispanic' 'Non-white' 'Unknown race'
 'Pacific Islander' 'Native American' 'Asian']
[nan 'Gunshot' 'Vehicle' 'Gunshot,Taser' 'Other'
 'Physical Restraint,Beaten' 'Taser' 'Beaten' 'Physical Restraint'
 'Taser,Physical Restraint' 'Gunshot,Vehicle' 'Chemical Agent' 'Bean bag'
 'Pepper Spray' 'Asphyxiated' 'Bomb']
['Allegedly Armed' 'Unarmed/Did Not Have Actual Weapon' nan 'Unclear'
 'Vehicle']


In [22]:
# Count the rows in each `armed` category; value_counts() leaves NaN rows out by default.
mpv['armed'].value_counts()

Allegedly Armed                       7185
Unarmed/Did Not Have Actual Weapon    1329
Unclear                                912
Vehicle                                635
Name: armed, dtype: int64

In [21]:
# Frequency of each cause_of_death label; combined causes such as 'Gunshot,Taser'
# are counted as their own label here.
mpv['cause_of_death'].value_counts()
#NOTE(review): the original note read "proportion of gunshot vs taser, assign to gunshot then" --
#presumably the combined 'Gunshot,Taser' rows are meant to be treated as 'Gunshot' later; confirm

Gunshot                      9312
Taser                         292
Gunshot,Taser                 287
Vehicle                        59
Physical Restraint             53
Beaten                         33
Asphyxiated                     9
Other                           7
Pepper Spray                    5
Taser,Physical Restraint        4
Gunshot,Vehicle                 2
Bean bag                        2
Physical Restraint,Beaten       1
Bomb                            1
Chemical Agent                  1
Name: cause_of_death, dtype: int64

In [53]:
# List every distinct raw `officer_charged` label before the column is recoded to 0/1.
print(mpv['officer_charged'].unique())

[nan 'No Known Charges' 'Charged with vehicular homicide'
 'Charged with a crime' '"Charged, Charges Dropped"'
 '"Charged, Convicted of Manslaughter"'
 '"Charged, Convicted, Sentenced to 22.5 years in prison"'
 '"Charged, Convicted"'
 '"Charged, Convicted, Sentenced to life in prison"'
 '"Charged, Convicted, Sentenced to 10 years in prison"'
 '"Charged, Convicted, Sentenced to 7 years in prison"'
 '"Charged, Acquitted"'
 '"Charged, Plead Guilty to Voluntary Manslaughter"'
 '"Charged, Convicted, Sentenced to 1 year in prison and 4 years probation"'
 '"Charged, Convicted, Sentenced to 25 years in prison"'
 '"Charged, Convicted, Sentenced to 2 years suspension and 3 years probation"'
 '"Charged, Convicted, Sentenced to 5-15 years in prison"'
 '"Charged, Convicted, Sentenced to 4.75 years in prison"'
 '"Charged, Mistrial"'
 '"Charged, Convicted, Sentenced to 15 years in prison"'
 '"Charged, Plead No Contest, Sentenced to 10 years probation"'
 '"Charged, Convicted, Sentenced to 6 years in p

In [8]:
#cleaning "officer_charged"
def encode_officer_charged(value):
    """Recode one raw `officer_charged` entry as a 0/1 flag.

    Any text containing 'Charged' becomes 1.0 and the exact text
    'No Known Charges' becomes 0.0. Everything else (NaN, values that were
    already recoded, unexpected labels) is returned unchanged.
    """
    if isinstance(value, str):
        if 'Charged' in value:
            return 1.0
        if value == 'No Known Charges':
            return 0.0
    return value


# Element-wise mapping instead of the .str accessor: the .str accessor raises
# once the column no longer holds strings, so the earlier version of this cell
# could not be run a second time. This version is safe to re-run.
mpv['officer_charged'] = mpv['officer_charged'].map(encode_officer_charged)
mpv

Unnamed: 0,cause_of_death,race,date,officer_charged,armed
0,,Black,3/14/2022,,Allegedly Armed
1,,White,3/12/2022,,Allegedly Armed
2,Gunshot,White,3/12/2022,0.0,Allegedly Armed
3,Gunshot,,3/12/2022,0.0,Allegedly Armed
4,Gunshot,White,3/11/2022,0.0,Allegedly Armed
...,...,...,...,...,...
10070,Gunshot,Hispanic,1/1/2013,0.0,Unclear
10071,Gunshot,Black,1/1/2013,0.0,Allegedly Armed
10072,Gunshot,White,1/1/2013,0.0,Allegedly Armed
10073,Gunshot,Hispanic,1/1/2013,0.0,Allegedly Armed


In [15]:
# Inspect the distinct values and the Python type of the first entry before casting.
# The type has to be printed explicitly: a bare expression in the middle of a cell
# is evaluated and then thrown away, so it never shows up in the output.
print(mpv['officer_charged'].unique())
print(type(mpv['officer_charged'].iloc[0]))

# Store the 0/1 flag as float so that missing entries can remain NaN.
mpv['officer_charged'] = mpv['officer_charged'].astype(float)

[nan 0.0 1.0]


In [17]:
print(mpv['officer_charged'].unique())

[nan  0.  1.]


In [18]:
# Show the dtype of every column after the float cast of officer_charged.
mpv.dtypes
#officer_charged is float64 rather than int, probably because the column still contains NaN

cause_of_death      object
race                object
date                object
officer_charged    float64
armed               object
dtype: object

In [65]:
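#officer_charged showed up as object rather than int, presumably because it started out as a text column and still contains NaN; it cannot be stored as a plain int while NaN values remain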

## Variables

### cause_of_death

Data type: string

Lists cause of death of police victims. Includes: NaN, gunshot, vehicle, taser, beaten, physical restraint, chemical agent, bean bag, pepper spray, asphyxiated, bomb, or multiple causes.

### race

Data type: string

Race of police victim, includes NaN, Black, White, Hispanic, Non-white, Unknown race, Pacific Islander, Native American, or Asian

### date

Data type: datetime (loaded from the CSV as a string/object column)

Dates from 1/1/2013 to 3/14/2022 in the loaded export (M/D/YYYY)

### officer_charged

Data type: float (a 0.0/1.0 flag; stored as float rather than bool so that missing values can stay NaN)

Whether or not the police officer responsible for the violence was criminally or civilly charged.
If not charged, False or 0. If charged, True or 1. Otherwise, NaN.

### armed

Originally named `allegedly_armed`.

Data type: int

Whether or not victim was armed.
Unarmed is coded 0, armed 1, and unclear 3; anything else is NaN.

## Twitter Data

In [2]:
import os 
import shutil
import numpy as np
import pandas as pd

We need to combine the multiple folders of tweet-ID CSV files into one big CSV so that we can then randomly select tweets to hydrate.

In [2]:
def flatten(directory):
    """Move every file found below ``directory`` into ``directory`` itself.

    The tree is walked bottom-up. Each file in a sub-folder is moved to the
    top level, and each sub-folder is removed once its files have been moved.
    When a file name is already taken at the top level, ``_1``, ``_2``, ...
    is inserted before the extension until a free name is found.

    Files that already sit directly in ``directory`` are left untouched.

    Parameters
    ----------
    directory : str
        Path of the folder to flatten. It is modified on disk.

    Raises
    ------
    OSError
        Propagated from ``shutil.move`` / ``os.rmdir``, e.g. when a sub-folder
        still contains something after its files were moved.
    """
    for dirpath, _, filenames in os.walk(directory, topdown=False):
        # The top-level folder's own files are already where they belong.
        # Without this guard each of them would "collide" with itself
        # (source == target) and be renamed to name_1, name_2, ...
        if dirpath == directory:
            continue

        for filename in filenames:
            source = os.path.join(dirpath, filename)
            target = os.path.join(directory, filename)

            # Find the first free name by appending an increasing counter.
            stem, extension = os.path.splitext(filename)
            i = 0
            while os.path.exists(target):
                i += 1
                target = os.path.join(directory, stem + "_" + str(i) + extension)

            shutil.move(source, target)

            print("Moved ", source, " to ", target)

        # Bottom-up order means nested folders were already removed, so this
        # folder should be empty by now.
        os.rmdir(dirpath)

        print("Deleted ", dirpath)

#credit to: https://amitd.co/code/python/flatten-a-directory

In [3]:
# os.path.dirname() drops the trailing "twitter" component, so the folder that gets
# flattened is .../Tweets_BLM: files from its sub-folders are moved directly into it.
# This moves files and deletes the emptied sub-folders on disk.
flatten(os.path.dirname("/Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM/twitter"))

Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-01.csv  to  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\2013-01.csv
Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-02.csv  to  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\2013-02.csv
Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-03.csv  to  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\2013-03.csv
Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-04.csv  to  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\2013-04.csv
Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-05.csv  to  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\2013-05.csv
Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-06.csv  to  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\2013-06.csv
Moved  /Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM\twitter\2013\2013-07.csv  to  /Users/dmusa/Documents/UCSD/2021

Flattened all folders to make combining into one CSV easier.

In [6]:
tweets_path = "/Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM/"

# Files this notebook itself writes into the same folder. They must not be read
# back in as inputs when the notebook is run again.
generated_files = {'tweets_full.csv', 'tweets_sample_id.csv'}

# Only the monthly tweet-ID CSVs are inputs; skip anything that is not a CSV
# as well as the generated outputs.
file_list = [
    os.path.join(tweets_path, f)
    for f in os.listdir(tweets_path)
    if f.endswith('.csv') and f not in generated_files
]
csv_list = []

In [10]:
# Rebuild the list from scratch instead of appending to it, so running this cell
# more than once does not load (and later concatenate) every file twice.
# Each frame is tagged with the name of the file it came from.
csv_list = [
    pd.read_csv(file).assign(File_Name=os.path.basename(file))
    for file in sorted(file_list)
]

# Stack all frames into one and write the combined table next to the inputs.
csv_merged = pd.concat(csv_list, ignore_index=True)
csv_merged.to_csv(tweets_path + 'tweets_full.csv', index=False)

Tweet IDs successfully merged into one big CSV, ready to be randomly sampled.

In [13]:
# Reload the merged tweet-ID table from disk and preview its first rows.
tweets = pd.read_csv('/Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM/tweets_full.csv')
tweets.head()

Unnamed: 0,status_id,blacklivesmatter,alllivesmatter,bluelivesmatter,File_Name
0,296004745028567040,1.0,0.0,0.0,2013-01.csv
1,299022201141735424,0.0,1.0,0.0,2013-02.csv
2,316973754951540737,1.0,0.0,0.0,2013-03.csv
3,315462982803009536,1.0,0.0,0.0,2013-03.csv
4,316242983945129984,1.0,0.0,0.0,2013-03.csv


In [27]:
# Draw 2,000,000 rows at random. random_state pins the random generator so the
# same rows are selected every time the notebook is run; without it the sample
# (and the exported ID list) changes on each run.
tweets_sample = tweets.sample(n = 2000000, random_state = 42)
tweets_sample.head()

Unnamed: 0,status_id,blacklivesmatter,alllivesmatter,bluelivesmatter,File_Name
74521622,1174068844677058561,1.0,0.0,1.0,2019-09.csv
75450127,1251960723464286209,0.0,0.0,1.0,2020-04.csv
93429558,1270959408730697728,1.0,0.0,0.0,2020-06.csv
58148809,1337235948581629952,1.0,0.0,0.0,2020-12.csv
21257809,1269019273269313543,1.0,0.0,0.0,2020-06.csv


Randomly selected 2,000,000 tweet IDs. Now, we need to export back to CSV file with just the IDs to hydrate the tweets.

In [28]:
# Keep only the tweet ID column of the sample.
tweets_sample_id = tweets_sample['status_id']

In [30]:
# Write the sampled IDs to disk; header=False and index=False leave just the ID values in the file.
tweets_sample_id.to_csv('/Users/dmusa/Documents/UCSD/2021-2022/Tweets_BLM/tweets_sample_id.csv', header=False, index=False)

Now that we have the tweets, let's go over the variables in the dataframe. 

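The creators of the dataset labelled each tweet against three major movements: Black Lives Matter, All Lives Matter, and Blue Lives Matter. Each movement has its own 0/1 indicator column, and a tweet can be flagged for more than one of them (the sample above includes a row with both `blacklivesmatter` and `bluelivesmatter` set to 1.0).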

They used an assortment of keywords and hashtags, as shown below, to categorize the tweets. Each term has a weight that indicates how unique the term is to the topic, so a weight of 1.0 tells us the term showed up only in that topic.

In [4]:
# Load the All Lives Matter topic terms and keep only the term and its weight.
alm = pd.read_csv('/Users/dmusa/Documents/UCSD/2021-2022/BLM_Tweets/topics/alm50.csv')[['term', 'weight']]

# Display the terms with weight exactly 1.0.
# (No sort is needed for this view: every displayed row has the same weight.)
alm.loc[alm['weight'] == 1.0]

Unnamed: 0,term,weight
8,#bible,1.0
32,#ilovejesus,1.0
38,#lfj,1.0
57,#prayergift,1.0
70,#taniaspeaks,1.0
...,...,...
14471,gsr,1.0
14485,justice4cephus,1.0
14489,misconstrues,1.0
14499,stippling,1.0


In [5]:
# Load the Black Lives Matter topic terms and keep only the term and its weight.
blm = pd.read_csv('/Users/dmusa/Documents/UCSD/2021-2022/BLM_Tweets/topics/blm100.csv')[['term', 'weight']]

# Display the terms with weight exactly 1.0.
blm.loc[blm['weight'] == 1.0]

Unnamed: 0,term,weight
33,application,1.0
3072,luther,1.0
4579,#russelllee,1.0
9064,archive,1.0
9306,barack,1.0
...,...,...
44566,walter,1.0
44611,#kohkanta,1.0
44626,#sb50,1.0
45471,#blackqueen,1.0


In [6]:
# Load the Blue Lives Matter topic terms and keep only the term and its weight.
blulm = pd.read_csv('/Users/dmusa/Documents/UCSD/2021-2022/BLM_Tweets/topics/blulm25.csv')[['term', 'weight']]

# Display the terms with weight exactly 1.0.
blulm.loc[blulm['weight'] == 1.0]

Unnamed: 0,term,weight
0,#axlesgarage,1.0
8,#ghetto,1.0
10,#igotyour6,1.0
13,#jeepwrangler,1.0
14,#lawenforcementofficers,1.0
...,...,...
10025,junior,1.0
10029,leslie,1.0
10043,pearce,1.0
10075,wade,1.0
