In [2]:
import pandas as pd
import numpy as np

### 📂 Datei einlesen 

In [3]:
df = pd.read_csv("/Users/david/Desktop/dataplus/PROJEKTWOCHE 21.05.-03.06./data/2025-05-21-washington-post-police-shootings-export.csv")

In [4]:
df

Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved
0,2015-01-02,Lewis Lee Lembke,47.0,male,gun,White,Aloha,OR,not,False,False,"Washington County Sheriff's Office, OR"
1,2015-01-02,Tim Elliot,53.0,male,gun,Asian,Shelton,WA,not,False,True,"Mason County Sheriff's Office, WA"
2,2015-01-03,John Paul Quintero,23.0,male,unarmed,Hispanic,Wichita,KS,not,False,False,"Wichita Police Department, KS"
3,2015-01-04,Kenneth Joe Brown,18.0,male,gun,White,Guthrie,OK,not,False,False,"Oklahoma Highway Patrol, OK"
4,2015-01-04,Michael Rodriguez,39.0,male,other,Hispanic,Evans,CO,not,False,False,"Evans Police Department, CO"
...,...,...,...,...,...,...,...,...,...,...,...,...
10425,2024-12-30,Timothy Woods,26.0,male,gun,Black,Orlando,FL,foot,False,False,"Orlando Police Department, FL"
10426,2024-12-30,,38.0,male,knife,Unknown,Sacramento,CA,,True,True,"Sacramento County Sheriff's Department, CA"
10427,2024-12-30,Kenneth Thaddeus Roberts Jr.,48.0,male,gun,Black,Indianapolis,IN,car,False,False,"Indianapolis Metropolitan Police Department, I..."
10428,2024-12-31,Moses Alik,22.0,male,knife,Unknown,Celina,OH,,False,False,"Mercer County Sheriff's Office, OH"


### 🕒 Datum konvertieren

In [5]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')


### 📍 Jahr extrahieren für spätere Filter/Plots


In [6]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month 


### 🔎 Überblick: Spalten & Datentypen


In [7]:
print("Spalten & Datentypen:")
print(df.dtypes)

Spalten & Datentypen:
date                           datetime64[ns]
name                                   object
age                                   float64
gender                                 object
armed                                  object
race                                   object
city                                   object
state                                  object
flee                                   object
body_camera                              bool
signs_of_mental_illness                  bool
police_departments_involved            object
year                                    int32
month                                   int32
dtype: object


### 👀 Erste fünf Zeilen


In [8]:
print("\nVorschau auf die Daten:")
print(df.head())


Vorschau auf die Daten:
        date                name   age gender    armed      race     city  \
0 2015-01-02    Lewis Lee Lembke  47.0   male      gun     White    Aloha   
1 2015-01-02          Tim Elliot  53.0   male      gun     Asian  Shelton   
2 2015-01-03  John Paul Quintero  23.0   male  unarmed  Hispanic  Wichita   
3 2015-01-04   Kenneth Joe Brown  18.0   male      gun     White  Guthrie   
4 2015-01-04   Michael Rodriguez  39.0   male    other  Hispanic    Evans   

  state flee  body_camera  signs_of_mental_illness  \
0    OR  not        False                    False   
1    WA  not        False                     True   
2    KS  not        False                    False   
3    OK  not        False                    False   
4    CO  not        False                    False   

              police_departments_involved  year  month  
0  Washington County Sheriff's Office, OR  2015      1  
1       Mason County Sheriff's Office, WA  2015      1  
2           Wich

### 🧼 Anzahl fehlender Werte pro Spalte

In [9]:
print("\nAnzahl fehlender Werte:")
print(df.isnull().sum())


Anzahl fehlender Werte:
date                              0
name                            318
age                             372
gender                           20
armed                           211
race                              0
city                             74
state                             0
flee                           1493
body_camera                       0
signs_of_mental_illness           0
police_departments_involved       1
year                              0
month                             0
dtype: int64


### 🏷️ Einzigartige Werte in wichtigen Spalten checken

In [10]:
print("\nEthnien:", df['race'].unique())
print("Bodycam:", df['body_camera'].unique())
print("Bewaffnet:", df['armed'].unique())
print("Mentale Störung:", df['signs_of_mental_illness'].unique())
print("Fluchtverhalten:", df['flee'].unique())


Ethnien: ['White' 'Asian' 'Hispanic' 'Black' 'Other' 'Unknown' 'Native American'
 'White,Black,Native American' 'Native American,Hispanic' 'White,Hispanic'
 'Black,Hispanic' 'White,Black' 'White,Asian']
Bodycam: [False  True]
Bewaffnet: ['gun' 'unarmed' 'other' 'replica' 'knife' 'blunt_object' nan 'vehicle'
 'undetermined' 'other,gun' 'unknown' 'blunt_object,blunt_object'
 'gun,knife' 'knife,blunt_object' 'vehicle,gun' 'gun,vehicle'
 'replica,vehicle' 'blunt_object,knife' 'knife,vehicle'
 'vehicle,knife,other' 'knife,knife' 'replica,knife'
 'other,blunt_object,knife' 'other,knife' 'vehicle,knife' 'gun,other'
 'blunt_object,other' 'knife,replica' 'knife,unknown'
 'replica,blunt_object' 'blunt_object,gun']
Mentale Störung: [False  True]
Fluchtverhalten: ['not' 'car' 'foot' 'other' nan]


### "Unknown" bei Ethnie entfernen


In [11]:
df_clean = df[df['race'] != 'Unknown']


### flee-Spalte vereinheitlichen → derzeit steht  'not', aber vermutlich gemeint als "Not fleeing"

In [12]:
df['flee'] = df['flee'].replace({
    'not': 'Not fleeing',
    'foot': 'Fleeing - on foot',
    'car': 'Fleeing - in vehicle',
    'other': 'Fleeing - other',
    np.nan: 'Unknown'
})

### Neue Spalte hinzufügen 'armed_simple' für 'armed' oder 'unarmed' zur Vereinfachung

In [13]:
import numpy as np

def classify_armed_status(value):
    if pd.isna(value):
        return 'unknown'
    value = str(value).lower()
    if 'unarmed' in value:
        return 'unarmed'
    elif value in ['no', 'none', 'n/a', 'unknown']:
        return 'unknown'
    else:
        return 'armed'

df['armed_simple'] = df['armed'].apply(classify_armed_status)
df

Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved,year,month,armed_simple
0,2015-01-02,Lewis Lee Lembke,47.0,male,gun,White,Aloha,OR,Not fleeing,False,False,"Washington County Sheriff's Office, OR",2015,1,armed
1,2015-01-02,Tim Elliot,53.0,male,gun,Asian,Shelton,WA,Not fleeing,False,True,"Mason County Sheriff's Office, WA",2015,1,armed
2,2015-01-03,John Paul Quintero,23.0,male,unarmed,Hispanic,Wichita,KS,Not fleeing,False,False,"Wichita Police Department, KS",2015,1,unarmed
3,2015-01-04,Kenneth Joe Brown,18.0,male,gun,White,Guthrie,OK,Not fleeing,False,False,"Oklahoma Highway Patrol, OK",2015,1,armed
4,2015-01-04,Michael Rodriguez,39.0,male,other,Hispanic,Evans,CO,Not fleeing,False,False,"Evans Police Department, CO",2015,1,armed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10425,2024-12-30,Timothy Woods,26.0,male,gun,Black,Orlando,FL,Fleeing - on foot,False,False,"Orlando Police Department, FL",2024,12,armed
10426,2024-12-30,,38.0,male,knife,Unknown,Sacramento,CA,Unknown,True,True,"Sacramento County Sheriff's Department, CA",2024,12,armed
10427,2024-12-30,Kenneth Thaddeus Roberts Jr.,48.0,male,gun,Black,Indianapolis,IN,Fleeing - in vehicle,False,False,"Indianapolis Metropolitan Police Department, I...",2024,12,armed
10428,2024-12-31,Moses Alik,22.0,male,knife,Unknown,Celina,OH,Unknown,False,False,"Mercer County Sheriff's Office, OH",2024,12,armed


### fehlende Zeilen in armed/unarmed entfernen

In [14]:
df_clean = df[df['armed_simple'].isin(['armed', 'unarmed'])]
df_clean

Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved,year,month,armed_simple
0,2015-01-02,Lewis Lee Lembke,47.0,male,gun,White,Aloha,OR,Not fleeing,False,False,"Washington County Sheriff's Office, OR",2015,1,armed
1,2015-01-02,Tim Elliot,53.0,male,gun,Asian,Shelton,WA,Not fleeing,False,True,"Mason County Sheriff's Office, WA",2015,1,armed
2,2015-01-03,John Paul Quintero,23.0,male,unarmed,Hispanic,Wichita,KS,Not fleeing,False,False,"Wichita Police Department, KS",2015,1,unarmed
3,2015-01-04,Kenneth Joe Brown,18.0,male,gun,White,Guthrie,OK,Not fleeing,False,False,"Oklahoma Highway Patrol, OK",2015,1,armed
4,2015-01-04,Michael Rodriguez,39.0,male,other,Hispanic,Evans,CO,Not fleeing,False,False,"Evans Police Department, CO",2015,1,armed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10425,2024-12-30,Timothy Woods,26.0,male,gun,Black,Orlando,FL,Fleeing - on foot,False,False,"Orlando Police Department, FL",2024,12,armed
10426,2024-12-30,,38.0,male,knife,Unknown,Sacramento,CA,Unknown,True,True,"Sacramento County Sheriff's Department, CA",2024,12,armed
10427,2024-12-30,Kenneth Thaddeus Roberts Jr.,48.0,male,gun,Black,Indianapolis,IN,Fleeing - in vehicle,False,False,"Indianapolis Metropolitan Police Department, I...",2024,12,armed
10428,2024-12-31,Moses Alik,22.0,male,knife,Unknown,Celina,OH,Unknown,False,False,"Mercer County Sheriff's Office, OH",2024,12,armed


In [15]:
print("\nEthnien:", df['race'].unique())



Ethnien: ['White' 'Asian' 'Hispanic' 'Black' 'Other' 'Unknown' 'Native American'
 'White,Black,Native American' 'Native American,Hispanic' 'White,Hispanic'
 'Black,Hispanic' 'White,Black' 'White,Asian']


In [16]:
df_2024 = df[df['date'].dt.year == 2024]
print(df_2024)

            date                          name   age  gender    armed  \
9255  2024-01-01              Sidney Tafokitau  44.0    male      gun   
9256  2024-01-01           Katelynn Rose Smith  29.0  female      gun   
9257  2024-01-01       Victor Figueroa Roblero  48.0    male  unarmed   
9258  2024-01-01           Aaron Travis Watson  35.0    male      gun   
9259  2024-01-03              Rakim A. Tillery  35.0    male      gun   
...          ...                           ...   ...     ...      ...   
10425 2024-12-30                 Timothy Woods  26.0    male      gun   
10426 2024-12-30                           NaN  38.0    male    knife   
10427 2024-12-30  Kenneth Thaddeus Roberts Jr.  48.0    male      gun   
10428 2024-12-31                    Moses Alik  22.0    male    knife   
10429 2024-12-31           James Junior Holder  41.0    male      gun   

           race          city state                  flee  body_camera  \
9255      Other      Honolulu    HI           Not

In [17]:
print("\nEthnien:", df['race'].unique())
df_race_other = df[df['race'] == 'Other']
df_race_other

df_race_unknown =df[df['race'] == 'Unknown']
df_race_unknown



Ethnien: ['White' 'Asian' 'Hispanic' 'Black' 'Other' 'Unknown' 'Native American'
 'White,Black,Native American' 'Native American,Hispanic' 'White,Hispanic'
 'Black,Hispanic' 'White,Black' 'White,Asian']


Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved,year,month,armed_simple
58,2015-01-25,William Campbell,59.0,male,gun,Unknown,Winslow,NJ,Not fleeing,False,False,"Winslow Police Department, NJ",2015,1,armed
241,2015-03-30,John Marcell Allen,54.0,male,gun,Unknown,Boulder City,NV,Not fleeing,False,False,"Boulder City Police Department, NV",2015,3,armed
343,2015-05-07,Joseph Hilton Roy,72.0,male,knife,Unknown,Lawrenceville,GA,Not fleeing,False,True,"Gwinnett County Police Department, GA",2015,5,armed
400,2015-05-31,James Anthony Morris,40.0,male,gun,Unknown,Medford,OR,Not fleeing,False,True,"Medford Police Department, OR",2015,5,armed
420,2015-06-08,James Johnson,54.0,male,gun,Unknown,Beech Grove,IN,Not fleeing,False,True,"Beech Grove Police Department, IN",2015,6,armed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10423,2024-12-29,Jordan Parisien,32.0,male,undetermined,Unknown,Belcourt,ND,Unknown,False,True,"Bureau of Indian Affairs, ND",2024,12,armed
10424,2024-12-30,Nathan Paul,43.0,male,unknown,Unknown,West Point,UT,Unknown,False,True,"Clinton Police Department, UT",2024,12,unknown
10426,2024-12-30,,38.0,male,knife,Unknown,Sacramento,CA,Unknown,True,True,"Sacramento County Sheriff's Department, CA",2024,12,armed
10428,2024-12-31,Moses Alik,22.0,male,knife,Unknown,Celina,OH,Unknown,False,False,"Mercer County Sheriff's Office, OH",2024,12,armed


In [18]:
df_clean
df_clean.to_excel("output.xlsx")

In [19]:
df_clean = df[df['gender'] != 'Unknown']
df_clean

Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved,year,month,armed_simple
0,2015-01-02,Lewis Lee Lembke,47.0,male,gun,White,Aloha,OR,Not fleeing,False,False,"Washington County Sheriff's Office, OR",2015,1,armed
1,2015-01-02,Tim Elliot,53.0,male,gun,Asian,Shelton,WA,Not fleeing,False,True,"Mason County Sheriff's Office, WA",2015,1,armed
2,2015-01-03,John Paul Quintero,23.0,male,unarmed,Hispanic,Wichita,KS,Not fleeing,False,False,"Wichita Police Department, KS",2015,1,unarmed
3,2015-01-04,Kenneth Joe Brown,18.0,male,gun,White,Guthrie,OK,Not fleeing,False,False,"Oklahoma Highway Patrol, OK",2015,1,armed
4,2015-01-04,Michael Rodriguez,39.0,male,other,Hispanic,Evans,CO,Not fleeing,False,False,"Evans Police Department, CO",2015,1,armed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10425,2024-12-30,Timothy Woods,26.0,male,gun,Black,Orlando,FL,Fleeing - on foot,False,False,"Orlando Police Department, FL",2024,12,armed
10426,2024-12-30,,38.0,male,knife,Unknown,Sacramento,CA,Unknown,True,True,"Sacramento County Sheriff's Department, CA",2024,12,armed
10427,2024-12-30,Kenneth Thaddeus Roberts Jr.,48.0,male,gun,Black,Indianapolis,IN,Fleeing - in vehicle,False,False,"Indianapolis Metropolitan Police Department, I...",2024,12,armed
10428,2024-12-31,Moses Alik,22.0,male,knife,Unknown,Celina,OH,Unknown,False,False,"Mercer County Sheriff's Office, OH",2024,12,armed


In [20]:
df_clean
df_cleancurrent = df_clean.dropna()
df_cleancurrent.to_excel("output.xlsx")

### Spalte 'race' weiter vereinfachen: mehr als eine Ethnie wird zu 'multi/other'

In [21]:
def simplify_race(value):
    if isinstance(value, str):
        value = value.strip()
        if value.lower() == "unknown" or value == "":
            return "Unknown"
        elif "," in value:
            return "Multi/Other"
        else:
            return value
    else:
        return "Unknown"


In [22]:
df['race_clean'] = df['race'].apply(simplify_race)


In [23]:
print("Fehlende Werte:", df['race_clean'].isna().sum())
print(df['race_clean'].value_counts(dropna=False))


Fehlende Werte: 0
race_clean
White              4657
Black              2484
Hispanic           1717
Unknown            1192
Asian               184
Native American     146
Other                37
Multi/Other          13
Name: count, dtype: int64


In [24]:
df['race_clean'] = df['race_clean'].replace({'Other': 'Multi/Other'})


In [25]:
df['race_clean'].value_counts()


race_clean
White              4657
Black              2484
Hispanic           1717
Unknown            1192
Asian               184
Native American     146
Multi/Other          50
Name: count, dtype: int64

### Ditionary für Vergleiche mit Gesamtbevölkerung gemessen an Census von 2020

In [34]:
population_shares = {
    'White': 0.583,              # White alone
    'Black': 0.136,              # Black alone
    'Hispanic': 0.193,           # Näherung über "Some Other Race"
    'Asian': 0.063,              # Asian alone
    'Native American': 0.012,    # American Indian and Alaska Native
    'Multi/Other': 0.013,        # Mixed/Other (inkl. Pacific, multi-race)
    'Unknown': None              # Keine Vergleichszahl
}


In [35]:
victim_shares = df['race_clean'].value_counts(normalize=True)

print("📊 Vergleich Opferanteil vs. Bevölkerungsanteil:\n")
for group, pop_share in population_shares.items():
    victim_share = victim_shares.get(group, 0)
    if pop_share is not None:
        ratio = victim_share / pop_share
        print(f"{group:<16} | Opfer: {victim_share:.1%} | Bevölkerung: {pop_share:.1%} → Verhältnis: {ratio:.2f}")
    else:
        print(f"{group:<16} | Opfer: {victim_share:.1%} | Bevölkerung: —      → Verhältnis: n/a")


📊 Vergleich Opferanteil vs. Bevölkerungsanteil:

White            | Opfer: 44.7% | Bevölkerung: 58.3% → Verhältnis: 0.77
Black            | Opfer: 23.8% | Bevölkerung: 13.6% → Verhältnis: 1.75
Hispanic         | Opfer: 16.5% | Bevölkerung: 19.3% → Verhältnis: 0.85
Asian            | Opfer: 1.8% | Bevölkerung: 6.3% → Verhältnis: 0.28
Native American  | Opfer: 1.4% | Bevölkerung: 1.2% → Verhältnis: 1.17
Multi/Other      | Opfer: 0.5% | Bevölkerung: 1.3% → Verhältnis: 0.37
Unknown          | Opfer: 11.4% | Bevölkerung: —      → Verhältnis: n/a


In [36]:
victim_shares

race_clean
White              0.446500
Black              0.238159
Hispanic           0.164621
Unknown            0.114286
Asian              0.017641
Native American    0.013998
Multi/Other        0.004794
Name: proportion, dtype: float64

In [37]:
df

Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved,year,month,armed_simple,race_clean
0,2015-01-02,Lewis Lee Lembke,47.0,male,gun,White,Aloha,OR,Not fleeing,False,False,"Washington County Sheriff's Office, OR",2015,1,armed,White
1,2015-01-02,Tim Elliot,53.0,male,gun,Asian,Shelton,WA,Not fleeing,False,True,"Mason County Sheriff's Office, WA",2015,1,armed,Asian
2,2015-01-03,John Paul Quintero,23.0,male,unarmed,Hispanic,Wichita,KS,Not fleeing,False,False,"Wichita Police Department, KS",2015,1,unarmed,Hispanic
3,2015-01-04,Kenneth Joe Brown,18.0,male,gun,White,Guthrie,OK,Not fleeing,False,False,"Oklahoma Highway Patrol, OK",2015,1,armed,White
4,2015-01-04,Michael Rodriguez,39.0,male,other,Hispanic,Evans,CO,Not fleeing,False,False,"Evans Police Department, CO",2015,1,armed,Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10425,2024-12-30,Timothy Woods,26.0,male,gun,Black,Orlando,FL,Fleeing - on foot,False,False,"Orlando Police Department, FL",2024,12,armed,Black
10426,2024-12-30,,38.0,male,knife,Unknown,Sacramento,CA,Unknown,True,True,"Sacramento County Sheriff's Department, CA",2024,12,armed,Unknown
10427,2024-12-30,Kenneth Thaddeus Roberts Jr.,48.0,male,gun,Black,Indianapolis,IN,Fleeing - in vehicle,False,False,"Indianapolis Metropolitan Police Department, I...",2024,12,armed,Black
10428,2024-12-31,Moses Alik,22.0,male,knife,Unknown,Celina,OH,Unknown,False,False,"Mercer County Sheriff's Office, OH",2024,12,armed,Unknown


In [38]:
df.to_csv("Dataset Police Violence.csv")