# Chicago Crime Rates #

We would ultimately like to identify where the most crimes happen throughout Chicago and visualize those locations on a map. We can only use the data from 2012, as the full CSV file is too big for JupyterHub to handle.

In [None]:
# import pandas, seaborn, numpy, and matplotlib
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# importing datetime modules
from datetime import date
from datetime import time
import datetime

# importing calendar
import calendar

In [None]:
# Load the crime records: 'ID' becomes the index and 'Date' is requested to be
# parsed as datetimes. Rows containing any missing value are discarded.
df = pd.read_csv("test.csv", index_col="ID", parse_dates=["Date"])
df = df.dropna()

# Derive the two new columns straight from the parsed timestamps rather than by
# slicing their string form. This also avoids creating notebook-level variables
# named `time` and `date`, which would shadow the `datetime.time` and
# `datetime.date` classes imported at the top of the notebook.
# NOTE: `.dt` raises AttributeError if 'Date' could not be parsed as datetimes,
# which is preferable to silently slicing the wrong characters.
# 'Time' is an 'HH:MM' string on a 24-hour clock.
df['Time'] = df['Date'].dt.strftime('%H:%M')
# 'Date_Refurbished' is the calendar day (time set to midnight) as a datetime.
df['Date_Refurbished'] = df['Date'].dt.normalize()

# Keep 'Date' itself as a string column
df['Date'] = df['Date'].astype(str)

# Replace the spaces in column names with underscores so the columns can be
# reached with attribute access (e.g. df.Primary_Type)
df = df.rename(
    columns={
        'Primary Type': 'Primary_Type',
        'Case Number': 'Case_Number',
        'Location Description': 'Location_Description',
        'Community Area': 'Community_Area',
        'FBI Code': 'FBI_Code',
        'X Coordinate': 'X_Coordinate',
        'Y Coordinate': 'Y_Coordinate',
    }
)

# Dropping "Updated On" and "Unnamed: 0" columns as we do not see a use for them.
# errors="ignore" keeps the cell runnable when the CSV lacks one of them
# ("Unnamed: 0" is only present when the file was saved with its index).
df = df.drop(columns=["Updated On", "Unnamed: 0"], errors="ignore")

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Want to see the information for each column:
# df.info() prints a summary of the DataFrame (column names, non-null counts, dtypes)
df.info()

In [None]:
# Creating a bar chart showing the count of different types of crimes in Chicago
# First identify the unique values that are in the column "Primary_Type"
x_unique = df.Primary_Type.unique()
print(x_unique)
print(len(x_unique))

# Number of records per crime type, sorted from most to least frequent
primary_type_counts = df['Primary_Type'].value_counts()

# Show initial bar chart of 'Primary_Type's of crimes in Chicago.
# Labels and bar lengths both come from the same value_counts() result so each
# bar is paired with its own crime type. (unique() returns the types in order
# of first appearance, which does not line up with the count-sorted values.)
plt.barh(primary_type_counts.index, primary_type_counts.values)
# For a horizontal bar chart the counts run along x and the categories along y
plt.xlabel('Total Count')
plt.ylabel('Type of Crime Committed')
plt.title('Overview of Crimes Committed')

# Value counts for each x_unique identifier
primary_type_counts

In [None]:
# Value counts for each x_unique identifier greater than 500
# (count once, then filter the same Series)
primary_type_counts = df['Primary_Type'].value_counts()
x_unique_500 = primary_type_counts[primary_type_counts > 500]

# Create dataframe for x_unique_500 variable.
# The column is named explicitly instead of renaming whatever name
# value_counts() produced: that name differs between pandas versions
# ('Primary_Type' before 2.0, 'count' from 2.0 on), so a rename keyed on
# 'Primary_Type' can silently do nothing and leave no 'Total_Count' column.
df_2 = x_unique_500.to_frame(name='Total_Count')
df_2.index.name = 'Primary_Type'

# Horizontal plot showing the values that are greater than 500
plt.barh(df_2.index.values, df_2['Total_Count'])
plt.xlabel('Total Count of Crime')
plt.ylabel('Type of Crime')
plt.title('Crimes that have happened 500+ Times')

Above, we only show the primary types of crime that were documented more than 500 times. Below, we are going to create a plot showing the primary types of crime that were documented fewer than 500 times.

In [None]:
# Creating a new variable that has all 'Primary_Types' documented fewer than 500 times
# (count once, then filter the same Series)
primary_type_counts = df['Primary_Type'].value_counts()
x_unique_500_1 = primary_type_counts[primary_type_counts < 500]

# Creating a new DataFrame with the x_unique_500_1 variable.
# The column is named explicitly instead of renaming whatever name
# value_counts() produced: that name differs between pandas versions
# ('Primary_Type' before 2.0, 'count' from 2.0 on), so a rename keyed on
# 'Primary_Type' can silently do nothing and leave no 'Total_Count' column.
df_3 = x_unique_500_1.to_frame(name='Total_Count')
df_3.index.name = 'Primary_Type'

# Plotting df_3 with a horizontal bar chart
plt.barh(df_3.index.values, df_3['Total_Count'])
plt.xlabel('Total Count')
plt.ylabel('Type of Crime')
# This chart holds the rarer crime types, so the title must not say "500+"
plt.title('Crimes that have happened fewer than 500 Times')

### Visualizing the locations of crimes throughout Chicago ###

In [None]:
# Tally the records per location type and keep the five most frequent ones
different_locations = df.Location_Description.value_counts().head(n=5)

print(different_locations)

In [None]:
# Create separate variables for the x/y variable
x_location_unique = df.Location_Description.unique()

# Creating a variable for locations appearing 100 times or more; as well as the dataframe for the variable.
# The comparison is inclusive (>=) so that a location with exactly 100 records
# is kept, matching "100 or more" in the chart title.
location_counts = df['Location_Description'].value_counts()
x_location_100 = location_counts[location_counts >= 100]

# The column is named explicitly instead of renaming whatever name
# value_counts() produced: that name differs between pandas versions
# ('Location_Description' before 2.0, 'count' from 2.0 on), so a rename keyed
# on 'Location_Description' can silently leave no 'Total_Count' column.
df_4 = x_location_100.to_frame(name='Total_Count')
df_4.index.name = 'Location_Description'

# Creating a horizontal bar chart of locations that appeared 100 times or more
plt.barh(df_4.index.values, df_4['Total_Count'])
plt.xlabel('Total Count')
plt.ylabel('Location of Crime')
plt.title('Crimes that happened 100 or more times at a Location')

### Understanding if there are certain days that crimes tend to happen more often on ###

In [None]:
# Tally the records per calendar day and keep the five busiest days
different_times = df.Date_Refurbished.value_counts().head(n=5)
print(different_times)

#### Why is this weird? ####

When looking through our data, we noticed that some days have a large number of recorded crimes, while the very next day may have only one. Our conclusion is that either the 'Date' column records the date the crime was entered into the system, or the counts are incomplete because we could not load the entire data set into JupyterHub without crashing the server.

### Time of Day that Crimes Occur ###

In [None]:
# Creating a variable that will display the amount of crimes that occur at a certain time (Military Time)
# value_counts() sorts by frequency and head() keeps the five most common times
different_hours = df['Time'].value_counts().head()
print(different_hours)

# Creating a new dataframe for the 'Time' column with appropriate titles.
# The column is named explicitly instead of renaming whatever name
# value_counts() produced: that name differs between pandas versions
# ('Time' before 2.0, 'count' from 2.0 on), so a rename keyed on 'Time' can
# silently do nothing and leave no 'Total_Count' column.
df_5 = different_hours.to_frame(name='Total_Count')
df_5.index.name = 'Time'

# Creating a scatter chart for the 'Time' column to show differences in crimes committed in certain hours.
# The x values must come from df_5 (one entry per time of day), not from df:
# df has one index entry per record, so its length never matches the five
# counts being plotted and matplotlib rejects the mismatched sizes.
plt.scatter(df_5.index.values, df_5['Total_Count'])
plt.xlabel('Time of Day (HH:MM)')
plt.ylabel('Total Count')
plt.title('Most Common Times of Day for Crimes')