In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [3]:
# Load the crime records from a CSV path relative to the notebook's working directory.
df = pd.read_csv('datasets/crime.csv')
# Preview the first 20 rows to inspect the available columns.
df.head(20)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
5,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,INGLESIDE,NONE,0 Block of TEDDY AV,-122.403252,37.713431
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564
8,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,RICHMOND,NONE,600 Block of 47TH AV,-122.508194,37.776601
9,2015-05-13 23:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,CENTRAL,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802


In [4]:
# Distinct free-text descriptions; stored here but not referenced again in the visible cells.
crimes = df['Descript'].unique()
# Display the distinct category labels.
df['Category'].unique()

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT',
       'VANDALISM', 'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS',
       'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY',
       'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
       'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT',
       'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EMBEZZLEMENT', 'SUICIDE', 'LOITERING',
       'SEX OFFENSES NON FORCIBLE', 'EXTORTION', 'GAMBLING', 'BAD CHECKS',
       'TREA', 'RECOVERED VEHICLE', 'PORNOGRAPHY/OBSCENE MAT'],
      dtype=object)

In [19]:
def createProbabilites(category, data=None):
    """Estimate per-word probabilities for the descriptions of one category.

    For every distinct whitespace-separated word found in the 'Descript'
    values of rows whose 'Category' equals ``category``, the result holds
    (number of descriptions containing that word) / (total number of words
    across those descriptions).

    Parameters
    ----------
    category : str
        Value of the 'Category' column to select.
    data : pandas.DataFrame, optional
        Frame with 'Category' and 'Descript' columns. Defaults to the
        notebook-level ``df`` so existing one-argument calls keep working.

    Returns
    -------
    dict
        Maps each word to its probability. Empty when no row matches.
    """
    if data is None:
        data = df

    # Only the descriptions of the requested category; missing values cannot be tokenised.
    descriptions = data.loc[data['Category'] == category, 'Descript'].dropna()

    # Tokenise once so the word total and the per-word counts use the same notion of "word".
    tokenized = [text.split() for text in descriptions]
    total_words = sum(len(tokens) for tokens in tokenized)

    # Count, for each word, how many descriptions contain it as a whole token.
    # A set per description keeps repeated words from being counted twice, and
    # whole-token matching avoids substring hits such as 'TO' inside 'AUTO'.
    description_counts = {}
    for tokens in tokenized:
        for word in set(tokens):
            description_counts[word] = description_counts.get(word, 0) + 1

    # total_words is non-zero whenever there is at least one word to divide for.
    return {word: count / total_words for word, count in description_counts.items()}

In [22]:
# Per-word probabilities for the descriptions in the 'WARRANTS' category.
warrant_probs = createProbabilites('WARRANTS')

In [28]:
# Pick the three highest-probability words for one category.
# Note: `word` holds the category label used as the key of the displayed dict.
word = 'WARRANTS'
# Sorting the dict iterates its keys; `warrant_probs.get` ranks them by probability.
keys_with_highest_values = sorted(warrant_probs, key=warrant_probs.get, reverse=True)[:3]
# End the cell with the value so the result is actually displayed on re-run.
{word: keys_with_highest_values}

{'WARRANTS': ['WARRANT', 'ARREST', 'TO']}

In [None]:
# Collect the three highest-probability description words for every category.
most_common_words = {}
for category in df['Category'].unique():
    probs = createProbabilites(category)
    # Sort the words by descending probability and keep the top three.
    keys_with_highest_values = sorted(probs, key=probs.get, reverse=True)[:3]
    print(category + ': ' + str(keys_with_highest_values))
    most_common_words[category] = keys_with_highest_values

WARRANTS: ['WARRANT', 'ARREST', 'TO']


In [31]:
# Display the category -> top-words mapping built by the loop over categories.
most_common_words

{'WARRANTS': ['WARRANT', 'ARREST', 'TO']}

In [7]:
def calculate_entropy(column):
    """Return the base-2 entropy of the values in ``column``.

    Each distinct value's frequency is its ``value_counts()`` count divided
    by ``len(column)``; the result is ``-sum(p * log2(p))`` over those
    frequencies.
    """
    n_rows = len(column)
    frequencies = column.value_counts() / n_rows
    # Per-value contribution p * log2(p); negated after summing.
    weighted_logs = frequencies * np.log2(frequencies)
    return -np.sum(weighted_logs)


In [8]:
def calculate_information_gain(data, feature_column, target_column):
    """Return the information gain of splitting ``data`` on ``feature_column``.

    The gain is the entropy of ``target_column`` over all rows minus the
    size-weighted sum of the target entropies within each group of rows
    sharing one value of ``feature_column``.
    """
    # Entropy of the target before any split.
    baseline_entropy = calculate_entropy(data[target_column])

    n_rows = len(data)

    def weighted_entropy(value):
        # Rows whose feature equals this value, weighted by their share of the data.
        rows = data[data[feature_column] == value]
        return (len(rows) / n_rows) * calculate_entropy(rows[target_column])

    # Accumulate in the order the values first appear, starting from 0.
    entropy_after_split = sum(
        weighted_entropy(value) for value in data[feature_column].unique()
    )

    return baseline_entropy - entropy_after_split

In [9]:
# Entropy (base 2) of the 'Category' column.
category_entropy = calculate_entropy(df['Category'])
category_entropy

3.86689113122643

In [10]:
# Print, for each remaining column, the information gain obtained when
# 'Category' is passed as the feature and that column as the target.
# NOTE(review): the argument order looks swapped relative to the function's
# (feature, target) signature; presumably intentional because the quantity is
# symmetric when no values are missing, and grouping by 'Category' is cheaper — confirm.
node1 = []  # not populated in the visible cells
for i in df:
    # Filter first so no information gain is computed for columns that are never printed.
    if i in ('Category', 'X', 'Y'):
        continue
    info = calculate_information_gain(df, 'Category', i)
    print(i + ": " + str(info))
    

Dates: 2.672202504629789
Descript: 3.8583087939997935
DayOfWeek: 0.005054615852106092
PdDistrict: 0.09421120321720222
Resolution: 0.6062725380527971
Address: 0.7990650256962439


In [11]:
# Print, for each remaining column, the information gain obtained when
# 'Descript' is passed as the feature and that column as the target.
node2 = []  # not populated in the visible cells
for i in df:
    # Filter first so no information gain is computed for columns that are never printed.
    if i in ('Category', 'X', 'Y', 'Descript'):
        continue
    info = calculate_information_gain(df, 'Descript', i)
    print(i + ": " + str(info))

Dates: 5.436648597090818
DayOfWeek: 0.015744213497070447
PdDistrict: 0.18439264111698117
Resolution: 0.9178786346152843
Address: 2.108404787544414
