## This is the crime thing Dr. Page asked us to do

In [7]:
import warnings
import shutup; shutup.please()
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor


In [8]:
# Read the data

df = pd.read_csv('data/CommViolPredUnnormalizedData.txt', encoding='latin-1',header=None)
df.columns
#the column names are the second word in each row of data/crime_headings.txt
with open('data/crime_headings.txt') as f:
    headings = f.readlines()
col_names = []
types = []
for heading in headings:
    if len(heading.split()) <= 1:
        continue
    col_names.append(heading.split()[1])
    if heading.split()[2] == 'numeric':
        types.append(float)
    else:
        types.append(str)

df.columns = col_names

#drop drop rows with "?" values
df = df.replace('?', np.nan)

df = df.astype(dict(zip(col_names, types)))

#communityname, countyCode, communityCode, fold are not predictive so drop them
df = df.drop(['communityname', 'countyCode', 'communityCode', 'fold'], axis=1)
df.head()







Unnamed: 0,State,pop,perHoush,pctBlack,pctWhite,pctAsian,pctHisp,pct12-21,pct12-29,pct16-24,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,violentPerPop,nonViolPerPop
0,NJ,11980.0,3.1,1.37,91.78,6.5,1.88,12.47,21.44,10.93,...,14.0,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59
1,PA,23123.0,2.82,0.8,95.57,3.44,0.85,11.01,21.3,10.48,...,57.0,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95
2,OR,29344.0,2.43,0.74,94.33,3.43,2.35,11.36,25.88,11.01,...,274.0,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51
3,NY,16656.0,2.4,1.7,97.35,0.5,0.7,12.55,25.2,12.19,...,225.0,1301.78,716.0,4142.56,47.0,271.93,,,306.64,
4,MN,11245.0,2.76,0.53,89.16,1.17,0.52,24.46,40.53,28.69,...,91.0,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,,9988.79


In [9]:
#useful functions
def drop_rows_missing_target(df):
    return df.dropna(subset=['nonViolPerPop'])


def fill_with_mean(df):
    for column in df.columns:
        if df[column].dtype == float:
            df[column] = df[column].fillna(df[column].mean())
    return df

def fill_with_median(df):
    for column in df.columns:
        if df[column].dtype == float:
            df[column] = df[column].fillna(df[column].median())
    return df

def convert_categorical_to_numeric(df):
    for column in df.columns:
        if df[column].dtype != float:
            df[column] = df[column].astype('category')
            df[column] = df[column].cat.codes
    return df

def normalize(df):
    for column in df.columns:
        if df[column].dtype == float:
            df[column] = (df[column] - df[column].mean()) / df[column].std()
    return df

def remove_random_features(df, n):
    dropped_cols = np.random.choice(df.columns[:-1], n, replace=False)
    return df.drop(dropped_cols, axis=1), dropped_cols


# pseudo-Explainable Approach - Random Forest Regressor

In [10]:

def feature_selection(df, features_as_targets):
    #drop the last 18 columns
    if features_as_targets:
        X = df.drop(df.columns[-1:], axis=1)
    else:
        X = df.drop(df.columns[-18:], axis=1)
        
    y = df['nonViolPerPop']
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X, y)
    importance = rf.feature_importances_
    # print(np.sort(importance))
    indices = np.argsort(importance)[::-1]
    kept_cols = []
    for f in range(X.shape[1]):
        if importance[indices[f]] > 0.001:
            kept_cols.append(X.columns[indices[f]])
    #put nonViolPerPop back in
    kept_cols.append('nonViolPerPop')
    return df[kept_cols], kept_cols


def fine_tune_features_approach(df, features_as_targets = False, do_normalize = False):
    if do_normalize:
        black_box_df = fill_with_mean(normalize(drop_rows_missing_target(convert_categorical_to_numeric(df))))
    else: 
        black_box_df = fill_with_mean(drop_rows_missing_target(convert_categorical_to_numeric(df)))


    black_box_df, kept_cols = feature_selection(black_box_df, features_as_targets)

    train, test = train_test_split(black_box_df, test_size=0.2)

    X_train = train.drop(train.columns[-1], axis=1)
    y_train = train['nonViolPerPop']
    X_test = test.drop(test.columns[-1], axis=1)
    y_test = test['nonViolPerPop']

    # Create the model with 100 trees
    model = RandomForestRegressor(n_estimators=100,
                                    bootstrap = True,
                                    max_features = 'sqrt')
    # Fit on training data

    model.fit(X_train, y_train)

    # Actual class predictions
    rf_predictions = model.predict(X_test)

    #calculate mae
    mae = np.mean(abs(rf_predictions - y_test))
    print('Mean Absolute Error:', mae)
    print('Kept columns:', kept_cols)
    print()
    
print('Using targets as features (normalize):')
fine_tune_features_approach(df, True, True)
print('\n\nNot using targets as features (normalize):')
fine_tune_features_approach(df, False, True)
print('\n\nUsing targets as features (no normalize):')
fine_tune_features_approach(df, True, False)
print('\n\nNot using targets as features (no normalize):')
fine_tune_features_approach(df, False, False)


Using targets as features (normalize):
Mean Absolute Error: 0.07604501269206164
Kept columns: ['larcPerPop', 'burglPerPop', 'autoTheftPerPop', 'robbbPerPop', 'pctWhite', 'nonViolPerPop']



Not using targets as features (normalize):
Mean Absolute Error: 0.4488394962321919
Kept columns: ['pctKids2Par', 'pct2Par', 'pctAllDivorc', 'rentLowQ', 'pct12-17w2Par', 'houseVacant', 'pctHousOccup', 'persHomeless', 'pctMaleDivorc', 'pctFemDivorc', 'pctPopDenseHous', 'kidsBornNevrMarr', 'persPerRenterOccup', 'persPoverty', 'pctWhite', 'State', 'pctWorkMom-6', 'landArea', 'pctSmallHousUnits', 'pctSameState-5', 'rentQrange', 'pctEmployProfServ', 'blackPerCap', 'pctAsian', 'pctBlack', 'pctWfarm', 'pctVacant6up', 'pctKids-4w2Par', 'popDensity', 'ownHousLowQ', 'pctHousWOplumb', 'pctHousWOphone', 'pctEmployMfg', 'pctRetire', 'otherPerCap', 'medOwnCostpct', 'medYrHousBuilt', 'pctVacantBoarded', 'hispPerCap', 'pctSameCounty-5', 'NAperCap', 'pctSameHouse-5', 'pctPoverty', 'medOwnCostPctWO', 'pctBornStateResi

In [11]:
# https://www.geeksforgeeks.org/random-forest-regression-in-python/

def random_forest_approach(df, features_as_targets = False, do_normalize = False):
    for i in range(1):
        black_box_df = df.copy()

        if not features_as_targets:
            black_box_df = black_box_df.drop(black_box_df.columns[-17:-1], axis=1)
        if do_normalize:
            black_box_df = fill_with_mean(normalize(drop_rows_missing_target(convert_categorical_to_numeric(black_box_df))))
        else:
            black_box_df = fill_with_mean(drop_rows_missing_target(convert_categorical_to_numeric(black_box_df)))


        train, test = train_test_split(black_box_df, test_size=0.2)
        X_train = train.drop(train.columns[-1], axis=1)
        y_train = train['nonViolPerPop']
        X_test = test.drop(test.columns[-1], axis=1)
        y_test = test['nonViolPerPop']

        # Create the model with 100 trees
        model = RandomForestRegressor(n_estimators=100,
                                        bootstrap = True,
                                        max_features = 'sqrt')
        # Fit on training data
        model.fit(X_train, y_train)

        # Actual class predictions
        rf_predictions = model.predict(X_test)

        #calculate mae
        mae = np.mean(abs(rf_predictions - y_test))
        print('Mean Absolute Error:', mae)

        
        
print('Using targets as features (normalize):')
random_forest_approach(df, True, True)
print('\n\nNot using targets as features (normalize):')
random_forest_approach(df, False, True)
print('\n\nUsing targets as features (no normalize):')
random_forest_approach(df, True, False)
print('\n\nNot using targets as features (no normalize):')
random_forest_approach(df, False, False)




Using targets as features (normalize):
Mean Absolute Error: 0.22052116640642036


Not using targets as features (normalize):
Mean Absolute Error: 0.4794718581961971


Using targets as features (no normalize):
Mean Absolute Error: 646.535995754717


Not using targets as features (no normalize):
Mean Absolute Error: 1171.0694393867925


# Explainable - Lasso

In [12]:
#doing lasso 
from sklearn.linear_model import LassoCV

def lasso(df, features_as_targets, do_normalize):
    if do_normalize:
        lasso_df = fill_with_mean(normalize(drop_rows_missing_target(convert_categorical_to_numeric(df))))
    else:
        lasso_df = fill_with_mean(drop_rows_missing_target(convert_categorical_to_numeric(df)))



    if features_as_targets:
        X = lasso_df.drop(lasso_df.columns[-1:], axis=1)
    else:
        X = lasso_df.drop(lasso_df.columns[-18:], axis=1)
    y = lasso_df['nonViolPerPop']

    alpha_predict = LassoCV(cv=5, random_state=0, max_iter=10000)
    alpha_predict.fit(X, y)
    print("alpha: " + str(alpha_predict.alpha_))
    model = Lasso(alpha=alpha_predict.alpha_)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = np.mean(abs(y_pred - y_test))
    print('Mean Absolute Error:', mae)
    coef_dict = { k:v for (k,v) in zip(model.coef_, X)}
    #sort coef_dict by abs value
    coef_dict = {k: v for k, v in sorted(coef_dict.items(), key=lambda item: abs(item[0]), reverse=True)}

    print('kept Columns:', [f'{v}: {k:.5f}' for k,v in coef_dict.items() if k > 0.00001 or k < -0.00001])
    #print R squared
    print('R squared:', model.score(X_test, y_test))




print('Using targets as features (normalized):')
lasso(df, True, True)
print('\n\nNot using targets as features (normalized):')
lasso(df, False, True)
print('\n\nUsing targets as features (not normalized):')
lasso(df, True, False)
print('\n\nNot using targets as features (not normalized):')
lasso(df, False, False)


Using targets as features (normalized):
alpha: 0.0011443488073478402
Mean Absolute Error: 0.0011888445823240094
kept Columns: ['larcPerPop: 0.69853', 'burglPerPop: 0.27936', 'autoTheftPerPop: 0.18384', 'arsonsPerPop: 0.01364']
R squared: 0.9999977157988518


Not using targets as features (normalized):
alpha: 0.003747226429140997
Mean Absolute Error: 0.45009916973494546
kept Columns: ['pctForeignBorn: 0.27864', 'pctKids2Par: -0.27127', 'pctWsocsec: 0.20505', 'pctPoverty: 0.15096', 'popDensity: -0.13780', 'pctEmploy: 0.13577', 'pctLowEdu: -0.12349', 'pctMaleNevMar: 0.11704', 'pctMaleDivorc: 0.11179', 'pctRetire: -0.10782', 'pctImmig-8: -0.10683', 'pctFgnImmig-10: 0.09722', 'pctKidsBornNevrMarr: 0.09693', 'perHoush: -0.08886', 'pct12-29: -0.08309', 'pctLargHous: 0.08168', 'whitePerCap: 0.08090', 'medOwnCostpct: -0.08086', 'houseVacant: 0.07387', 'medYrHousBuilt: 0.06661', 'pctEmployMfg: -0.06495', 'pctSpeakOnlyEng: -0.06250', 'ownHousUperQ: -0.06214', 'pctSmallHousUnits: -0.06123', 'kidsB

# Decision Tree