# <font color='#eb3483'> COMPAS ANALYSIS </font>


Our group analyzed the COMPAS Dataset to understand criminal recidivism more clearly.

We hypothesized that a machine learning analysis of the COMPAS two-year recidivism dataset (a CSV file) would show a higher predicted likelihood of recidivism for African-American defendants than for other groups.

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('compas-analysis-master/compas-scores-two-years.csv') 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

#This is so that you don't see a bunch of code 'warnings' (things that you could change but don't have to right now)
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns

#This makes all of our graphs show up in our notebook when they're made
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

sns.set(rc={'figure.figsize':(6,6)}) 
%matplotlib inline

## <font color='#eb3483'> Introduction </font>


### <font color='#eb3483'> Understanding the Initial Dataset </font>

In [None]:
# First five rows: a quick look at the columns and the kind of values they hold.
df.head()

In [None]:
# Last five rows.
df.tail()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Every column label in the frame.
df.columns

In [None]:
# The dtype pandas inferred for each column when reading the CSV.
df.dtypes

### <font color='#eb3483'> Unnecessary/Duplicate Column Removal </font>

In [None]:
# Columns treated as unnecessary for the analysis (judging by their labels:
# record id, person names and case numbers).
del_col_list = ['id', 'name', 'first', 'last', 'c_case_number',
                'r_case_number', 'vr_case_number']

# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone from `df`, and a plain drop would raise KeyError.
df = df.drop(columns=del_col_list, errors="ignore")
df.head()

In [None]:
# Size of the frame before looking for repeats
print(df.shape)

# duplicated() flags every row that repeats an earlier row in full;
# selecting with that mask keeps only the repeats.
duplicate_rows_df = df.loc[df.duplicated()]

# Number of repeated rows (first element of the shape)
print(duplicate_rows_df.shape)

## <font color='#eb3483'> Understanding The Top Offenders </font>

In [None]:
# Order the rows from most to fewest priors and keep the first five.
top_reoffenders = (
    df
    .sort_values('priors_count', ascending=False)
    .iloc[:5]
)
top_reoffenders

In [None]:
def find_min_max_in(col, data=None):
    """Return the rows holding the largest and the smallest value of ``col``.

    Parameters
    ----------
    col : str
        Column whose extremes are looked up.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df`` so existing
        calls such as ``find_min_max_in('priors_count')`` keep working.

    Returns
    -------
    pandas.DataFrame
        Two columns, labelled with the index of the max row and of the min
        row (in that order). The rows of the result are the columns of the
        searched frame.
    """
    frame = df if data is None else data

    # idxmax / idxmin give the index label of the row holding the extreme value.
    top = frame[col].idxmax()
    top_df = pd.DataFrame(frame.loc[top])

    bottom = frame[col].idxmin()
    bottom_df = pd.DataFrame(frame.loc[bottom])

    # Place the two rows next to each other for a max-vs-min comparison.
    info_df = pd.concat([top_df, bottom_df], axis=1)
    return info_df

# Compare the rows with the highest and the lowest priors_count side by side.
find_min_max_in('priors_count')

### <font color='#eb3483'> Linear Regression </font>

In [None]:
# Read the two-year COMPAS CSV again into its own frame; `df` (loaded from the
# same file) has already had several columns dropped above.
two_years = pd.read_csv("compas-analysis-master/compas-scores-two-years.csv")

In [None]:
target_variable = "priors_count"

# NOTE(review): `numerical_data` was used below without being defined in any
# earlier cell, so a fresh "Restart & Run All" stopped here with a NameError.
# It is rebuilt from `two_years` as the numeric columns only (the regression
# needs numbers), minus any column that contains missing values (the
# estimator rejects NaN). Confirm this matches the originally intended
# feature set.
numerical_data = two_years.select_dtypes(include="number").dropna(axis="columns")

# NOTE(review): pandas renames repeated CSV headers with a ".1" suffix; if the
# file repeats the target column, that copy would remain among the predictors
# and leak the answer. TODO: inspect `numerical_data.columns`.
independent_variables = numerical_data.drop(columns=target_variable).columns

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    numerical_data.loc[:, independent_variables],  # feature matrix
    numerical_data.loc[:, target_variable],        # target vector
    random_state=13,
    test_size=0.2,
)

In [None]:
# One line per piece of the split: label followed by its shape.
print(
    f"X train {X_train.shape}",
    f"y train {y_train.shape}",
    f"X test {X_test.shape}",
    f"y test {y_test.shape}",
    sep="\n",
)

In [None]:
# Linear regression of the target (priors_count) on the training features.
model = LinearRegression()
# fit() estimates the intercept and one coefficient per feature column.
model.fit(X_train, y_train)

In [None]:
# Intercept: the model's prediction when every feature equals zero.
print(model.intercept_)

# Slopes: one coefficient per feature, labelled with the feature name.
# (The original line ended in `?`, which opens IPython's help pager instead of
# showing the values.)
pd.Series(model.coef_, index=X_train.columns, name="coefficient")

In [None]:
# `predictions` was never computed for the linear model before being used
# here (a fresh kernel raised NameError), so score the held-out rows first.
predictions = model.predict(X_test)

# reset_index() returns a new frame and keeps the original row labels in an
# "index" column, so the added columns below line up by position.
X = X_test.reset_index()
X["priors_count"] = y_test.tolist()   # true target values
X["prediction"] = predictions         # model output for the same rows
X.head()

In [None]:
# True vs. predicted priors_count for the held-out rows, plotted against
# two_year_recid.
fig, ax = plt.subplots()
sns.scatterplot(x=X["two_year_recid"], y=X["priors_count"], label='True', ax=ax)
sns.scatterplot(x=X["two_year_recid"], y=X["prediction"], label='Predictions', ax=ax)
# Label through the Axes object rather than reaching into seaborn's
# undocumented `sns.mpl` alias for matplotlib; the trailing `;` hides the
# Text(...) repr.
ax.set_ylabel("Priors Count");

In [None]:
# True vs. predicted priors_count for the held-out rows, plotted against
# juv_fel_count.
fig, ax = plt.subplots()
sns.scatterplot(x=X["juv_fel_count"], y=X["priors_count"], label='True', ax=ax)
sns.scatterplot(x=X["juv_fel_count"], y=X["prediction"], label='Predictions', ax=ax)
# Label through the Axes object rather than reaching into seaborn's
# undocumented `sns.mpl` alias for matplotlib; the trailing `;` hides the
# Text(...) repr.
ax.set_ylabel("Priors Count");

## <font color='#eb3483'> Statistics </font>

In [None]:
import scipy.stats as stats

In [None]:
# NOTE(review): `survey` is not created anywhere earlier in the notebook, so
# this section failed with a NameError on a fresh kernel. It is assumed here
# to be a copy of the cleaned COMPAS frame `df`, which presumably still holds
# the race / age / sex / priors_count / juv_fel_count columns used below.
# Confirm the intended source.
survey = df.copy()
survey.to_csv("data/survey.csv", index=False)

In [None]:
# Summary of the race column (for a text column pandas reports
# count / unique / top / freq).
survey.race.describe()

In [None]:
# Summary of the age column (for a numeric column: count, mean, std, quartiles).
survey.age.describe()

In [None]:
# Same summary for the number of prior offences.
survey.priors_count.describe()

In [None]:
# Most frequent value(s) of race.
survey.race.mode()

In [None]:
# Most frequent value(s) of sex.
survey.sex.mode()

In [None]:
# Average priors_count within each race group.
survey.groupby('race')["priors_count"].mean()

In [None]:
# Correlation matrix between juv_fel_count and priors_count.
survey[['juv_fel_count', 'priors_count']].corr()

In [None]:
# priors_count per race, coloured by is_recid. Note this plots from `df`,
# not from `survey`.
ax = sns.scatterplot(x="race", y="priors_count", data=df, hue = "is_recid")

## <font color='#eb3483'> Machine Learning </font>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the dataset used for the classification section and preview it.
compas = pd.read_csv('data/compas-scores.csv')
compas.head()

In [None]:
# Grid of pairwise plots across the columns of `compas`.
sns.pairplot(compas)

In [None]:
from ipywidgets import interact, fixed

#Create our plotting function
def plotRecidivismPercent(df, col):
    """Draw a horizontal bar chart of the mean ``Two_yr_Recidivism`` per value of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col`` and a ``Two_yr_Recidivism`` column.
    col : str
        Column to group by; one bar is drawn per distinct value.
    """
    # Group the frame that was passed in. The earlier version ignored `df`
    # and always read the global `compas`.
    # The string 'mean' selects pandas' built-in group mean; passing np.mean
    # to agg() is deprecated in recent pandas.
    df.groupby(col).agg({'Two_yr_Recidivism': 'mean'}).plot.barh()

#Let's look at all columns (you might want to drop numeric columns 
#that aren't binary but you can also just ignore that graph)
# Every column except the label itself is offered as a grouping column.
columns_to_plot = compas.drop('Two_yr_Recidivism',axis=1).columns
# interact() calls the plotting function with a `col` picked from
# columns_to_plot; fixed() passes `compas` as `df` without a widget for it.
interact(plotRecidivismPercent, 
         col=columns_to_plot, df=fixed(compas));

In [None]:
#Get our data into the right format
X = compas.drop('Two_yr_Recidivism', axis=1)
Y = compas['Two_yr_Recidivism']
X_tr, X_test, Y_tr, Y_test = train_test_split(X,Y, test_size = 0.2)

In [None]:
# Training labels produced by the split above.
Y_tr

In [None]:
#Instantiate our logistic regression model
logreg = LogisticRegression()

#Fit our training data
logreg.fit(X_tr, Y_tr)

#Predict on our test data
predictions = logreg.predict(X_test)

predictions[:10]

In [None]:
#Check accuracy
# Share of test rows whose predicted label equals the true label.
print("Accuracy: ", (predictions == Y_test).mean())

In [None]:
from sklearn.model_selection import cross_val_score
# Fresh, unfitted estimator: cross_val_score fits its own copies per fold.
logreg = LogisticRegression()
# Mean accuracy over 10 cross-validation folds on the full X / Y.
cross_val_score(logreg, X, Y, scoring="accuracy", 
                cv=10).mean()

In [None]:
logreg = LogisticRegression()
# Same 10-fold procedure, scored with ROC AUC instead of accuracy.
cross_val_score(logreg, X, Y, scoring="roc_auc", 
                cv=10).mean()

In [None]:
# Train once on the full training split, then score each group separately.
logreg = LogisticRegression()
logreg.fit(X_tr, Y_tr)

for col in ['African_American', 'Asian', 'Hispanic', 'Native_American', 'Other']:
    # Test-split rows flagged (value 1) as belonging to this group; computed
    # once and reused for both the features and the labels.
    in_group = X_test[col] == 1

    # A small group can end up with no rows in the test split; predict()
    # raises on an empty input, so report that instead of crashing.
    if not in_group.any():
        print("Accuracy (%s): "%col, "no rows in the test split")
        continue

    # A separate name is used so the overall `predictions` from the earlier
    # cell is not overwritten by the last group's predictions.
    group_predictions = logreg.predict(X_test[in_group])
    print("Accuracy (%s): "%col, (group_predictions == Y_test[in_group]).mean())

## <font color='#eb3483'> Aggregations </font>


### <font color='#eb3483'> Quick knowledge check! </font>

## <font color='#eb3483'> Slicing </font>

### <font color='#eb3483'> Quick knowledge check! </font>

## <font color='#eb3483'> Fancy Indexing </font>


### <font color='#eb3483'> Quick knowledge check! </font>