In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


# Load the dataset from the CSV file
data = pd.read_csv('final_df.csv')

# Columns used as model features; each is cast to float below
numeric_columns = [
    'Percentage_Grocery_Area',
    'Percentage_Bus_Area',
    'Percentage_Train_Area',
    'Percent_SW_Bus_Area',
    'Percent_SW_Train_Area',
    'Percent_SW_Grocery_Area',
    'Ratio_SW_To_Street',
    'Low Vehicle Access',
    'Total Population',
    '% Male',
    'Median age',
    '18+%',
    'White %',
    'African American %',
    'American Indian %',
    'Asian %',
    'Pacific Islander %',
    'Other %',
    'Hispanic %',
    'No HS',
    'HS',
    'Associate or less',
    'Bachelor or more',
    'Total Poverty 25+ %',
    'Median Earnings',
]

# Drop every ',' from the feature columns, then convert them to float
data[numeric_columns] = (
    data[numeric_columns]
    .replace(',', '', regex=True)
    .astype(float)
)

# Features are the numeric columns; the target is the LILA column
X, y = data[numeric_columns], data['LILA']

# Define a function to remove commas from string columns
def remove_commas(df):
    """Return a copy of ``df`` with commas stripped from its string columns.

    Only columns whose dtype is ``object`` are processed; all other columns
    are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is NOT modified: the work happens on a copy, so
        callers such as ``FunctionTransformer``/``ColumnTransformer`` never
        see their input frame change underneath them.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    cleaned = df.copy()
    for col in cleaned.columns:
        if cleaned[col].dtype == 'object':  # Check if the column contains string values
            # regex=False: ',' is a literal character, not a pattern
            cleaned[col] = cleaned[col].str.replace(',', '', regex=False)
    return cleaned

# Create a transformer to remove commas
comma_remover = FunctionTransformer(remove_commas)

# X and y were already derived from `data` earlier in this cell, so the
# identical second "split into features and target" step is not repeated here.

# Split the data into training and testing sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model pipeline: comma removal on the feature columns, then a CART tree.
# random_state pins the tree's internal randomness so re-running gives the same model.
pipeline = Pipeline([
    ('comma_remover', ColumnTransformer(transformers=[('remove_commas', comma_remover, numeric_columns)])),
    ('classifier', DecisionTreeClassifier(criterion="gini", random_state=42))
])

# Fit the model pipeline
pipeline.fit(X_train, y_train)

# Make predictions
prediction_CART = pipeline.predict(X_test)

# Calculate baseline accuracy: the percentage of test rows in the most frequent
# class, i.e. the accuracy of always predicting that class.
# LILA holds numeric 0/1 labels, so the previous comparison against the string
# 'Low' never matched and the baseline always came out as 0.0.
baseline_accuracy = y_test.value_counts(normalize=True).max() * 100
print("Baseline Accuracy:", baseline_accuracy)

# Score predictions against the true labels
def eval_prediction(pred, actual):
    """Return how many entries of ``pred`` equal the matching entry of ``actual``.

    ``pred == actual`` is evaluated element-wise (array / Series inputs), and
    the resulting booleans are summed to give the number of matches.
    """
    is_match = pred == actual
    return sum(is_match)

# Number of correctly classified test rows, then the pipeline's own test-set score
cart_correct = eval_prediction(prediction_CART, y_test)
cart_score = pipeline.score(X_test, y_test)
print("CART:", cart_correct)
print("CART Accuracy:", cart_score)


Baseline Accuracy: 0.0
CART: 46
CART Accuracy: 0.92


In [None]:
import pandas as pd

# Re-load the dataset from the CSV file
data = pd.read_csv('final_df.csv')

# Feature columns (cast to float below)
numeric_columns = [
    # area / sidewalk coverage measures
    'Percentage_Grocery_Area', 'Percentage_Bus_Area', 'Percentage_Train_Area',
    'Percent_SW_Bus_Area', 'Percent_SW_Train_Area', 'Percent_SW_Grocery_Area',
    'Ratio_SW_To_Street', 'Low Vehicle Access',
    # population make-up
    'Total Population', '% Male', 'Median age', '18+%',
    'White %', 'African American %', 'American Indian %', 'Asian %',
    'Pacific Islander %', 'Other %', 'Hispanic %',
    # education, poverty and earnings
    'No HS', 'HS', 'Associate or less', 'Bachelor or more',
    'Total Poverty 25+ %', 'Median Earnings',
]

# Remove every ',' from the feature columns and convert them to float
comma_free = data[numeric_columns].replace(',', '', regex=True)
data[numeric_columns] = comma_free.astype(float)

# Feature matrix and target vector
X = data[numeric_columns]
y = data['LILA']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed.
# NOTE: X_train / X_test / y_train / y_test are not defined in this cell; they
# are the split produced by train_test_split in the first cell.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train, y_train)

prediction_RF = random_forest.predict(X_test)  # Use X_test for predictions

# Baseline accuracy: the percentage of test rows in the most frequent class,
# i.e. the accuracy of always predicting that class.
# LILA holds numeric 0/1 labels, so the previous comparison against the string
# 'Low' never matched and the baseline always came out as 0.0.
baseline_accuracy = y_test.value_counts(normalize=True).max() * 100
print("Baseline Accuracy:", baseline_accuracy)

def eval_prediction(pred, actual):
    """Count the positions at which ``pred`` and ``actual`` hold equal values.

    The element-wise comparison ``pred == actual`` yields booleans, which are
    summed to give the number of agreeing entries.
    """
    agreement = pred == actual
    return sum(agreement)

# Number of correctly classified test rows, then the forest's own test-set score
rf_correct = eval_prediction(prediction_RF, y_test)
rf_score = random_forest.score(X_test, y_test)
print("Random Forest:", rf_correct)
print("Random Forest Accuracy:", rf_score)


Baseline Accuracy: 0.0
Random Forest: 46
Random Forest Accuracy: 0.92


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Multinomial naive Bayes with smoothing alpha=0.1; fit() returns the fitted estimator.
# NOTE(review): MultinomialNB is documented for discrete count features, while these
# features are percentages/ratios/earnings - consider whether GaussianNB fits better.
NB_model = MultinomialNB(
    alpha=0.1,
    fit_prior=True,
    class_prior=None,
).fit(X_train, y_train)

# Predictions on the held-out test rows
prediction_NB = NB_model.predict(X_test)

# Accuracy on the training rows and on the test rows
accuracy_train = accuracy_score(y_true=y_train, y_pred=NB_model.predict(X_train))
accuracy_test = accuracy_score(y_true=y_test, y_pred=prediction_NB)

print("Naive Bayes (Training Set):", accuracy_train)
print("Naive Bayes (Test Set):", accuracy_test)

Naive Bayes (Training Set): 0.7236180904522613
Naive Bayes (Test Set): 0.78


In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Shared k-fold splitter: 10 shuffled folds with a fixed seed, reused for every model
# so all three are scored on the same train/validation partitions.
# NOTE: X_train / y_train come from the train_test_split in the first cell.
k = 10
crossvalidation = KFold(n_splits=k, random_state=1, shuffle=True)

# k-fold cross-validation on the cart01 decision tree model (CART: Gini impurity).
# random_state fixes the tree's internal randomness so the scores are reproducible.
cart01 = DecisionTreeClassifier(criterion="gini", max_leaf_nodes=5, random_state=1)
cart_cv_scores = cross_val_score(cart01, X_train, y_train, cv=crossvalidation)
print("Cart cross-validation scores with k=10: ", cart_cv_scores)
print("Average score of all folds:", cart_cv_scores.mean())

# k-fold cross-validation on the c5.0 decision tree model.
# C5.0 splits on information gain, so this tree uses the entropy criterion; it was
# previously configured identically to cart01 (gini), which made the two models
# the same estimator differing only by unseeded randomness.
c5 = DecisionTreeClassifier(criterion="entropy", max_leaf_nodes=5, random_state=1)
c5_cv_scores = cross_val_score(c5, X_train, y_train, cv=crossvalidation)
print("C5.0 cross-validation scores with k=10: ", c5_cv_scores)
print("Average score of all folds:", c5_cv_scores.mean())

# k-fold cross-validation on the random forest model (10 trees, seeded for reproducibility)
rf02 = RandomForestClassifier(n_estimators=10, criterion="gini", random_state=1)
random_forest_scores = cross_val_score(rf02, X_train, y_train, cv=crossvalidation)
print("Random Forest cross-validation scores with k=10: ", random_forest_scores)
print("Average score of all folds:", random_forest_scores.mean())

Cart cross-validation scores with k=10:  [0.9        0.8        1.         0.95       0.75       0.8
 0.9        0.85       0.85       0.94736842]
Average score of all folds: 0.874736842105263
C5.0 cross-validation scores with k=10:  [0.9        0.8        0.95       0.95       0.75       0.8
 0.9        0.9        0.85       0.94736842]
Average score of all folds: 0.874736842105263
Random Forest cross-validation scores with k=10:  [0.9        0.95       0.95       0.95       0.8        0.85
 0.95       0.95       0.85       0.94736842]
Average score of all folds: 0.9097368421052632


In [None]:
# Keep only the LILA target plus the feature columns, in this order
skimmed_df = data.loc[:, [
    'LILA',
    'Percentage_Grocery_Area',
    'Percentage_Bus_Area',
    'Percentage_Train_Area',
    'Percent_SW_Bus_Area',
    'Percent_SW_Train_Area',
    'Percent_SW_Grocery_Area',
    'Ratio_SW_To_Street',
    'Low Vehicle Access',
    'Total Population',
    '% Male',
    'Median age',
    '18+%',
    'White %',
    'African American %',
    'American Indian %',
    'Asian %',
    'Pacific Islander %',
    'Other %',
    'Hispanic %',
    'No HS',
    'HS',
    'Associate or less',
    'Bachelor or more',
    'Total Poverty 25+ %',
    'Median Earnings',
]]
# Bare expression so the notebook renders the frame
skimmed_df

Unnamed: 0,LILA,Percentage_Grocery_Area,Percentage_Bus_Area,Percentage_Train_Area,Percent_SW_Bus_Area,Percent_SW_Train_Area,Percent_SW_Grocery_Area,Ratio_SW_To_Street,Low Vehicle Access,Total Population,...,Asian %,Pacific Islander %,Other %,Hispanic %,No HS,HS,Associate or less,Bachelor or more,Total Poverty 25+ %,Median Earnings
0,0.0,100.000000,100.000000,0.079401,100.000000,0.000000,100.000000,1.318388,0.0,1865.0,...,0.000,0.0,0.000,0.000,0.000,0.044,0.208,0.748,0.00,70865.0
1,1.0,88.690629,89.209792,0.079285,88.019348,0.000000,93.880678,1.391051,1.0,5934.0,...,0.003,0.0,0.041,0.145,0.241,0.275,0.386,0.098,0.19,24759.0
2,1.0,99.037916,62.301747,0.079378,69.271071,0.000000,99.351739,1.454830,1.0,4080.0,...,0.000,0.0,0.057,0.312,0.221,0.251,0.302,0.225,0.12,28621.0
3,0.0,51.526823,40.718601,0.078624,17.349277,0.000000,97.090419,0.961124,0.0,3948.0,...,0.077,0.0,0.037,0.087,0.019,0.140,0.283,0.558,0.02,51631.0
4,0.0,55.475474,0.078866,0.078866,0.000000,0.000000,54.468474,1.591162,0.0,1652.0,...,0.000,0.0,0.000,0.089,0.000,0.077,0.249,0.674,0.00,81724.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,0.0,100.000000,100.000000,67.825859,100.000000,63.700463,100.000000,1.189196,0.0,2336.0,...,0.000,0.0,0.124,0.254,0.049,0.130,0.386,0.435,0.14,29408.0
245,0.0,99.944559,89.266664,100.000000,94.001284,100.000000,100.000000,1.222004,1.0,3548.0,...,0.077,0.0,0.076,0.669,0.168,0.357,0.233,0.243,0.16,23909.0
246,0.0,100.000000,99.843438,23.039483,100.000000,30.052765,100.000000,1.482250,1.0,6488.0,...,0.038,0.0,0.114,0.401,0.272,0.271,0.310,0.147,0.12,24800.0
247,0.0,93.022094,55.169994,72.071857,68.944116,69.124597,99.501462,1.309342,0.0,3664.0,...,0.039,0.0,0.232,0.534,0.200,0.381,0.242,0.177,0.12,24225.0


In [None]:
# Boolean mask: True wherever skimmed_df holds a missing value
nan_values = skimmed_df.isnull()

# Per-column summaries (reduce down the rows): any NaN present, and how many
columns_with_nan = nan_values.any(axis=0)
nan_count_per_column = nan_values.sum(axis=0)

# Per-row summaries (reduce across the columns): any NaN present, and how many
rows_with_nan = nan_values.any(axis=1)
nan_count_per_row = nan_values.sum(axis=1)

# Print each heading followed by its summary on the next line
print("Columns with NaN values:", columns_with_nan, sep="\n")
print("\nRows with NaN values:", rows_with_nan, sep="\n")
print("\nNumber of NaN values per column:", nan_count_per_column, sep="\n")
print("\nNumber of NaN values per row:", nan_count_per_row, sep="\n")

Columns with NaN values:
LILA                       False
Percentage_Grocery_Area    False
Percentage_Bus_Area        False
Percentage_Train_Area      False
Percent_SW_Bus_Area        False
Percent_SW_Train_Area      False
Percent_SW_Grocery_Area    False
Ratio_SW_To_Street         False
Low Vehicle Access         False
Total Population           False
% Male                     False
Median age                 False
18+%                       False
White %                    False
African American %         False
American Indian %          False
Asian %                    False
Pacific Islander %         False
Other %                    False
Hispanic %                 False
No HS                      False
HS                         False
Associate or less          False
Bachelor or more           False
Total Poverty 25+ %        False
Median Earnings            False
dtype: bool

Rows with NaN values:
0      False
1      False
2      False
3      False
4      False
       ...  
244 

In [None]:
# Print a summary of the full frame: index range, column count, dtype counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Columns: 113 entries, census_number to Median Earnings Graduate or Professional Degree
dtypes: float64(60), object(53)
memory usage: 219.9+ KB


In [None]:
# Use these column names instead; the target column is LILA (low income, low access).
columns = skimmed_df.columns
# Show the selected column labels
print(columns)

Index(['LILA', 'Percentage_Grocery_Area', 'Percentage_Bus_Area',
       'Percentage_Train_Area', 'Percent_SW_Bus_Area', 'Percent_SW_Train_Area',
       'Percent_SW_Grocery_Area', 'Ratio_SW_To_Street', 'Low Vehicle Access',
       'Total Population', '% Male', 'Median age', '18+%', 'White %',
       'African American %', 'American Indian %', 'Asian %',
       'Pacific Islander %', 'Other %', 'Hispanic %', 'No HS', 'HS',
       'Associate or less', 'Bachelor or more', 'Total Poverty 25+ %',
       'Median Earnings'],
      dtype='object')


In [None]:
# Pairwise correlation of the numeric columns only.
# `data` still contains object (string) columns with values such as '2,866';
# passing them to DataFrame.corr() raises
# "ValueError: could not convert string to float", so they are excluded first.
corr = data.select_dtypes(include='number').corr()

ValueError: could not convert string to float: '2,866'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=92d3bad9-b1e6-4572-bf43-ff2358b72657' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>