1. Identify all NAs in the data.
2. Summary Statistics Table
3. Scatterplots
4. Heat Maps (Correlations)
5. Hypothesis Testing
6. Regression

In [88]:
import pandas as pd
import numpy as np

walking_data = pd.read_csv("data_sources/walkable-cities.csv",encoding='latin-1')
# walking_data

In [2]:
#1. Identify all NAs in the data.

col_na = walking_data.isna().sum().to_frame().reset_index(level=0).set_axis(["variable","sum_na"], axis="columns", copy=False)
row_na = walking_data["city_state"]
row_na = pd.concat([row_na, walking_data.isna( ).sum(axis="columns")], axis=1)
row_na = row_na.set_axis(["city_state","sum_na"],axis="columns", copy=False)

col_na.to_csv("column_na_orig.csv", index = False)
row_na.to_csv("row_na_orig.csv", index = False)

# remove cols and rows with more than 15% NAs
nrow = len(walking_data)

keep_cols = list(walking_data.columns[col_na["sum_na"]/nrow <= .15])
walking_data = walking_data[keep_cols]

mcol = len(walking_data.columns)

row_na = walking_data["city_state"]
row_na = pd.concat([row_na, walking_data.isna( ).sum(axis="columns")], axis=1)
row_na = row_na.set_axis(["city_state","sum_na"],axis="columns", copy=False)

keep_rows = row_na["sum_na"]/mcol <= .15
walking_data = walking_data[keep_rows]

walking_data #725 rows × 66 columns

#calc NAs again
col_na = walking_data.isna().sum().to_frame().reset_index(level=0).set_axis(["variable","sum_na"], axis="columns", copy=False)
row_na = walking_data["city_state"]
row_na = pd.concat([row_na, walking_data.isna( ).sum(axis="columns")], axis=1)
row_na = row_na.set_axis(["city_state","sum_na"],axis="columns", copy=False)

col_na.to_csv("column_na_after.csv", index = False)
row_na.to_csv("row_na_after.csv", index = False)

walking_data.to_csv("walkable_cities.csv", index=False)

In [38]:
# variables
demographic = ["pop_estimate_2021", "land_area_sqkm", 
                "pop_per_km2", "white_alone", "black_or_african_american_alone", 
                "american_indian_and_alaska_native_alone", "asian_alone", 
                "native_hawaiian_and_other_pacific_islander_alone", 
                "some_other_race_alone", "two_or_more_races", "hispanic_or_latino"]
environment = ["median_aqi"]
health = ['access2', 'arthritis', 'binge', 'bphigh', 'bpmed', 'cancer', 'casthma',
       'cervical', 'chd', 'checkup', 'cholscreen', 'colon_screen', 'copd',
       'corem', 'corew', 'csmoking', 'dental', 'depression', 'diabetes',
       'ghlth', 'highchol', 'kidney', 'lpa', 'mammouse', 'mhlth', 'obesity',
       'phlth', 'sleep', 'stroke', 'teethlost', 'cumulative_confirmed',
       'cumulative_deceased']
name = ["place_code",
        "city", 
        "state", 
        "state_code", 
        "city_state", 
        "city_state_code", 
        "place_state_code", 
        "county_state",
        "geo_code"]
walkability = ["walkable",
        "walk_score",
        "bike_score"]
wealth = ["living_wage"]

response = ["walk_score"]

feature = demographic + environment + health + wealth
# feature


In [82]:
# 2. Summary Statistics Table
import xlwt
from xlwt import Workbook
walkable_cities = walking_data[walking_data["walkable"]=="walkable"]
car_cities = walking_data[walking_data["walkable"]=="car dependent"]

with pd.ExcelWriter("Summary Statistics.xlsx") as writer: 
        walking_data[response+feature].describe().to_excel(writer, sheet_name="All Cities")
        walkable_cities[response+feature].describe().to_excel(writer, sheet_name= "Walkable Cities")
        car_cities[response+feature].describe().to_excel(writer, sheet_name= "Car Dependent Cities")


In [36]:
# 3. Scatterplots


In [65]:
# 4. Heat Maps (Correlations)
my_cor = walking_data[response+feature].corr()['walk_score'].sort_values(ascending=False)
my_cor.to_csv("correlations.csv", index = False)
correlated = abs(my_cor) > .30
my_cor[correlated] # moderate to strong correlations 


walk_score               1.000000
pop_per_km2              0.712861
living_wage              0.457959
some_other_race_alone    0.388525
arthritis               -0.301249
csmoking                -0.301729
highchol                -0.313908
white_alone             -0.318448
depression              -0.333355
bphigh                  -0.340854
obesity                 -0.364233
Name: walk_score, dtype: float64

In [64]:
# 5. Hypothesis Testing
import scipy.stats as stats
import numpy as np
import pingouin as pg
import csv

final_result = []
# Conducting two-sample ttest
for feat in feature:
    result = pg.ttest(walkable_cities[feat],
                    car_cities[feat],
                    correction=True)

    final_result.append([feat, result])

with open('ttests.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        for row in final_result:
            csvwriter.writerow(row)   
 


In [105]:
from scipy.optimize import curve_fit

def interp(data):

    # Do the original interpolation
    data.interpolate(method='nearest', xis=0, inplace=True)

    # # Display result
    # print ('Interpolated data:')
    # print (data)
    # print ()

    # Function to curve fit to the data
    def func(x, *params):
        y = 0
        for i in range(len(params)):
            y += params[i] * (x ** i)
        return y

    # Initial parameter guess, just to kick off the optimization
    guess = [1] * len(data.columns)

    # Create copy of data to remove NaNs for curve fitting
    fit_data = data.dropna()

    # Place to store function parameters for each column
    col_params = {}

    # Curve fit each column
    for col in fit_data.columns:
        # Get x & y
        x = fit_data.index.astype(float).values
        y = fit_data[col].values
        # Curve fit column and get curve parameters
        params = curve_fit(func, x, y, guess)
        # Store optimized parameters
        col_params[col] = params[0]

    # Extrapolate each column
    for col in data.columns:
        # Get the index values for NaNs in the column
        x = data[pd.isnull(data[col])].index.astype(float).values
        # Extrapolate those points with the fitted function
        data[col][x] = func(x, *col_params[col])

    # # Display result
    # print ('Extrapolated data:')
    # print (data)
    # print ()

    # print ('Data was extrapolated with these column functions:')

    # # Iterate through the number of parameters in the col_params dictionary and add the corresponding term 
    # # for each parameter in the string that is being printed.
    # for col in col_params:
    #     terms = []
    #     for i in range(len(col_params[col])):
    #         terms.append("{:0.3e} x^{}".format(col_params[col][i], i))
    #     print("f_{}(x) = {}".format(col, " + ".join(terms)))

    return data

In [109]:
# 6. Regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = walking_data[feature]
y = walking_data[response]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_train = pd.DataFrame(X_train).set_axis(feature, axis='columns')
X_test = pd.DataFrame(X_test).set_axis(feature, axis='columns')
y_train = pd.DataFrame(y_train).set_axis(response, axis='columns')
y_train = pd.DataFrame(y_train).set_axis(response, axis='columns')

X_train = interp(X_train)
X_test = interp(X_test)





0

In [None]:
# PCA - Principal component analysis

# Unsupervised Machine Learning Algorithm for Dimensionality Reduction

# normalize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Create an imputer object with strategy='mean' or 'median'
imputer = SimpleImputer(strategy='mean')

# Fit and transform the data using the imputer
data_scaled = imputer.fit_transform(data_scaled)

#Initialize the PCA model and specify the number of components
pca = PCA(n_components=2)

# Fit the PCA model to your data
pca.fit(data_scaled)

# Apply the dimensionality reduction to your data
data_reduced = pca.transform(data_scaled)

In [112]:
# 6. Regression
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix

imputer = SimpleImputer(strategy='mean')

pca = PCA(n_components = 2)
  
# Fit the PCA model to your data
pca.fit(X_train)

# Apply the dimensionality reduction to your data
data_reduced = pca.transform(X_train)

data_reduced

components = pca.components_

# Print the names of the features for each principal component
for i, component in enumerate(components):
    print(f"Principal Component {i+1}:")
    for j, feature_coefficient in enumerate(component):
        print(f"{feature_names[j]}: {feature_coefficient}")
    print("\n")
    
# explained_variance = pca.explained_variance_ratio_

# classifier = LogisticRegression(random_state = 0)
# classifier.fit(X_train, y_train)

# cm = confusion_matrix(y_test, y_pred)

Principal Component 1:


NameError: name 'feature_names' is not defined