### Train Model with Sport Road Segments 

In [9]:
#import statements
import pandas as pd
#train_test_split seperates the data into a training subset and a testing subset (two bins)
##training is what creeates the machine learning model
##testing is what we use for accuracy assessment
from sklearn.model_selection import train_test_split
#Linear regression is our model type
from sklearn.linear_model import LinearRegression
#mean_squared_error and r2_score are our two ways of evaluating accuracy
from sklearn.metrics import mean_squared_error, r2_score
#OneHotEncoder --> This converts categorical data (i.e. sport type) into 'indicator' columns
##indicator columns are yes/no for each category in the datatype
from sklearn.preprocessing import OneHotEncoder

#read in our data that is a 20% sample of the BIG dataset
data = pd.read_csv('SampledData.csv')

#we have a bad type of sport (line) --> remove it
data_filtered = data[data['Sport'] != 'line']

#categorical columns. We need to OneHotEncode these
cat_columns = ['Sport', 'Evening?', 'Weekday?']

#we are transforming the data from having one column for each 'sport'/'timestamp'/'Weekday?' to multiple columns
##Ex. There will be 4 columns for sport now. 'sport_Football', 'sport_hockey', 'sport_volleyball', 'sport_basketball'
##These 4 columns will take 1/0 for yes/no if the column is that type. 
encoder = OneHotEncoder()
encoded_columns = pd.DataFrame(encoder.fit_transform(data_filtered[cat_columns]).toarray(), index=data_filtered.index)
encoded_columns.columns = encoder.get_feature_names_out(cat_columns)

#numerical columns --> numbers that do not need any special processing
num_columns = data_filtered[['Latitude', 'Longitude']]

#put all our columns together
X = pd.concat([num_columns, encoded_columns], axis=1)

#y is our response column aka what we want our machine learning model to predict
y = data_filtered['delta_cost']

#split the dataset into training and testing datasets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#create our model type
model = LinearRegression()

#train our model on the 'training' subset of our data
model.fit(X_train, y_train)

#make predictions on the 'testing' set of data so we can do accuracy assessment
y_pred = model.predict(X_test)

#accuracy assessment
##get MSE using function comparing testing data to what the model predicts
mse = mean_squared_error(y_test, y_pred)
##get R2 using function comparing testing data to what the model predicts
r2 = r2_score(y_test, y_pred)

#print the evaluation metrics
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

#print the coefficients and intercept of the model
##coeficients are slope for each variable
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef}")
print("Intercept:", model.intercept_)

Mean Squared Error: 0.03234998079128853
R-squared Score: 0.0016782138507538402

Model Coefficients:
Latitude: -0.24151785810395376
Longitude: -0.17293293875656413
Sport_Basketball: 474261199.3205644
Sport_Football: 474261199.3193297
Sport_Hockey: 474261199.3192995
Sport_Volleyball: 474261199.32063204
Evening?_False: -4444109681.926476
Evening?_True: -4444109681.929732
Weekday?_False: 23436213213.59019
Weekday?_True: 23436213213.592342
Intercept: -19466364736.218094


### Applying the model in the real world

In [10]:
#Here we input what a user might want for 'Sport', 'Time', and 'Weekday'
input_data = pd.DataFrame({
    'Sport': ['Football'],
    'Evening?': [True],
    'Weekday?': [False]
})

### Getting the base dataset ready

In [11]:
#This function reads in our csv file and converts it to a dictionary
def csv_to_dicts(csv_file):
    df = pd.read_csv(csv_file, usecols=['Latitude', 'Longitude', 'geometry_wkt'])
    data_dicts = df.to_dict(orient='records')
    return data_dicts

#convert our 'Base' data to a dictionary
##Our base data is the baseline road dataset with 'SourceOID', 'Latitude', 'Longitude', 'geometry_wkt'
data_dicts = csv_to_dicts('ML_Base_data.csv')

#create an empty list to store our data
data_frames = []

#We are going to look at each row of data in the dictionary (each road segment)
for data_dict in data_dicts:
    #pull latitude, longitude, geometry
    latitude = data_dict['Latitude']
    longitude = data_dict['Longitude']
    wkt = data_dict['geometry_wkt']
    #create a copy of our input_data and then add in new rows for latitude, longitude, and geometry
    new_data = input_data.copy()
    new_data['Latitude'] = latitude
    new_data['Longitude'] = longitude
    new_data['wkt'] = wkt
    #add the new row of data to our list
    data_frames.append(new_data)

#turn all the rows in one dataframe
output_data = pd.concat(data_frames, ignore_index=True)

In [12]:
#function that uses our machine learning model to predict a new 'delta_cost'
def predict_delta_cost(input_data, encoder, model):
    #use OneHotEncoder to turn our prediction data into multiple categorical columns
    encoded_input = pd.DataFrame(encoder.transform(input_data[cat_columns]).toarray(), index=input_data.index)
    #pulls the categorical columns from when we trained our machine learning model
    encoded_input.columns = encoder.get_feature_names_out(cat_columns)
    
    #combine our OneHotEncoding with our latitude and longitude numerical columns
    input_features = pd.concat([input_data[['Latitude', 'Longitude']], encoded_input], axis=1)
    
    #Make predictions of delta cost
    predicted_delta_cost = model.predict(input_features)
    
    #add predicted delta cost to our original (not onehotencode) dataframe
    input_data['delta_cost'] = predicted_delta_cost

    return input_data

#load dataframe from above
input_data = output_data

#call the prediction function
output_data_delta = predict_delta_cost(input_data, encoder, model)
#output our predicted delta costs to a csv file
output_data_delta.to_csv('outputtestdelta2.csv', index=False)

## Add our predictions to the database

In [None]:
#import psycopg2 to work with database
import psycopg2

#define our connection stuff to database
dbname = 'gis5572'
user = 'postgres'
host = '35.188.97.184'
password = 'Passwordd'
table_name = 'delta_cost_map2'

#connect to database
conn = psycopg2.connect(dbname=dbname, user=user, host=host, password=password)
#create a cursor
cur = conn.cursor()

#drop the existing table if it exists
cur.execute(f"DROP TABLE IF EXISTS {table_name};")

#create a new table to store our data
cur.execute(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    id SERIAL PRIMARY KEY,
    delta_cost NUMERIC,
    geom GEOMETRY(LINESTRING, 4326)
);
""")

#commit changes to ensure the table is created
conn.commit()

#load our CSV file with data and delta values
df = pd.read_csv('outputtestdelta2.csv')

#count to know how many rows we are
count = 1
#loop and look at every row of data
for index, row in df.iterrows():
    print(count)
    count += 1
    #extract the wkt geometry and wkt values
    geom = row['wkt']
    delta_cost = row['delta_cost']
    id = count
    
    #insert data into the database, converting WKT to geometry
    cur.execute(f"INSERT INTO {table_name} (delta_cost, geom) VALUES (%s, ST_GeomFromText(%s, 4326))", (delta_cost, geom))

    #commit (tell database data is ready) every 5000 iterations 
    ##just so there is data being added to database continuously instead of all 52000 rows at once
    if count%5000 == 0:
        conn.commit()

#commit the last amount of data
conn.commit()

#close the cursor and connection to clean up
cur.close()
conn.close()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


# Add to app.py for displaying on arcgis online MAP

In [1]:
# Route to retrieve polygon as GeoJSON
@app.route('/delta_cost_map2')
def delta_cost_map():
    # Connect to the database
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=DB_PORT
    )

    # Create a cursor
    cur = conn.cursor()

    # Execute SQL query to retrieve the polygon
    cur.execute("""SELECT 
                    json_build_object(
                        'type', 'FeatureCollection',
                        'features', json_agg(
                            json_build_object(
                                'type', 'Feature',
                                'geometry', ST_AsGeoJSON(ST_SetSRID(geom, 4326))::json,
                                'properties', json_build_object(
                                    'delta_cost', delta_cost
                                )
                            )
                        ),
                        'crs', 
                        json_build_object(
                            'type', 'name',
                            'properties', 
                            json_build_object(
                                'name', 'EPSG:4326'
                            )
                        )
                    ) AS geojson
                FROM delta_cost_map2;
                """)
    rows = cur.fetchone()[0]

    # Close cursor and connection
    cur.close()
    conn.close()

    # Return the GeoJSON
    return rows

NameError: name 'app' is not defined

### User Selection (DOES NOT WORK)