In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Load the data from the CSV file into a DataFrame
data = pd.read_csv('AllTrafficCost.csv')

# Filter out rows where 'Sport' column equals 'line'
data_filtered = data[data['Sport'] != 'line']

# Select the columns of interest (replace 'categorical_column1', 'categorical_column2', 'categorical_column3' with actual column names)
categorical_columns = ['Sport', 'Timestamp', 'Weekday?']

# Perform one-hot encoding for all categorical columns
encoder = OneHotEncoder(sparse=False)
encoded_columns = pd.DataFrame(encoder.fit_transform(data_filtered[categorical_columns]))
encoded_columns.columns = encoder.get_feature_names_out(categorical_columns)

# Concatenate the encoded columns
X = pd.concat([encoded_columns], axis=1)

# Assuming 'target_column' is the column you want to predict
y = data_filtered['delta_cost']

# Split the dataset into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

# Print the coefficients and intercept of the model
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(feature, ":", coef)
print("Intercept:", model.intercept_)




Mean Squared Error: 0.03226745377564718
R-squared Score: 0.00034299292537998127

Model Coefficients:
Sport_Basketball : 6408641.805154863
Sport_Football : 6408641.804893646
Sport_Hockey : 6408641.802995452
Sport_Volleyball : 6408641.805464947
Timestamp_010000PM : 621406.8880968078
Timestamp_011500PM : 621406.886446376
Timestamp_013000PM : 621406.886868063
Timestamp_020000PM : 621406.8864738145
Timestamp_023000PM : 621406.887361293
Timestamp_030000PM : 621406.8866798241
Timestamp_033000PM : 621406.885244402
Timestamp_040000PM : 621406.8866441891
Timestamp_041500PM : 621406.8859312574
Timestamp_043000PM : 621406.885946748
Timestamp_050000PM : 621406.8865671911
Timestamp_053000PM : 621406.8833405753
Timestamp_060000PM : 621406.8868700318
Timestamp_063000PM : 621406.8846934661
Timestamp_070000PM : 621406.8860628712
Timestamp_071500PM : 621406.8846064492
Timestamp_073000PM : 621406.8852702869
Timestamp_080000PM : 621406.8848394391
Timestamp_081500PM : 621406.8835888648
Timestamp_083000PM : 

In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Load the data from the CSV file into a DataFrame
data = pd.read_csv('AllTrafficCost.csv')

# Filter out rows where 'Sport' column equals 'line'
data_filtered = data[data['Sport'] != 'line']

# Select the columns of interest (replace 'categorical_column1', 'categorical_column2', 'categorical_column3' with actual column names)
categorical_columns = ['Sport', 'Timestamp', 'Weekday?']

# Perform one-hot encoding for all categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_columns = pd.DataFrame(encoder.fit_transform(data_filtered[categorical_columns]))
encoded_columns.columns = encoder.get_feature_names_out(categorical_columns)

# Concatenate the encoded columns
X = pd.concat([encoded_columns], axis=1)

# Assuming 'target_column' is the column you want to predict
y = data_filtered['delta_cost']

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model on the full data
model.fit(X, y)

# Now let's say you have new data in the form of a DataFrame called 'new_data'
# You should preprocess this new data in the same way as the original data

# For example:
new_data = pd.DataFrame({
    'Sport': ['Football'],  # Example new categorical value
    'Timestamp': ['070000PM'],  # Example new timestamp value
    'Weekday?': ['False']  # Example new weekday value
})

# Perform one-hot encoding for the new categorical columns
encoded_new_data = pd.DataFrame(encoder.transform(new_data[categorical_columns]))

# Make sure the columns are in the same order as the original data
encoded_new_data.columns = encoded_columns.columns

# Now you can use the trained model to make predictions on the new data
new_predictions = model.predict(encoded_new_data)

print("Predicted delta_cost for the new data:", new_predictions)




Predicted delta_cost for the new data: [-2332386.23347873]
