Build a regression model.

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Path to the CSV files
merged_yelp_csv_path = "merged_bike_stations_with_yelp.csv"
merged_foursquare_csv_path = "merged_bike_stations_with_foursquare.csv"

# Read CSV files into DataFrames
bike_stations_with_yelp = pd.read_csv(merged_yelp_csv_path)
bike_stations_with_foursquare = pd.read_csv(merged_foursquare_csv_path)

# Merge the Yelp and Foursquare DataFrames
merged_data = bike_stations_with_yelp.merge(bike_stations_with_foursquare, how='left', on=['station_latitude', 'station_longitude'])

# Define the features and target
yelp_rating_column = 'rating_x_x'  # Yelp rating
foursquare_rating_column = 'rating_x_y'  # Foursquare rating
features = [foursquare_rating_column]
target = yelp_rating_column  # Predicting Yelp rating based on Foursquare rating

# Drop rows with missing values in the target or feature columns
merged_data.dropna(subset=[target, features[0]], inplace=True)

# Split the data into training and testing sets
X = merged_data[features]
y = merged_data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 0.0013461288155181368
R-squared: 0.9932640934954526


Provide model output and an interpretation of the results. 

In [15]:
# Model results interpretation
print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("\nThe model exhibits a high level of accuracy and goodness of fit:")
print("Mean Squared Error (MSE) is extremely low, indicating accurate predictions.")
print("R-squared value of approximately 0.993 indicates that around 99.3% of the variability in Yelp ratings can be explained by Foursquare ratings.")
print("The strong performance suggests a strong relationship between Foursquare ratings and Yelp ratings in this context.")
print("However, further investigation is recommended to ensure data quality and avoid potential sources of data leakage.")


Mean Squared Error: 0.0013461288155181368
R-squared: 0.9932640934954526

The model exhibits a high level of accuracy and goodness of fit:
Mean Squared Error (MSE) is extremely low, indicating accurate predictions.
R-squared value of approximately 0.993 indicates that around 99.3% of the variability in Yelp ratings can be explained by Foursquare ratings.
The strong performance suggests a strong relationship between Foursquare ratings and Yelp ratings in this context.
However, further investigation is recommended to ensure data quality and avoid potential sources of data leakage.


# Stretch

How can you turn the regression model into a classification model?

In [23]:
print("Number of classes in the target variable:", len(y_train.unique()))
if len(y_train.unique()) < 2:
    print("Cannot create a classification model with only one class.")
    print("At least two classes are required for classification.")


Number of classes in the target variable: 1
Cannot create a classification model with only one class.
At least two classes are required for classification.
