# Museum Attendance Linear Regression

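This notebook fits a linear regression to examine the relationship between museum visitor counts and the population of each museum's city, using visitor count as the predictor and city population as the target.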

## Import Required Libraries

In [None]:
from museum_attendance_common.repository import MuseumRepository
from museum_attendance_common.config.database import get_db_session
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## (Optional) Run Data Extraction

Run this cell if you want to fetch fresh museum data from Wikipedia. This will populate or update the database with the latest data.

**Note:** This process can take several minutes depending on network speed and API rate limits.

In [None]:
# Uncomment the lines below to run data extraction
# This will fetch museum data from Wikipedia and populate the database

# import sys
# sys.path.insert(0, '/app/museum-attendance-data-fetcher/src')
# from museum_attendance_data_fetcher import export
# # 
# print("Starting data extraction... This may take several minutes.")
# export()
# print("Data extraction completed!")

## Load Dataset from Database

In [None]:
# Columns of the modelling frame. Passing them explicitly to the DataFrame
# constructor keeps the columns present even when the database returns no
# museums, so the NA filter below does not fail with a KeyError.
dataset_columns = ["museum_name", "visitor_count", "city_name", "city_population"]

with get_db_session() as session:
    museum_repo = MuseumRepository(session)
    museums = museum_repo.get_museums()

    print(f"Total museums in database: {len(museums)}")

    # Build one record per museum while the session is still open
    # (presumably `museum.city` is a lazily loaded relationship — TODO confirm).
    museum_attendance_and_city_population_dict = []
    for museum in museums:
        # NOTE(review): assumes a museum may have no linked city; such rows get
        # missing city fields and are removed by the NA filter below — confirm.
        city = museum.city
        museum_attendance_and_city_population_dict.append({
            "museum_name": museum.name,
            "visitor_count": museum.number_of_visitors,
            "city_name": city.name if city is not None else None,
            "city_population": city.population if city is not None else None,
        })

df = pd.DataFrame(museum_attendance_and_city_population_dict, columns=dataset_columns)

# Keep only rows where both regression variables are present.
df_clean = df.dropna(subset=["visitor_count", "city_population"])

print(f"Dataset size: {len(df_clean)} museums")
print(f"\nSample data:")
print(df_clean.head())

## Prepare Data for Linear Regression

In [None]:
# Split the cleaned frame into the regression inputs.
# The double brackets keep the feature as a one-column DataFrame, so X is 2-D.
feature_frame = df_clean[['visitor_count']]
target_series = df_clean['city_population']

X = feature_frame.values   # Independent variable (features)
y = target_series.values   # Dependent variable (target)

# Report both array shapes as a quick sanity check.
for array_label, array in (("X", X), ("y", y)):
    print(f"{array_label} shape: {array.shape}")

## Train Linear Regression Model

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit ordinary least squares on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

## Evaluate Model Performance

In [None]:
# Predict the target for the held-out rows.
predictions = model.predict(X_test)

# Report the fitted slope and intercept.
print(f"Model Coefficient: {model.coef_[0]}")
print(f"Model Intercept: {model.intercept_}")

# Show up to the first ten predictions next to the true values.
print(f"\nSample Predictions vs Actual:")
sample_size = min(10, len(predictions))
for idx in range(sample_size):
    pred = predictions[idx]
    actual = y_test[idx]
    print(f"Predicted: {pred:.2f}, Actual: {actual}")

# Score the model on the held-out split.
print(f"\nModel Score (R^2): {model.score(X_test, y_test):.4f}")

## Visualize Regression Line

In [None]:
# Plot every observation together with the fitted regression line.
fig, ax = plt.subplots(figsize=(10, 6))

# Scatter all data points (training + test).
ax.scatter(X, y, alpha=0.5, label='Actual Data')

# Evaluate the fitted model on 100 evenly spaced visitor counts spanning the
# observed range; reshape to a single column because predict expects 2-D input.
X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_pred_range = model.predict(X_range)
ax.plot(X_range, y_pred_range, color='red', linewidth=2, label='Regression Line')

ax.set_xlabel('Museum Visitor Count')
ax.set_ylabel('City Population')
# The score in the title is computed on the held-out test split. It is written
# with mathtext ($R^2$) so the superscript renders correctly regardless of the
# file's text encoding.
ax.set_title(f'Linear Regression: Museum Visitors vs City Population\n($R^2$ = {model.score(X_test, y_test):.4f})')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()