In [1]:
# Install third-party dependencies into the running kernel's environment.
# `%pip` targets the kernel's interpreter (unlike `!pip`, which runs whatever
# `pip` is first on PATH). meteostat is pinned to the version this notebook was
# last run with; xgboost is listed because the import cell below requires it.
%pip install meteostat==1.6.8 tqdm scikit-learn xgboost

Collecting meteostat
  Downloading meteostat-1.6.8-py3-none-any.whl.metadata (4.6 kB)
Downloading meteostat-1.6.8-py3-none-any.whl (31 kB)
Installing collected packages: meteostat
Successfully installed meteostat-1.6.8


In [2]:
# Mount Google Drive at /content/drive so later cells can write files under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

# Librerías principales
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from meteostat import Daily, Point
import requests
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm

# Plot styling shared by every figure in this notebook
sns.set_style("whitegrid")
sns.set_palette("Set2")

# Output folder inside the mounted Google Drive (created if it does not exist)
output_folder = "/content/drive/MyDrive/NYC_Subway_Analysis"
os.makedirs(output_folder, exist_ok=True)

# 1. Collect the download URLs of the raw CSV files hosted on GitHub
def get_csv_urls_from_github(timeout=30):
    """Return the download URLs of every ``.csv`` file in the repo's ``raw_data`` folder.

    The folder is listed through the GitHub "contents" REST endpoint.

    Args:
        timeout: Seconds to wait for the GitHub API before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: One download URL per CSV file. An empty list is returned
        (after printing a diagnostic) when the request fails, the API answers
        with a non-200 status, or the payload is not a directory listing.
    """
    # NOTE(review): the contents endpoint caps how many entries a directory
    # listing returns (1,000 per the GitHub docs) -- confirm if the folder grows.
    api_url = 'https://api.github.com/repos/GMasso19/NYC_Subway/contents/raw_data'
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        # Same best-effort contract as the HTTP-error branch below.
        print(f"Error API GitHub: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error API GitHub: {response.status_code}")
        return []

    contents = response.json()
    if not isinstance(contents, list):
        # A directory listing is a JSON array; anything else cannot be iterated as entries.
        print("Error API GitHub: unexpected response payload")
        return []

    # Keep only CSV entries that actually expose a download URL.
    return [
        entry['download_url']
        for entry in contents
        if entry['name'].endswith('.csv') and entry.get('download_url')
    ]

# 2. Load NYC daily weather (precipitation and wind speed)
def get_weather_data(start=None, end=None):
    """Fetch daily weather for New York City from Meteostat.

    Args:
        start: First day to fetch (``datetime``). Defaults to 2023-01-01.
        end: Last day to fetch (``datetime``). Defaults to 2023-12-31.

    Returns:
        pandas.DataFrame with three columns: ``date`` (``datetime.date``
        objects, built so it can be joined against a same-typed ``date``
        column), ``prcp`` and ``wspd`` as delivered by Meteostat.
    """
    if start is None:
        start = datetime(2023, 1, 1)
    if end is None:
        end = datetime(2023, 12, 31)

    nyc = Point(40.7128, -74.0060)
    # fetch() is indexed by time; reset_index() exposes it as the 'time' column.
    weather = Daily(nyc, start, end).fetch().reset_index()
    weather['date'] = weather['time'].dt.date
    return weather[['date', 'prcp', 'wspd']]

# 3. Combine every CSV into a single DataFrame
def load_all_data(csv_urls, weather_df):
    """Download each CSV, derive calendar features, attach weather, and concatenate.

    For every file the function parses ``transit_timestamp``, adds ``date``,
    ``year``, ``month``, ``weekday`` (day name) and ``hour`` columns, fills
    missing ``fare_class_category`` / ``payment_method`` values with
    ``'Unknown'`` and left-joins ``weather_df`` on ``date``.

    Args:
        csv_urls: Iterable of paths/URLs readable by ``pandas.read_csv``.
        weather_df: DataFrame with a ``date`` column to merge on.

    Returns:
        pandas.DataFrame: All successfully processed files stacked together
        with a fresh integer index.

    Raises:
        ValueError: If no file could be loaded (empty ``csv_urls`` or every
            file failed), instead of pandas' opaque "No objects to concatenate".
    """
    frames = []
    for url in tqdm(csv_urls, desc="Processing CSV files"):
        try:
            df = pd.read_csv(url)
            timestamps = pd.to_datetime(df['transit_timestamp'])
            df['transit_timestamp'] = timestamps
            df['date'] = timestamps.dt.date
            df['year'] = timestamps.dt.year
            df['month'] = timestamps.dt.month
            df['weekday'] = timestamps.dt.day_name()
            df['hour'] = timestamps.dt.hour
            df['fare_class_category'] = df['fare_class_category'].fillna('Unknown')
            df['payment_method'] = df['payment_method'].fillna('Unknown')
            # Left join keeps every ridership row even when no weather exists for that day.
            df = df.merge(weather_df, on='date', how='left')
            frames.append(df)
        except Exception as e:
            # Deliberate best effort: one bad file must not abort the whole run.
            print("❌ Error in:", url, str(e))

    if not frames:
        raise ValueError(
            "No CSV files could be loaded; check the URL list and the errors reported above."
        )
    return pd.concat(frames, ignore_index=True)

# 4. Build the charts and models from the full dataset
_WEEKDAY_ORDER = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
_WEATHER_USAGE_COLUMNS = ['ridership', 'transfers', 'prcp', 'wspd']


def _plot_ridership_violin(df, output_folder):
    """Save a violin plot of ridership values at or below the 95th percentile."""
    p95 = df['ridership'].quantile(0.95)
    trimmed = df.loc[df['ridership'] <= p95, 'ridership']
    plt.figure(figsize=(10, 5))
    # A single violin has no hue variable, so the colour is chosen directly;
    # passing `palette` without `hue` is deprecated in recent seaborn releases.
    sns.violinplot(x=trimmed, color=sns.color_palette('Pastel1')[0], inner='box')
    plt.title("Distribution of NYC Subway Ridership (up to 95th percentile)")
    plt.xlabel("Number of Subway Entries per Event")
    plt.figtext(0.5, -0.05, "Note: Outliers above 95th percentile are excluded for clarity.", ha="center")
    plt.tight_layout()
    # The note sits below the figure area (y < 0); a tight bounding box keeps
    # it inside the saved image instead of cropping it away.
    plt.savefig(f"{output_folder}/violin_ridership_p95.png", dpi=300, bbox_inches="tight")
    plt.close()


def _plot_borough_weekday_heatmap(df, output_folder):
    """Save a heatmap of mean ridership per borough (rows) and weekday (columns)."""
    pivot = df.groupby(['borough', 'weekday'])['ridership'].mean().unstack()
    # reindex (rather than column selection) tolerates data that lacks some
    # weekday: the missing column is kept as NaN instead of raising KeyError.
    pivot = pivot.reindex(columns=_WEEKDAY_ORDER)
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot, annot=True, fmt=".0f", cmap='Blues')
    plt.title("Average Subway Ridership per Weekday by Borough")
    plt.xlabel("Day of Week")
    plt.ylabel("Borough")
    plt.tight_layout()
    plt.savefig(f"{output_folder}/heatmap_borough_weekday.png", dpi=300)
    plt.close()


def _plot_correlation_matrix(df, output_folder):
    """Save the usage/weather correlation matrix as both a PNG heatmap and a CSV."""
    corr = df[_WEATHER_USAGE_COLUMNS].corr()
    # Hide the upper triangle (diagonal included): it mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)
    plt.title("Correlation Matrix – Subway Usage & Weather")
    plt.tight_layout()
    plt.savefig(f"{output_folder}/correlation_matrix.png", dpi=300)
    plt.close()
    corr.to_csv(f"{output_folder}/correlation_matrix.csv")


def _plot_pairplot(df, output_folder, max_rows=None):
    """Save a corner pairplot of usage and weather columns, optionally on a row sample."""
    pair_df = df[_WEATHER_USAGE_COLUMNS].dropna()
    if max_rows is not None and len(pair_df) > max_rows:
        # Fixed seed so the sampled figure is reproducible between runs.
        pair_df = pair_df.sample(n=max_rows, random_state=42)
    sns.pairplot(pair_df, corner=True, diag_kind="kde", plot_kws={"alpha": 0.6, "s": 25, "edgecolor": "k"})
    plt.savefig(f"{output_folder}/pairplot_ridership_weather.png", dpi=300)
    plt.close()


def _fit_linear_regression(df, output_folder):
    """Fit ridership ~ prcp + wspd and save an actual-vs-predicted scatter plot."""
    model_df = df.dropna(subset=['ridership', 'prcp', 'wspd'])
    X = model_df[['prcp', 'wspd']]
    y = model_df['ridership']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression().fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, preds, alpha=0.5)
    # Trend line of predicted vs. actual values; two endpoints describe the
    # same straight line as plotting it through every test point.
    slope, intercept = np.polyfit(y_test, preds, 1)
    line_x = np.array([y_test.min(), y_test.max()])
    plt.plot(line_x, slope * line_x + intercept, color="red")
    plt.title(f"Linear Regression – R²: {r2:.2f}")
    plt.xlabel("Actual Ridership")
    plt.ylabel("Predicted Ridership")
    plt.tight_layout()
    plt.savefig(f"{output_folder}/linear_regression.png", dpi=300)
    plt.close()


def _fit_decision_tree(df, output_folder):
    """Fit a depth-10 decision tree on borough, fare class and weather; save its scatter plot."""
    cat_features = ['borough', 'fare_class_category']
    num_features = ['prcp', 'wspd']
    df_tree = df.dropna(subset=['ridership'] + num_features + cat_features)
    X = df_tree[cat_features + num_features]
    y = df_tree['ridership']
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('num', StandardScaler(), num_features)
    ])
    tree_pipeline = Pipeline([
        ('pre', preprocessor),
        # Seeded like the train/test split so repeated runs give the same tree.
        ('tree', DecisionTreeRegressor(max_depth=10, random_state=42))
    ])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    tree_pipeline.fit(X_train, y_train)
    preds_tree = tree_pipeline.predict(X_test)
    r2_tree = r2_score(y_test, preds_tree)
    plt.figure(figsize=(6, 6))
    sns.scatterplot(x=y_test, y=preds_tree)
    plt.title(f"Decision Tree – R²: {r2_tree:.2f}")
    plt.xlabel("Actual Ridership")
    plt.ylabel("Predicted Ridership")
    plt.tight_layout()
    plt.savefig(f"{output_folder}/decision_tree_prediction.png", dpi=300)
    plt.close()


def generate_graphs_and_models(df, output_folder="/content/drive/MyDrive/NYC_Subway_Analysis",
                               pairplot_max_rows=None):
    """Create every figure and model output of the analysis and write them to disk.

    Files written into ``output_folder``: ``violin_ridership_p95.png``,
    ``heatmap_borough_weekday.png``, ``correlation_matrix.png``,
    ``correlation_matrix.csv``, ``pairplot_ridership_weather.png``,
    ``linear_regression.png`` and ``decision_tree_prediction.png``.

    Args:
        df: DataFrame containing the columns ``ridership``, ``transfers``,
            ``borough``, ``weekday``, ``fare_class_category``, ``prcp`` and
            ``wspd``.
        output_folder: Destination directory; created when missing.
        pairplot_max_rows: Optional cap on the number of rows drawn in the
            pairplot (random sample with a fixed seed). ``None`` (default)
            plots every row, which can be very slow on large datasets.

    Returns:
        None. All results are written to ``output_folder``.
    """
    os.makedirs(output_folder, exist_ok=True)

    _plot_ridership_violin(df, output_folder)
    _plot_borough_weekday_heatmap(df, output_folder)
    _plot_correlation_matrix(df, output_folder)
    _plot_pairplot(df, output_folder, max_rows=pairplot_max_rows)
    _fit_linear_regression(df, output_folder)
    _fit_decision_tree(df, output_folder)


# 5. Run the whole pipeline
def main():
    """Run the analysis end to end: list CSVs, fetch weather, load data, build outputs.

    Raises:
        ValueError: If no CSV URL could be retrieved. Failing here avoids a
            pointless weather download followed by an opaque concatenation
            error (also a ``ValueError``) further down the pipeline.
    """
    csv_urls = get_csv_urls_from_github()
    if not csv_urls:
        raise ValueError("No CSV URLs were retrieved from GitHub; nothing to analyse.")

    weather_data = get_weather_data()
    full_data = load_all_data(csv_urls, weather_data)
    generate_graphs_and_models(full_data)

if __name__ == '__main__':
    main()
    # Report success only after main() has actually run to completion; outside
    # the guard the message would also appear when the analysis was never run.
    print("✅ Analysis complete. All outputs saved in your Google Drive folder.")



Processing CSV files:   8%|▊         | 63/818 [00:24<04:51,  2.59it/s]


KeyboardInterrupt: 