In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import os
import matplotlib.pyplot as plt
import seaborn as sns


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Load and preprocess data
PROCESSED_DIR = '../processed'

# Only regular, non-hidden files are candidates: os.listdir also returns
# sub-directories and dotfiles (e.g. .ipynb_checkpoints, .DS_Store), which
# could otherwise sort last and be handed to read_csv.
candidate_files = sorted(
    name for name in os.listdir(PROCESSED_DIR)
    if not name.startswith('.') and os.path.isfile(os.path.join(PROCESSED_DIR, name))
)
if not candidate_files:
    raise FileNotFoundError(
        f"No data files found in {os.path.abspath(PROCESSED_DIR)} (cwd: {os.getcwd()})"
    )

# The file whose name sorts last is loaded.
# NOTE(review): presumably file names are timestamped so this is the newest export — confirm.
latest_file = os.path.join(PROCESSED_DIR, candidate_files[-1])
print(f"Loading {latest_file} (cwd: {os.getcwd()})")
df = pd.read_csv(latest_file)

# Basic preprocessing: drop the unused 'character2' column and keep rows with
# psa_grade <= 10 (rows with a missing psa_grade are dropped by this comparison too).
df = df.drop(columns=['character2'])
df = df[df['psa_grade'] <= 10]

# Create processed dataframe (a copy, so `df` keeps the raw values)
df_processed = df.copy()


In [None]:
# Feature engineering and encoding
# Rebuild from `df` every time so this cell is idempotent: re-running it must not
# refit the encoders on columns that were already replaced by integer codes.
df_processed = df.copy()

# Convert date columns to datetime
for date_col in ('scrape_date', 'sold_date'):
    df_processed[date_col] = pd.to_datetime(df_processed[date_col])

# Add new features
# days_listed is sold_date minus scrape_date, in whole days.
df_processed['days_listed'] = (df_processed['sold_date'] - df_processed['scrape_date']).dt.days
# total_cost contains the target (price); it is kept for analysis only and must not be used as a feature.
df_processed['total_cost'] = df_processed['price'] + df_processed['shipping']

# Encode categorical variables
# The fitted encoders are kept because predict_price needs them to encode raw labels.
# NOTE(review): integer label codes impose an arbitrary ordering on nominal categories,
# which a linear model treats as numeric — consider one-hot encoding.
le_condition = LabelEncoder()
le_character = LabelEncoder()
df_processed['condition'] = le_condition.fit_transform(df_processed['condition'])
df_processed['character'] = le_character.fit_transform(df_processed['character'])


In [None]:
# Prepare features for modeling
features = ['shipping', 'condition', 'character', 'psa_grade', 'days_listed']
X = df_processed[features]
y = df_processed['price']

# Scale features
# NOTE(review): the scaler is fit on every row, before the train/test split in the
# next cell, so test-set statistics influence the scaling — consider fitting on the
# training split only.
scaler = StandardScaler()
# Keep X's index on the scaled frame. df_processed was row-filtered, so its index has
# gaps; a fresh RangeIndex here would no longer line up with `y` in any label-aligned
# pandas operation (joins, arithmetic, boolean masks).
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features, index=X.index)


In [None]:
# Split data and train model
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model


In [None]:
# Define evaluation function
def _regression_metrics(y_true, y_pred):
    """Return the tuple (R², RMSE, MAE) for one set of predictions."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


def evaluate_model(model, X_train, X_test, y_train, y_test, X_cv=None, y_cv=None, cv=5):
    """Print train/test/cross-validation metrics and return the test predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : features and target the model was trained on.
    X_test, y_test : held-out features and target.
    X_cv, y_cv : optional data to cross-validate on. When both are omitted the
        training split is used, so the function depends only on its arguments
        (not on notebook globals) and the held-out test rows stay out of the
        cross-validation folds.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    The predictions of ``model`` on ``X_test``.

    Raises
    ------
    ValueError
        If only one of ``X_cv`` / ``y_cv`` is provided.
    """
    if (X_cv is None) != (y_cv is None):
        raise ValueError("X_cv and y_cv must be provided together")
    if X_cv is None:
        X_cv, y_cv = X_train, y_train

    # Training metrics
    train_pred = model.predict(X_train)
    train_r2, train_rmse, train_mae = _regression_metrics(y_train, train_pred)

    # Testing metrics
    test_pred = model.predict(X_test)
    test_r2, test_rmse, test_mae = _regression_metrics(y_test, test_pred)

    # Cross-validation (no `scoring` is passed, so the estimator's own score method is used)
    cv_scores = cross_val_score(model, X_cv, y_cv, cv=cv)

    print("Training Metrics:")
    print(f"R² Score: {train_r2:.4f}")
    print(f"RMSE: ${train_rmse:.2f}")
    print(f"MAE: ${train_mae:.2f}\n")

    print("Testing Metrics:")
    print(f"R² Score: {test_r2:.4f}")
    print(f"RMSE: ${test_rmse:.2f}")
    print(f"MAE: ${test_mae:.2f}\n")

    print("Cross-validation Scores:")
    # The reported spread is two standard deviations of the per-fold scores.
    print(f"Mean R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    return test_pred


In [None]:
# Define feature importance analysis function
def analyze_features(model, features):
    """Rank features by the magnitude of their fitted coefficients.

    Builds a table with one row per feature holding the signed coefficient from
    ``model.coef_`` and its absolute value, orders it from largest to smallest
    absolute coefficient, prints it, and returns it.
    """
    weights = model.coef_
    ranking = pd.DataFrame(
        {
            'Feature': features,
            'Coefficient': weights,
            'Abs_Coefficient': abs(weights),
        }
    ).sort_values(by='Abs_Coefficient', ascending=False)

    print("\nFeature Importance:")
    print(ranking)
    return ranking


In [None]:
# Define prediction function
def predict_price(model, scaler, new_data, le_condition, le_character):
    """Predict the price of a single card from its raw (unencoded) attributes.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    scaler : fitted transformer exposing ``transform``.
    new_data : mapping (e.g. dict or pandas Series) of feature name -> value.
        'condition' and 'character' must hold raw labels; they are encoded here.
        Key order does not matter when the scaler recorded its feature names;
        keys the scaler does not know are then ignored.
    le_condition, le_character : fitted encoders exposing ``transform`` for the
        'condition' and 'character' labels respectively.

    Returns
    -------
    The first element of ``model.predict`` for the single input row.

    Raises
    ------
    KeyError
        If 'condition' or 'character' is missing from ``new_data``.
    ValueError
        If a feature the scaler was fitted on is missing from ``new_data``.
        The encoders may raise as well when given a label they were not fitted on.
    """
    # Work on a copy so the caller's mapping is never modified.
    new_data_encoded = dict(new_data)
    new_data_encoded['condition'] = le_condition.transform([new_data['condition']])[0]
    new_data_encoded['character'] = le_character.transform([new_data['character']])[0]

    row = pd.DataFrame([new_data_encoded])

    # Align columns with the order the scaler was fitted on. Without this, the row
    # follows the key order of `new_data`, which is either rejected by the scaler or
    # silently scaled with the wrong column statistics.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        fitted_names = list(fitted_names)
        missing = [name for name in fitted_names if name not in row.columns]
        if missing:
            raise ValueError(f"new_data is missing required feature(s): {missing}")
        row = row[fitted_names]

    # Scale features
    features_scaled = scaler.transform(row)

    # If the model was fitted on named columns, hand it named columns again so the
    # estimator can validate them instead of warning about missing feature names.
    model_names = getattr(model, 'feature_names_in_', None)
    if model_names is not None:
        features_scaled = pd.DataFrame(features_scaled, columns=list(model_names))

    # Make prediction
    predicted_price = model.predict(features_scaled)[0]
    return predicted_price


In [None]:
# Evaluate model
# Prints the train/test/cross-validation report; the returned test-set predictions
# are kept for the predicted-vs-actual plot.
test_predictions = evaluate_model(
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Analyze feature importance
# Coefficient table ordered by absolute size.
coef_df = analyze_features(model=model, features=features)


In [None]:
# Visualize predictions vs actual
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, test_predictions, alpha=0.5)

# Dashed red diagonal spanning the range of actual prices: points on it are perfect predictions.
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--', lw=2)

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Predicted vs Actual Prices')
fig.tight_layout()
plt.show()


In [None]:
# Example prediction
# Raw, unencoded attributes of one card; predict_price encodes and scales them.
example_card = {
    'shipping': 0,
    'condition': 'new (other)',
    'character': 'mew',
    'psa_grade': 9,
    'days_listed': 0
}

predicted_price = predict_price(model, scaler, example_card, le_condition, le_character)
print("\nExample Prediction:")
# The grade in the message is read from example_card so the text cannot drift from the input.
print(
    f"Predicted price for a PSA {example_card['psa_grade']} {example_card['character']} card "
    f"in {example_card['condition']} condition: ${predicted_price:.2f}"
)


In [None]:
# Additional analysis: Feature correlations
# Pairwise correlations between the model features and the target (price).
correlation_columns = features + ['price']
correlation_matrix = df_processed[correlation_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# center=0 anchors the diverging colormap at zero correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlations')
fig.tight_layout()
plt.show()